Skip to content

Commit fe8f9ab

Browse files
craig[bot]alyshanjahani-crlspilchen
committed
150743: contention: Increase RetryBudgetForMissingResult r=alyshanjahani-crl a=alyshanjahani-crl Previously the retry budget was set to 1, however this budget lead to a significant amount of failed resolutions. To see why a retry budget of 1 is not sufficient consider the case where an in progress transaction is in the writer buffer when resolution is attempted. The in progress txn is then ingested into the cache after the txn resolution endpoint drains the write buffer - i.e. it is stored in the cache with the appstatspb.InvalidTransactionFingerprintID value. Now the transaction finishes and its respective fingerprint ID is recorded. However, it is in the writer buffer of the txn id cache. When resolution is attempted again, the lookup gets the invalid / in-progress value that is stored in the cache. The subsequent flush then gets the cache to ingest the actual fingerprint ID value for the txn. But we've run out of budget, and don't retry resolution. This commit increases the budget to 2. In addition to handling the case above, experimentally it shows to lower the number of failed resolutions (see issue linked). Lastly, this commit removes dead code in the TxnID resolution endpoint. A map was being created and never added to. The logic resulted in the RPC flushing the TxnID Cache on every invocation, that behaviour is preserved and made more explicit. Fixes: #148686 Release note: None 151063: roachtest/ttl: fix TTL restart test flakiness r=spilchen a=spilchen The TTL restart test was experiencing flakiness due to the default stability window causing delays in replanning when nodes changed. The test would wait for TTL progress across all nodes but the replanning logic wouldn't trigger immediately when nodes were restarted. This change disables the stability window. This also fixes a bug in the logic that checks if the TTL job is progressing. It would look for key removal across all ranges over time. The existing check repeatedly change the baseline. We now save that the baseline and compare it with each check. Release note: None Epic: None Closes #151011 Co-authored-by: Alyshan Jahani <[email protected]> Co-authored-by: Matt Spilchen <[email protected]>
3 parents a6ca323 + 9f07b51 + 76238ee commit fe8f9ab

File tree

3 files changed

+69
-50
lines changed

3 files changed

+69
-50
lines changed

pkg/cmd/roachtest/tests/ttl_restart.go

Lines changed: 64 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,8 @@ func runTTLRestart(ctx context.Context, t test.Test, c cluster.Cluster, numResta
9393
// Speed up the test by doing the replan check often and with a low threshold.
9494
"SET CLUSTER SETTING sql.ttl.replan_flow_frequency = '15s'",
9595
"SET CLUSTER SETTING sql.ttl.replan_flow_threshold = '0.1'",
96+
// Disable the stability window to ensure immediate replanning on node changes.
97+
"SET CLUSTER SETTING sql.ttl.replan_stability_window = 1",
9698
// Add additional logging to help debug the test on failure.
9799
"SET CLUSTER SETTING server.debug.default_vmodule = 'ttljob_processor=1,distsql_plan_bulk=1'",
98100
// Create the schema to be used in the test
@@ -165,8 +167,13 @@ func runTTLRestart(ctx context.Context, t test.Test, c cluster.Cluster, numResta
165167
db = c.Conn(ctx, t.L(), jobInfo.CoordinatorID)
166168

167169
t.Status("wait for TTL deletions to start happening")
170+
// Take baseline once and reuse it for all progress checks
171+
baseline, err := takeProgressBaseline(ctx, t, db)
172+
if err != nil {
173+
return errors.Wrapf(err, "error taking TTL progress baseline")
174+
}
168175
waitForTTLProgressAcrossAllNodes := func() error {
169-
if err := waitForTTLProgressAcrossAllRanges(ctx, db); err != nil {
176+
if err := checkTTLProgressAgainstBaseline(ctx, db, baseline); err != nil {
170177
return errors.Wrapf(err, "error waiting for TTL progress after restart")
171178
}
172179
return nil
@@ -310,11 +317,11 @@ func distributeLeases(ctx context.Context, t test.Test, db *gosql.DB) error {
310317

311318
}
312319

313-
// waitForTTLProgressAcrossAllRanges ensures that TTL deletions are happening across
314-
// all ranges. It builds a baseline of key counts for each leaseholder's ranges,
315-
// and later checks that each leaseholder made progress on at least one of those ranges,
316-
// regardless of current leaseholder assignment.
317-
func waitForTTLProgressAcrossAllRanges(ctx context.Context, db *gosql.DB) error {
320+
// takeProgressBaseline captures the initial key counts for each range and its leaseholder.
321+
// This baseline will be used later to check if TTL progress is being made.
322+
func takeProgressBaseline(
323+
ctx context.Context, t test.Test, db *gosql.DB,
324+
) (map[int]map[int]int, error) {
318325
query := `
319326
WITH r AS (
320327
SHOW RANGES FROM TABLE ttldb.tab1 WITH DETAILS
@@ -337,59 +344,79 @@ func waitForTTLProgressAcrossAllRanges(ctx context.Context, db *gosql.DB) error
337344

338345
rows, err := db.QueryContext(ctx, query)
339346
if err != nil {
340-
return err
347+
return nil, err
341348
}
342349
defer rows.Close()
343350

344351
for rows.Next() {
345352
var rangeID, leaseHolder, keyCount int
346353
if err := rows.Scan(&rangeID, &leaseHolder, &keyCount); err != nil {
347-
return err
354+
return nil, err
348355
}
349356
if _, ok := baseline[leaseHolder]; !ok {
350357
baseline[leaseHolder] = make(map[int]int)
351358
}
352359
baseline[leaseHolder][rangeID] = keyCount
353360
}
354361

355-
compareWithBaseline := func() error {
356-
current := make(map[int]int) // rangeID -> keyCount
362+
return baseline, nil
363+
}
357364

358-
rows, err := db.QueryContext(ctx, query)
359-
if err != nil {
360-
return err
361-
}
362-
defer rows.Close()
365+
// checkTTLProgressAgainstBaseline checks if each leaseholder has made progress
366+
// on at least one of their original ranges compared to the provided baseline.
367+
func checkTTLProgressAgainstBaseline(
368+
ctx context.Context, db *gosql.DB, baseline map[int]map[int]int,
369+
) error {
370+
query := `
371+
WITH r AS (
372+
SHOW RANGES FROM TABLE ttldb.tab1 WITH DETAILS
373+
)
374+
SELECT
375+
range_id,
376+
lease_holder,
377+
count(*) AS key_count
378+
FROM
379+
r,
380+
LATERAL crdb_internal.list_sql_keys_in_range(range_id)
381+
GROUP BY
382+
range_id,
383+
lease_holder
384+
ORDER BY
385+
range_id`
363386

364-
for rows.Next() {
365-
var rangeID, leaseHolder, keyCount int
366-
if err := rows.Scan(&rangeID, &leaseHolder, &keyCount); err != nil {
367-
return err
368-
}
369-
current[rangeID] = keyCount
387+
current := make(map[int]int) // rangeID -> keyCount
388+
389+
rows, err := db.QueryContext(ctx, query)
390+
if err != nil {
391+
return err
392+
}
393+
defer rows.Close()
394+
395+
for rows.Next() {
396+
var rangeID, leaseHolder, keyCount int
397+
if err := rows.Scan(&rangeID, &leaseHolder, &keyCount); err != nil {
398+
return err
370399
}
400+
current[rangeID] = keyCount
401+
}
371402

372-
for leaseHolder, ranges := range baseline {
373-
madeProgress := false
374-
for rangeID, oldCount := range ranges {
375-
newCount, ok := current[rangeID]
376-
if !ok {
377-
return errors.Newf("range %d (from leaseholder %d) not found in follow-up check", rangeID, leaseHolder)
378-
}
379-
if newCount < oldCount {
380-
madeProgress = true
381-
break
382-
}
403+
for leaseHolder, ranges := range baseline {
404+
madeProgress := false
405+
for rangeID, oldCount := range ranges {
406+
newCount, ok := current[rangeID]
407+
if !ok {
408+
return errors.Newf("range %d (from leaseholder %d) not found in follow-up check", rangeID, leaseHolder)
383409
}
384-
if !madeProgress {
385-
return errors.Newf("leaseholder %d made no progress on any of their original ranges", leaseHolder)
410+
if newCount < oldCount {
411+
madeProgress = true
386412
}
387413
}
388-
389-
return nil
414+
if !madeProgress {
415+
return errors.Newf("leaseholder %d made no progress on any of their original ranges", leaseHolder)
416+
}
390417
}
391418

392-
return testutils.SucceedsWithinError(compareWithBaseline, 20*time.Second)
419+
return nil
393420
}
394421

395422
// findRunningJob checks the current state of the TTL job and returns metadata

pkg/server/status.go

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,6 @@ import (
8686
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
8787
"github.com/cockroachdb/cockroach/pkg/util/tracing/tracingpb"
8888
"github.com/cockroachdb/cockroach/pkg/util/uint128"
89-
"github.com/cockroachdb/cockroach/pkg/util/uuid"
9089
"github.com/cockroachdb/errors"
9190
"github.com/cockroachdb/redact"
9291
"github.com/google/pprof/profile"
@@ -416,11 +415,6 @@ func (b *baseStatusServer) localTxnIDResolution(
416415
) *serverpb.TxnIDResolutionResponse {
417416
txnIDCache := b.sqlServer.pgServer.SQLServer.GetTxnIDCache()
418417

419-
unresolvedTxnIDs := make(map[uuid.UUID]struct{}, len(req.TxnIDs))
420-
for _, txnID := range req.TxnIDs {
421-
unresolvedTxnIDs[txnID] = struct{}{}
422-
}
423-
424418
resp := &serverpb.TxnIDResolutionResponse{
425419
ResolvedTxnIDs: make([]contentionpb.ResolvedTxnID, 0, len(req.TxnIDs)),
426420
}
@@ -434,12 +428,10 @@ func (b *baseStatusServer) localTxnIDResolution(
434428
}
435429
}
436430

437-
// If we encounter any transaction ID that we cannot resolve, we tell the
438-
// txnID cache to drain its write buffer (note: The .DrainWriteBuffer() call
439-
// is asynchronous). The client of this RPC will perform retries.
440-
if len(unresolvedTxnIDs) > 0 {
441-
txnIDCache.DrainWriteBuffer()
442-
}
431+
// Note(alyshan): TxnIDResolution is only called by the contention event resolver today.
432+
// The resolver relies on these resolution calls to trigger a drain of the writer buffer
433+
// on the txn id cache.
434+
txnIDCache.DrainWriteBuffer()
443435

444436
return resp
445437
}

pkg/sql/contention/resolver.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ const (
7373
// in-memory data corruption (shouldn't happen in normal circumstances,
7474
// since access to txnID cache is all synchronized). In this case, no
7575
// amount of retries will be able to resolveLocked the txnID.
76-
retryBudgetForMissingResult = uint32(1)
76+
retryBudgetForMissingResult = uint32(2)
7777

7878
// retryBudgetForRPCFailure is the number of times the resolverQueue will
7979
// retry resolving until giving up. This needs to be a finite number to handle

0 commit comments

Comments
 (0)