Merge #150743 #151063

craig[bot] · alyshanjahani-crl · spilchen · craig[bot] · commit fe8f9abcad33 · 2025-07-30T20:15:55.000Z
150743: contention: Increase RetryBudgetForMissingResult r=alyshanjahani-crl a=alyshanjahani-crl Previously the retry budget was set to 1; however, this budget led to a significant number of failed resolutions. To see why a retry budget of 1 is not sufficient, consider the case where an in-progress transaction is in the write buffer when resolution is attempted. The in-progress txn is then ingested into the cache after the txn resolution endpoint drains the write buffer - i.e. it is stored in the cache with the appstatspb.InvalidTransactionFingerprintID value. Now the transaction finishes and its respective fingerprint ID is recorded. However, it is in the write buffer of the txn id cache. When resolution is attempted again, the lookup gets the invalid / in-progress value that is stored in the cache. The subsequent flush then gets the cache to ingest the actual fingerprint ID value for the txn. But we've run out of budget, and don't retry resolution. This commit increases the budget to 2. In addition to handling the case above, it has been shown experimentally to lower the number of failed resolutions (see issue linked). Lastly, this commit removes dead code in the TxnID resolution endpoint. A map was being populated with every requested txn ID, but entries were never removed from it. The logic resulted in the RPC flushing the TxnID Cache on every invocation; that behaviour is preserved and made more explicit. Fixes: #148686 Release note: None 151063: roachtest/ttl: fix TTL restart test flakiness r=spilchen a=spilchen The TTL restart test was experiencing flakiness due to the default stability window causing delays in replanning when nodes changed. The test would wait for TTL progress across all nodes but the replanning logic wouldn't trigger immediately when nodes were restarted. This change disables the stability window. This also fixes a bug in the logic that checks if the TTL job is progressing. It would look for key removal across all ranges over time. The existing check repeatedly changed the baseline. 
We now save the baseline and compare it with each check. Release note: None Epic: None Closes #151011 Co-authored-by: Alyshan Jahani <alyshan@cockroachlabs.com> Co-authored-by: Matt Spilchen <matt.spilchen@cockroachlabs.com>
diff --git a/pkg/cmd/roachtest/tests/ttl_restart.go b/pkg/cmd/roachtest/tests/ttl_restart.go
@@ -93,6 +93,8 @@ func runTTLRestart(ctx context.Context, t test.Test, c cluster.Cluster, numResta
 			// Speed up the test by doing the replan check often and with a low threshold.
 			"SET CLUSTER SETTING sql.ttl.replan_flow_frequency = '15s'",
 			"SET CLUSTER SETTING sql.ttl.replan_flow_threshold = '0.1'",
+			// Disable the stability window to ensure immediate replanning on node changes.
+			"SET CLUSTER SETTING sql.ttl.replan_stability_window = 1",
 			// Add additional logging to help debug the test on failure.
 			"SET CLUSTER SETTING server.debug.default_vmodule = 'ttljob_processor=1,distsql_plan_bulk=1'",
 			// Create the schema to be used in the test
@@ -165,8 +167,13 @@ func runTTLRestart(ctx context.Context, t test.Test, c cluster.Cluster, numResta
 		db = c.Conn(ctx, t.L(), jobInfo.CoordinatorID)
 
 		t.Status("wait for TTL deletions to start happening")
+		// Take baseline once and reuse it for all progress checks
+		baseline, err := takeProgressBaseline(ctx, t, db)
+		if err != nil {
+			return errors.Wrapf(err, "error taking TTL progress baseline")
+		}
 		waitForTTLProgressAcrossAllNodes := func() error {
-			if err := waitForTTLProgressAcrossAllRanges(ctx, db); err != nil {
+			if err := checkTTLProgressAgainstBaseline(ctx, db, baseline); err != nil {
 				return errors.Wrapf(err, "error waiting for TTL progress after restart")
 			}
 			return nil
@@ -310,11 +317,11 @@ func distributeLeases(ctx context.Context, t test.Test, db *gosql.DB) error {
 
 }
 
-// waitForTTLProgressAcrossAllRanges ensures that TTL deletions are happening across
-// all ranges. It builds a baseline of key counts for each leaseholder's ranges,
-// and later checks that each leaseholder made progress on at least one of those ranges,
-// regardless of current leaseholder assignment.
-func waitForTTLProgressAcrossAllRanges(ctx context.Context, db *gosql.DB) error {
+// takeProgressBaseline captures the initial key counts for each range and its leaseholder.
+// This baseline will be used later to check if TTL progress is being made.
+func takeProgressBaseline(
+	ctx context.Context, t test.Test, db *gosql.DB,
+) (map[int]map[int]int, error) {
 	query := `
 		WITH r AS (
 			SHOW RANGES FROM TABLE ttldb.tab1 WITH DETAILS
@@ -337,59 +344,79 @@ func waitForTTLProgressAcrossAllRanges(ctx context.Context, db *gosql.DB) error
 
 	rows, err := db.QueryContext(ctx, query)
 	if err != nil {
-		return err
+		return nil, err
 	}
 	defer rows.Close()
 
 	for rows.Next() {
 		var rangeID, leaseHolder, keyCount int
 		if err := rows.Scan(&rangeID, &leaseHolder, &keyCount); err != nil {
-			return err
+			return nil, err
 		}
 		if _, ok := baseline[leaseHolder]; !ok {
 			baseline[leaseHolder] = make(map[int]int)
 		}
 		baseline[leaseHolder][rangeID] = keyCount
 	}
 
-	compareWithBaseline := func() error {
-		current := make(map[int]int) // rangeID -> keyCount
+	return baseline, nil
+}
 
-		rows, err := db.QueryContext(ctx, query)
-		if err != nil {
-			return err
-		}
-		defer rows.Close()
+// checkTTLProgressAgainstBaseline checks if each leaseholder has made progress
+// on at least one of their original ranges compared to the provided baseline.
+func checkTTLProgressAgainstBaseline(
+	ctx context.Context, db *gosql.DB, baseline map[int]map[int]int,
+) error {
+	query := `
+		WITH r AS (
+			SHOW RANGES FROM TABLE ttldb.tab1 WITH DETAILS
+		)
+		SELECT
+		  range_id,
+			lease_holder,
+			count(*) AS key_count
+		FROM
+			r,
+			LATERAL crdb_internal.list_sql_keys_in_range(range_id)
+		GROUP BY
+		  range_id,
+			lease_holder
+		ORDER BY
+		  range_id`
 
-		for rows.Next() {
-			var rangeID, leaseHolder, keyCount int
-			if err := rows.Scan(&rangeID, &leaseHolder, &keyCount); err != nil {
-				return err
-			}
-			current[rangeID] = keyCount
+	current := make(map[int]int) // rangeID -> keyCount
+
+	rows, err := db.QueryContext(ctx, query)
+	if err != nil {
+		return err
+	}
+	defer rows.Close()
+
+	for rows.Next() {
+		var rangeID, leaseHolder, keyCount int
+		if err := rows.Scan(&rangeID, &leaseHolder, &keyCount); err != nil {
+			return err
 		}
+		current[rangeID] = keyCount
+	}
 
-		for leaseHolder, ranges := range baseline {
-			madeProgress := false
-			for rangeID, oldCount := range ranges {
-				newCount, ok := current[rangeID]
-				if !ok {
-					return errors.Newf("range %d (from leaseholder %d) not found in follow-up check", rangeID, leaseHolder)
-				}
-				if newCount < oldCount {
-					madeProgress = true
-					break
-				}
+	for leaseHolder, ranges := range baseline {
+		madeProgress := false
+		for rangeID, oldCount := range ranges {
+			newCount, ok := current[rangeID]
+			if !ok {
+				return errors.Newf("range %d (from leaseholder %d) not found in follow-up check", rangeID, leaseHolder)
 			}
-			if !madeProgress {
-				return errors.Newf("leaseholder %d made no progress on any of their original ranges", leaseHolder)
+			if newCount < oldCount {
+				madeProgress = true
 			}
 		}
-
-		return nil
+		if !madeProgress {
+			return errors.Newf("leaseholder %d made no progress on any of their original ranges", leaseHolder)
+		}
 	}
 
-	return testutils.SucceedsWithinError(compareWithBaseline, 20*time.Second)
+	return nil
 }
 
 // findRunningJob checks the current state of the TTL job and returns metadata
diff --git a/pkg/server/status.go b/pkg/server/status.go
@@ -86,7 +86,6 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
 	"github.com/cockroachdb/cockroach/pkg/util/tracing/tracingpb"
 	"github.com/cockroachdb/cockroach/pkg/util/uint128"
-	"github.com/cockroachdb/cockroach/pkg/util/uuid"
 	"github.com/cockroachdb/errors"
 	"github.com/cockroachdb/redact"
 	"github.com/google/pprof/profile"
@@ -416,11 +415,6 @@ func (b *baseStatusServer) localTxnIDResolution(
 ) *serverpb.TxnIDResolutionResponse {
 	txnIDCache := b.sqlServer.pgServer.SQLServer.GetTxnIDCache()
 
-	unresolvedTxnIDs := make(map[uuid.UUID]struct{}, len(req.TxnIDs))
-	for _, txnID := range req.TxnIDs {
-		unresolvedTxnIDs[txnID] = struct{}{}
-	}
-
 	resp := &serverpb.TxnIDResolutionResponse{
 		ResolvedTxnIDs: make([]contentionpb.ResolvedTxnID, 0, len(req.TxnIDs)),
 	}
@@ -434,12 +428,10 @@ func (b *baseStatusServer) localTxnIDResolution(
 		}
 	}
 
-	// If we encounter any transaction ID that we cannot resolve, we tell the
-	// txnID cache to drain its write buffer (note: The .DrainWriteBuffer() call
-	// is asynchronous). The client of this RPC will perform retries.
-	if len(unresolvedTxnIDs) > 0 {
-		txnIDCache.DrainWriteBuffer()
-	}
+	// Note(alyshan): TxnIDResolution is only called by the contention event resolver today.
+	// The resolver relies on these resolution calls to trigger a drain of the writer buffer
+	// on the txn id cache.
+	txnIDCache.DrainWriteBuffer()
 
 	return resp
 }
diff --git a/pkg/sql/contention/resolver.go b/pkg/sql/contention/resolver.go
@@ -73,7 +73,7 @@ const (
 	//    in-memory data corruption (shouldn't happen in normal circumstances,
 	//    since access to txnID cache is all synchronized). In this case, no
 	//    amount of retries will be able to resolveLocked the txnID.
-	retryBudgetForMissingResult = uint32(1)
+	retryBudgetForMissingResult = uint32(2)
 
 	// retryBudgetForRPCFailure is the number of times the resolverQueue will
 	// retry resolving until giving up. This needs to be a finite number to handle