Commit 706d137

sql/ttl: improve TTL replan decision logic
Replace calculatePlanGrowth with detectNodeAvailabilityChanges to make TTL job replanning less sensitive to span changes. The new logic focuses specifically on detecting when nodes become unavailable rather than reacting to all plan differences. The previous implementation would trigger replans for span splits/merges that don't actually indicate beneficial restart scenarios.

The new approach only considers nodes missing from the original plan, which typically indicates node failures where redistributing the work would benefit from restarting the job. It also adds a stability window so that a replan decision must fire on consecutive evaluations before taking effect. This should help alleviate spurious plan changes caused by range cache issues.

Fixes #150343

Epic: none

Release note (ops change): The sql.ttl.replan_flow_threshold cluster setting may have been set to 0 to work around the TTL replanner being too sensitive. This fix alleviates that sensitivity, so any cluster that set replan_flow_threshold to 0 can reset it back to the default.
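In practice the new gate works like this: a replan fires only when the fraction of original nodes missing from the freshly generated plan exceeds sql.ttl.replan_flow_threshold on sql.ttl.replan_stability_window consecutive checks. Below is a minimal standalone sketch of that gating, paraphrasing the replanDecider added in this commit; the function name and the example values are illustrative only, not part of the change.

// Sketch of the stability-window gating (illustrative; the real logic is
// replanDecider in the diff below).
package main

import (
	"fmt"
	"sync/atomic"
)

// shouldReplanNow returns true only after `window` consecutive evaluations in
// which the fraction of original nodes missing from the new plan exceeds
// `threshold`. An evaluation at or below the threshold resets the streak, and
// a firing decision resets it too, since the job restarts.
func shouldReplanNow(streak *atomic.Int64, missingFrac, threshold float64, window int64) bool {
	if threshold == 0 || missingFrac <= threshold {
		streak.Store(0)
		return false
	}
	if streak.Add(1) < window {
		return false // keep waiting for the stability window to fill
	}
	streak.Store(0)
	return true
}

func main() {
	var streak atomic.Int64
	// One of three original nodes is missing (frac ≈ 0.33) on two consecutive
	// checks, threshold 0.1, window 2: the first check only arms the counter.
	fmt.Println(shouldReplanNow(&streak, 1.0/3.0, 0.1, 2)) // false
	fmt.Println(shouldReplanNow(&streak, 1.0/3.0, 0.1, 2)) // true
}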
1 parent b8c2405 commit 706d137

2 files changed: 288 additions & 3 deletions


pkg/sql/ttl/ttljob/ttljob.go

Lines changed: 98 additions & 3 deletions
@@ -8,6 +8,7 @@ package ttljob
 import (
 	"context"
 	"math/rand"
+	"sync/atomic"
 	"time"
 
 	"github.com/cockroachdb/cockroach/pkg/base"
@@ -55,6 +56,14 @@ var replanFrequency = settings.RegisterDurationSetting(
 	settings.PositiveDuration,
 )
 
+var replanStabilityWindow = settings.RegisterIntSetting(
+	settings.ApplicationLevel,
+	"sql.ttl.replan_stability_window",
+	"number of consecutive replan evaluations required before triggering a replan; set to 1 to disable stability window",
+	2,
+	settings.PositiveInt,
+)
+
 // rowLevelTTLResumer implements the TTL job. The job can run on any node, but
 // the job node distributes SELECT/DELETE work via DistSQL to ttlProcessor
 // nodes. DistSQL divides work into spans that each ttlProcessor scans in a
@@ -65,6 +74,9 @@ type rowLevelTTLResumer struct {
 	physicalPlan *sql.PhysicalPlan
 	planCtx      *sql.PlanningCtx
 
+	// consecutiveReplanDecisions tracks how many consecutive times replan was deemed necessary.
+	consecutiveReplanDecisions *atomic.Int64
+
 	mu struct {
 		syncutil.Mutex
 		// lastUpdateTime is the wall time of the last job progress update.
@@ -296,7 +308,10 @@ func (t *rowLevelTTLResumer) Resume(ctx context.Context, execCtx interface{}) (r
 	// the TTL job to utilize those nodes for parallel work.
 	replanChecker, cancelReplanner := sql.PhysicalPlanChangeChecker(
 		ctx, t.physicalPlan, makePlan, jobExecCtx,
-		sql.ReplanOnChangedFraction(func() float64 { return replanThreshold.Get(&execCfg.Settings.SV) }),
+		replanDecider(t.consecutiveReplanDecisions,
+			func() int64 { return replanStabilityWindow.Get(&execCfg.Settings.SV) },
+			func() float64 { return replanThreshold.Get(&execCfg.Settings.SV) },
+		),
 		func() time.Duration { return replanFrequency.Get(&execCfg.Settings.SV) },
 	)
 
@@ -507,11 +522,91 @@ func (t *rowLevelTTLResumer) refreshProgress(
 	return newProgress, nil
 }
 
+// replanDecider returns a function that determines whether a TTL job should be
+// replanned based on changes in the physical execution plan. It compares the
+// old and new plans to detect node availability changes and decides if the
+// benefit of replanning (better parallelization) outweighs the cost of
+// restarting the job. It implements a stability window to avoid replanning
+// due to transient changes.
+func replanDecider(
+	consecutiveReplanDecisions *atomic.Int64,
+	stabilityWindowFn func() int64,
+	thresholdFn func() float64,
+) sql.PlanChangeDecision {
+	return func(ctx context.Context, oldPlan, newPlan *sql.PhysicalPlan) bool {
+		changed, growth := detectNodeAvailabilityChanges(oldPlan, newPlan)
+		threshold := thresholdFn()
+		shouldReplan := threshold != 0.0 && growth > threshold
+
+		stabilityWindow := stabilityWindowFn()
+
+		var currentDecisions int64
+		if shouldReplan {
+			currentDecisions = consecutiveReplanDecisions.Add(1)
+		} else {
+			consecutiveReplanDecisions.Store(0)
+			currentDecisions = 0
+		}
+
+		// If stability window is 1, replan immediately. Otherwise, require
+		// consecutive decisions to meet the window threshold.
+		replan := currentDecisions >= stabilityWindow
+
+		// Reset the counter when we decide to replan, since the job will restart
+		if replan {
+			consecutiveReplanDecisions.Store(0)
+		}
+
+		if shouldReplan || growth > 0.1 || log.V(1) {
+			log.Infof(ctx, "Re-planning would add or alter flows on %d nodes / %.2f, threshold %.2f, consecutive decisions %d/%d, replan %v",
+				changed, growth, threshold, currentDecisions, stabilityWindow, replan)
+		}
+
+		return replan
+	}
+}
+
+// detectNodeAvailabilityChanges analyzes differences between two physical plans
+// to determine if nodes have become unavailable. It returns the number of nodes
+// that are no longer available and the fraction of the original plan affected.
+//
+// The function focuses on detecting when nodes from the original plan are missing
+// from the new plan, which typically indicates node failures. When nodes fail,
+// their work gets redistributed to remaining nodes, making a job restart
+// beneficial for better parallelization. We ignore newly added nodes since
+// continuing the current job on existing nodes is usually more efficient than
+// restarting to incorporate new capacity.
+func detectNodeAvailabilityChanges(before, after *sql.PhysicalPlan) (int, float64) {
+	var changed int
+	beforeSpecs, beforeCleanup := before.GenerateFlowSpecs()
+	defer beforeCleanup(beforeSpecs)
+	afterSpecs, afterCleanup := after.GenerateFlowSpecs()
+	defer afterCleanup(afterSpecs)
+
+	// Count nodes from the original plan that are no longer present in the new plan.
+	// We only check nodes in beforeSpecs because we specifically want to detect
+	// when nodes that were doing work are no longer available, which typically
+	// indicates beneficial restart scenarios (node failures where work can be
+	// redistributed more efficiently).
+	for n := range beforeSpecs {
+		if _, ok := afterSpecs[n]; !ok {
+			changed++
+		}
+	}
+
+	var frac float64
+	if changed > 0 {
+		frac = float64(changed) / float64(len(beforeSpecs))
+	}
+	return changed, frac
+}
+
 func init() {
 	jobs.RegisterConstructor(jobspb.TypeRowLevelTTL, func(job *jobs.Job, settings *cluster.Settings) jobs.Resumer {
 		return &rowLevelTTLResumer{
-			job: job,
-			st:  settings,
+			job:                        job,
+			st:                         settings,
+			consecutiveReplanDecisions: &atomic.Int64{},
 		}
 	}, jobs.UsesTenantCostControl)
 }

pkg/sql/ttl/ttljob/ttljob_internal_test.go

Lines changed: 190 additions & 0 deletions
@@ -8,6 +8,7 @@ package ttljob
 import (
 	"context"
 	"fmt"
+	"sync/atomic"
 	"testing"
 
 	"github.com/cockroachdb/cockroach/pkg/base"
@@ -155,3 +156,192 @@ func TestTTLProgressLifecycle(t *testing.T) {
 	require.Equal(t, int64(1000), ttlProgress.JobDeletedRowCount)
 	require.Len(t, ttlProgress.ProcessorProgresses, 2)
 }
+
+func TestReplanDecider(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	defer log.Scope(t).Close(t)
+
+	testCases := []struct {
+		desc         string
+		beforeNodes  []base.SQLInstanceID
+		afterNodes   []base.SQLInstanceID
+		threshold    float64
+		expectReplan bool
+	}{
+		{
+			desc:         "nodes don't change",
+			beforeNodes:  []base.SQLInstanceID{1, 2, 3},
+			afterNodes:   []base.SQLInstanceID{1, 2, 3},
+			threshold:    0.1,
+			expectReplan: false,
+		},
+		{
+			desc:         "one node is shutdown",
+			beforeNodes:  []base.SQLInstanceID{1, 2, 3},
+			afterNodes:   []base.SQLInstanceID{1, 3},
+			threshold:    0.1,
+			expectReplan: true,
+		},
+		{
+			desc:         "one node is brought online",
+			beforeNodes:  []base.SQLInstanceID{1, 2, 3},
+			afterNodes:   []base.SQLInstanceID{1, 2, 3, 4},
+			threshold:    0.1,
+			expectReplan: false,
+		},
+		{
+			desc:         "one node is replaced",
+			beforeNodes:  []base.SQLInstanceID{1, 2, 3},
+			afterNodes:   []base.SQLInstanceID{1, 2, 4},
+			threshold:    0.1,
+			expectReplan: true,
+		},
+		{
+			desc:         "multiple nodes shutdown",
+			beforeNodes:  []base.SQLInstanceID{1, 2, 3, 4, 5},
+			afterNodes:   []base.SQLInstanceID{1, 3},
+			threshold:    0.1,
+			expectReplan: true,
+		},
+		{
+			desc:         "all nodes replaced",
+			beforeNodes:  []base.SQLInstanceID{1, 2, 3},
+			afterNodes:   []base.SQLInstanceID{4, 5, 6},
+			threshold:    0.1,
+			expectReplan: true,
+		},
+		{
+			desc:         "threshold boundary: exactly at threshold",
+			beforeNodes:  []base.SQLInstanceID{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+			afterNodes:   []base.SQLInstanceID{1, 2, 3, 4, 5, 6, 7, 8, 9},
+			threshold:    0.1,
+			expectReplan: false,
+		},
+		{
+			desc:         "threshold boundary: just above threshold",
+			beforeNodes:  []base.SQLInstanceID{1, 2, 3, 4, 5, 6, 7, 8, 9},
+			afterNodes:   []base.SQLInstanceID{1, 2, 3, 4, 5, 6, 7, 8},
+			threshold:    0.1,
+			expectReplan: true,
+		},
+		{
+			desc:         "threshold disabled",
+			beforeNodes:  []base.SQLInstanceID{1, 2, 3},
+			afterNodes:   []base.SQLInstanceID{1, 2},
+			threshold:    0.0,
+			expectReplan: false,
+		},
+		{
+			desc:         "large scale: many nodes lost",
+			beforeNodes:  []base.SQLInstanceID{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20},
+			afterNodes:   []base.SQLInstanceID{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+			threshold:    0.1,
+			expectReplan: true,
+		},
+		{
+			desc:         "mixed scenario: nodes added and removed",
+			beforeNodes:  []base.SQLInstanceID{1, 2, 3, 4, 5},
+			afterNodes:   []base.SQLInstanceID{1, 3, 5, 6, 7, 8},
+			threshold:    0.1,
+			expectReplan: true,
+		},
+	}
+
+	for _, testCase := range testCases {
+		t.Run(testCase.desc, func(t *testing.T) {
+			// Create atomic counter and set stability window to 1 for immediate replan (current behavior)
+			consecutiveReplanDecisions := &atomic.Int64{}
+			decider := replanDecider(consecutiveReplanDecisions, func() int64 { return 1 }, func() float64 { return testCase.threshold })
+			ctx := context.Background()
+			oldPlan := &sql.PhysicalPlan{}
+			oldPlan.PhysicalInfrastructure = &physicalplan.PhysicalInfrastructure{Processors: nil}
+			for _, nodeID := range testCase.beforeNodes {
+				oldPlan.Processors = append(oldPlan.Processors, physicalplan.Processor{SQLInstanceID: nodeID})
+			}
+			newPlan := &sql.PhysicalPlan{}
+			newPlan.PhysicalInfrastructure = &physicalplan.PhysicalInfrastructure{Processors: nil}
+			for _, nodeID := range testCase.afterNodes {
+				newPlan.Processors = append(newPlan.Processors, physicalplan.Processor{SQLInstanceID: nodeID})
+			}
+			replan := decider(ctx, oldPlan, newPlan)
+			require.Equal(t, testCase.expectReplan, replan)
+		})
+	}
+}
+
+func TestReplanDeciderStabilityWindow(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	defer log.Scope(t).Close(t)
+
+	testCases := []struct {
+		desc            string
+		stabilityWindow int64
+		threshold       float64
+		planChanges     [][]base.SQLInstanceID // sequence of plan changes
+		expectedReplans []bool                 // expected replan decision for each change
+	}{
+		{
+			desc:            "stability window 1 - immediate replan",
+			stabilityWindow: 1,
+			threshold:       0.1,
+			planChanges:     [][]base.SQLInstanceID{{2, 3}, {2, 4}, {3, 4}},
+			expectedReplans: []bool{true, true, true},
+		},
+		{
+			desc:            "stability window 2 - requires consecutive decisions",
+			stabilityWindow: 2,
+			threshold:       0.1,
+			planChanges:     [][]base.SQLInstanceID{{2, 3}, {2, 4}, {1, 2, 3}},
+			expectedReplans: []bool{false, true, false}, // first false, second true (meets window), third false (reset)
+		},
+		{
+			desc:            "stability window 2 - interrupted sequence",
+			stabilityWindow: 2,
+			threshold:       0.1,
+			planChanges:     [][]base.SQLInstanceID{{2, 3}, {1, 2, 3}, {2, 4}, {3, 4}},
+			expectedReplans: []bool{false, false, false, true}, // interrupted, then consecutive
+		},
+		{
+			desc:            "stability window 3 - three consecutive needed",
+			stabilityWindow: 3,
+			threshold:       0.1,
+			planChanges:     [][]base.SQLInstanceID{{2, 3}, {2, 4}, {3, 4}, {1, 2, 3}},
+			expectedReplans: []bool{false, false, true, false}, // third one triggers replan
+		},
+	}
+
+	for _, testCase := range testCases {
+		t.Run(testCase.desc, func(t *testing.T) {
+			consecutiveReplanDecisions := &atomic.Int64{}
+			decider := replanDecider(
+				consecutiveReplanDecisions,
+				func() int64 { return testCase.stabilityWindow },
+				func() float64 { return testCase.threshold },
+			)
+			ctx := context.Background()
+
+			// Use initial plan with nodes 1,2,3
+			initialPlan := &sql.PhysicalPlan{}
+			initialPlan.PhysicalInfrastructure = &physicalplan.PhysicalInfrastructure{Processors: nil}
+			for _, nodeID := range []base.SQLInstanceID{1, 2, 3} {
+				initialPlan.Processors = append(initialPlan.Processors, physicalplan.Processor{SQLInstanceID: nodeID})
+			}
+
+			for i, nodes := range testCase.planChanges {
+				newPlan := &sql.PhysicalPlan{}
+				newPlan.PhysicalInfrastructure = &physicalplan.PhysicalInfrastructure{Processors: nil}
+				for _, nodeID := range nodes {
+					newPlan.Processors = append(newPlan.Processors, physicalplan.Processor{SQLInstanceID: nodeID})
+				}
+
+				replan := decider(ctx, initialPlan, newPlan)
+				if replan != testCase.expectedReplans[i] {
+					t.Errorf("step %d: expected replan=%v, got %v (consecutive count: %d)", i, testCase.expectedReplans[i], replan, consecutiveReplanDecisions.Load())
+				}
+
+				// Update initial plan for next iteration to maintain state
+				initialPlan = newPlan
+			}
+		})
+	}
+}
