
Commit 55bde38

kvserver: add kv.closed_timestamp.policy_switch_latency_bucket_exceed_threshold

This commit introduces a new cluster setting:
kv.closed_timestamp.policy_switch_latency_bucket_exceed_threshold. It defines
the fraction of the closed timestamp policy bucket width that must be exceeded
before a policy switch is triggered. This helps prevent aggressive policy
changes for ranges near bucket boundaries, reducing excessive updates sent via
side transport.

Part of: #143890
Release note: none
Epic: none

1 parent dc26ba0

File tree

4 files changed: +305 -2 lines changed


pkg/kv/kvserver/closedts/policy_calculation.go

Lines changed: 83 additions & 1 deletion
@@ -14,8 +14,90 @@ import (
 )

 // FindBucketBasedOnNetworkRTT maps a network RTT to a closed timestamp policy
-// bucket.
+// bucket with zero dampening.
 func FindBucketBasedOnNetworkRTT(networkRTT time.Duration) ctpb.RangeClosedTimestampPolicy {
+	return FindBucketBasedOnNetworkRTTWithDampening(ctpb.LEAD_FOR_GLOBAL_READS_WITH_NO_LATENCY_INFO, networkRTT, 0)
+}
+
+// FindBucketBasedOnNetworkRTTWithDampening calculates a new closed timestamp policy
+// based on the old policy, the network RTT, and a boundary percentage.
+//
+// 1. If the old policy or the new policy is LEAD_FOR_GLOBAL_READS_WITH_NO_LATENCY_INFO,
+//    the new policy is returned.
+//
+// 2. If the new policy jumps to a non-adjacent bucket, the new policy is returned.
+//
+// 3. If dampening is 0 or the policy is unchanged, the new policy is returned.
+//
+// 4. Otherwise, the new policy is returned if and only if the network RTT has
+//    crossed the boundary of the new policy.
+//
+// Policy change diagram with 20% boundary between two adjacent buckets:
+//
+// Example: boundaryPercent = 20%
+// Case 1: Moving to higher latency bucket (old policy < new policy)
+// RTT (ms) |----------20ms----------|----------40ms----------|
+// Policy   |      <20ms bucket      |      <40ms bucket      |
+//
+//                                                            ^
+//                                                            |--4ms--| RTT must be >=
+//                                                            |  (40ms + 20ms*20%) = 44ms
+//                                                            |  to move to the <60ms bucket
+//
+// Case 2: Moving to lower latency bucket (old policy > new policy)
+// RTT (ms) |----------20ms----------|----------40ms----------|
+// Policy   |      <20ms bucket      |      <40ms bucket      |
+//
+//                                   ^
+//         RTT must go below |--4ms--|
+//         (20ms - 20ms*20%) = 16ms  |
+//         to move to <20ms bucket   |
+func FindBucketBasedOnNetworkRTTWithDampening(
+	oldPolicy ctpb.RangeClosedTimestampPolicy, networkRTT time.Duration, boundaryPercent float64,
+) ctpb.RangeClosedTimestampPolicy {
+	// Calculate the new policy based on network RTT.
+	newPolicy := findBucketBasedOnNetworkRTT(networkRTT)
+
+	if newPolicy == ctpb.LEAD_FOR_GLOBAL_READS_WITH_NO_LATENCY_INFO ||
+		oldPolicy == ctpb.LEAD_FOR_GLOBAL_READS_WITH_NO_LATENCY_INFO || boundaryPercent == 0 {
+		return newPolicy
+	}
+
+	// Apply the new policy if the policy is unchanged, or if there's a
+	// non-adjacent bucket jump.
+	if newPolicy == oldPolicy || math.Abs(float64(newPolicy-oldPolicy)) > 1 {
+		return newPolicy
+	}
+
+	// Calculate the bucket number by subtracting the base policy and adjusting
+	// for zero-based indexing.
+	bucket := int(newPolicy) - int(ctpb.LEAD_FOR_GLOBAL_READS_WITH_NO_LATENCY_INFO) - 1
+	intervalNanos := float64(closedTimestampPolicyBucketWidth.Nanoseconds())
+	switch {
+	case oldPolicy < newPolicy:
+		// The new policy has a higher latency threshold. Only switch to the
+		// higher latency bucket if the RTT exceeds the bucket boundary.
+		higherLatencyBucketThreshold := time.Duration((float64(bucket) + boundaryPercent) * intervalNanos)
+		if networkRTT >= higherLatencyBucketThreshold {
+			return newPolicy
+		}
+		return oldPolicy
+	case oldPolicy > newPolicy:
+		// The new policy has a lower latency threshold. Only switch to the lower
+		// latency bucket if the RTT is below the bucket boundary.
+		lowerLatencyBucketThreshold := time.Duration((float64(bucket) + 1 - boundaryPercent) * intervalNanos)
+		if networkRTT < lowerLatencyBucketThreshold {
+			return newPolicy
+		}
+		return oldPolicy
+	default:
+		panic("unexpected condition")
+	}
+}
+
+// findBucketBasedOnNetworkRTT maps a network RTT to a closed timestamp policy
+// bucket.
+func findBucketBasedOnNetworkRTT(networkRTT time.Duration) ctpb.RangeClosedTimestampPolicy {
 	// If maxLatency is negative (i.e. no peer latency is provided), return
 	// LEAD_FOR_GLOBAL_READS_WITH_NO_LATENCY_INFO
 	if networkRTT < 0 {
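
To make the threshold arithmetic above concrete, here is a minimal standalone
sketch of the two boundary computations, assuming the 20ms bucket width shown
in the diagram (closedTimestampPolicyBucketWidth itself is defined outside
this diff) and the zero-based bucket indexes used by the code:

    package main

    import (
        "fmt"
        "time"
    )

    func main() {
        // Assumption: a 20ms bucket width, as in the diagram above.
        const bucketWidth = 20 * time.Millisecond
        const boundaryPercent = 0.2
        intervalNanos := float64(bucketWidth.Nanoseconds())

        // Moving up from the <40ms bucket into the <60ms bucket: the <60ms
        // bucket has zero-based index 2, so the RTT must reach
        // (2 + 0.2) * 20ms = 44ms before the policy switches.
        higher := time.Duration((2 + boundaryPercent) * intervalNanos)
        fmt.Println(higher) // 44ms

        // Moving down from the <40ms bucket into the <20ms bucket: the <20ms
        // bucket has index 0, so the RTT must drop below
        // (0 + 1 - 0.2) * 20ms = 16ms before the policy switches.
        lower := time.Duration((0 + 1 - boundaryPercent) * intervalNanos)
        fmt.Println(lower) // 16ms
    }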

pkg/kv/kvserver/closedts/policy_calculation_test.go

Lines changed: 204 additions & 0 deletions
@@ -360,6 +360,15 @@ func TestNetworkRTTAndPolicyCalculations(t *testing.T) {
 				"expected policy %v for RTT %v, got %v",
 				tc.expectedPolicy, tc.networkRTT, policy)

+			// Test RTT -> Policy with 0 percent dampening. We expect the same outcome
+			// as FindBucketBasedOnNetworkRTT regardless of oldPolicy.
+			for oldPolicy := ctpb.LEAD_FOR_GLOBAL_READS_WITH_NO_LATENCY_INFO; oldPolicy <= ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_EQUAL_OR_GREATER_THAN_300MS; oldPolicy++ {
+				newPolicy := FindBucketBasedOnNetworkRTTWithDampening(oldPolicy, tc.networkRTT, 0)
+				require.Equal(t, tc.expectedPolicy, newPolicy,
+					"expected policy %v for RTT %v, got %v",
+					tc.expectedPolicy, tc.networkRTT, newPolicy)
+			}
+
 			// Test Policy -> RTT conversion.
 			rtt := computeNetworkRTTBasedOnPolicy(policy)
 			require.Equal(t, tc.expectedRTT, rtt,
@@ -368,3 +377,198 @@ func TestNetworkRTTAndPolicyCalculations(t *testing.T) {
 		})
 	}
 }
+
+// TestRefreshPolicyWithDampening tests that the dampening logic backing
+// Replica.RefreshPolicy works as expected with different dampening fractions.
+func TestRefreshPolicyWithDampening(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	defer log.Scope(t).Close(t)
+
+	testCases := []struct {
+		name              string
+		dampeningFraction float64
+		oldPolicy         ctpb.RangeClosedTimestampPolicy
+		networkRTT        time.Duration
+		expectedPolicy    ctpb.RangeClosedTimestampPolicy
+	}{
+		{
+			name:              "from no latency info to low latency",
+			dampeningFraction: 0.2,
+			oldPolicy:         ctpb.LEAD_FOR_GLOBAL_READS_WITH_NO_LATENCY_INFO,
+			networkRTT:        10 * time.Millisecond,
+			expectedPolicy:    ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_20MS,
+		},
+		{
+			name:              "from low latency to no latency info",
+			dampeningFraction: 0.2,
+			oldPolicy:         ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_20MS,
+			networkRTT:        -1 * time.Millisecond,
+			expectedPolicy:    ctpb.LEAD_FOR_GLOBAL_READS_WITH_NO_LATENCY_INFO,
+		},
+		{
+			name:              "latency increases but below the lower bound threshold",
+			dampeningFraction: 0.2,
+			oldPolicy:         ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_40MS,
+			// 42ms is above 40ms but below the 40ms+20ms*0.2=44ms boundary.
+			networkRTT:     42 * time.Millisecond,
+			expectedPolicy: ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_40MS,
+		},
+		{
+			name:              "latency increases and above the lower bound threshold",
+			dampeningFraction: 0.2,
+			oldPolicy:         ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_40MS,
+			// 44ms is at the 40ms+20ms*0.2=44ms boundary (the switch uses >=).
+			networkRTT:     44 * time.Millisecond,
+			expectedPolicy: ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_60MS,
+		},
+		{
+			name:              "latency increases to next bucket and above its upper bound threshold",
+			dampeningFraction: 0.2,
+			oldPolicy:         ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_20MS,
+			// 38ms is above 20ms+20ms*0.2=24ms and above the 40ms-20ms*0.2=36ms threshold.
+			networkRTT:     38 * time.Millisecond,
+			expectedPolicy: ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_40MS,
+		},
+		{
+			name:              "latency drops to previous bucket but above the upper bound threshold",
+			dampeningFraction: 0.2,
+			oldPolicy:         ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_40MS,
+			// 18ms is below 20ms but above the 20ms-20ms*0.2=16ms boundary.
+			networkRTT:     18 * time.Millisecond,
+			expectedPolicy: ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_40MS,
+		},
+		{
+			name:              "latency drops to previous bucket and below the upper bound threshold",
+			dampeningFraction: 0.2,
+			oldPolicy:         ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_40MS,
+			// 14ms is below 20ms and below the 20ms-20ms*0.2=16ms boundary.
+			networkRTT:     14 * time.Millisecond,
+			expectedPolicy: ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_20MS,
+		},
+		{
+			name:              "latency drops to previous bucket and below the lower bound threshold",
+			dampeningFraction: 0.2,
+			oldPolicy:         ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_40MS,
+			// 3ms is below 20ms, below the 20ms-20ms*0.2=16ms boundary, and
+			// below 20ms*0.2=4ms.
+			networkRTT:     3 * time.Millisecond,
+			expectedPolicy: ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_20MS,
+		},
+		{
+			name:              "boundary case at 300ms",
+			dampeningFraction: 0.2,
+			oldPolicy:         ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_300MS,
+			// 300ms is below the 300ms+20ms*0.2=304ms boundary.
+			networkRTT:     300 * time.Millisecond,
+			expectedPolicy: ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_300MS,
+		},
+		{
+			name:              "boundary case at 320ms",
+			dampeningFraction: 0.2,
+			oldPolicy:         ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_300MS,
+			// 320ms is above the 300ms+20ms*0.2=304ms boundary.
+			networkRTT:     320 * time.Millisecond,
+			expectedPolicy: ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_EQUAL_OR_GREATER_THAN_300MS,
+		},
+		{
+			name:              "jump to higher bucket case at 600ms",
+			dampeningFraction: 0.2,
+			oldPolicy:         ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_300MS,
+			// 600ms is above the 300ms+20ms*0.2=304ms boundary.
+			networkRTT:     600 * time.Millisecond,
+			expectedPolicy: ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_EQUAL_OR_GREATER_THAN_300MS,
+		},
+		// Zero Dampening Cases (Most Sensitive)
+		{
+			name:              "zero dampening - tiny increase",
+			dampeningFraction: 0.0,
+			oldPolicy:         ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_40MS,
+			networkRTT:        40 * time.Millisecond, // Tiny increase
+			expectedPolicy:    ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_60MS,
+		},
+		{
+			name:              "zero dampening - tiny decrease",
+			dampeningFraction: 0.0,
+			oldPolicy:         ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_60MS,
+			networkRTT:        39 * time.Millisecond,
+			expectedPolicy:    ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_40MS,
+		},
+		// 100% Dampening Cases (Most Conservative)
+		{
+			name:              "full dampening - significant increase",
+			dampeningFraction: 1.0,
+			oldPolicy:         ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_40MS,
+			networkRTT:        58 * time.Millisecond,
+			expectedPolicy:    ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_40MS,
+		},
+		{
+			name:              "full dampening - multi-bucket jump",
+			dampeningFraction: 1.0,
+			oldPolicy:         ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_40MS,
+			networkRTT:        60 * time.Millisecond,
+			expectedPolicy:    ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_80MS,
+		},
+		// 0.001 Dampening Cases (Very Sensitive but not quite zero)
+		{
+			name:              "0.001 dampening - small increase",
+			dampeningFraction: 0.001,
+			oldPolicy:         ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_40MS,
+			// Just barely above 40ms + (20ms * 0.001) = 40.02ms.
+			networkRTT:     41 * time.Millisecond,
+			expectedPolicy: ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_60MS,
+		},
+		{
+			name:              "0.001 dampening - small decrease",
+			dampeningFraction: 0.001,
+			oldPolicy:         ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_60MS,
+			// Just barely below 40ms - (20ms * 0.001) = 39.98ms.
+			networkRTT:     39 * time.Millisecond,
+			expectedPolicy: ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_40MS,
+		},
+		{
+			name:              "0.001 dampening - no change on small increase",
+			dampeningFraction: 0.001,
+			oldPolicy:         ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_40MS,
+			// Just below 40ms + (20ms * 0.001) = 40.02ms.
+			networkRTT:     40 * time.Millisecond,
+			expectedPolicy: ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_40MS,
+		},
+		{
+			name:              "0.001 dampening - no change on small decrease",
+			dampeningFraction: 0.001,
+			oldPolicy:         ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_60MS,
+			// Just above 40ms - (20ms * 0.001) = 39.98ms.
+			networkRTT:     time.Duration(39.99 * float64(time.Millisecond)),
+			expectedPolicy: ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_60MS,
+		},
+		{
+			name:              "0.001 dampening - boundary at 300ms",
+			dampeningFraction: 0.001,
+			oldPolicy:         ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_300MS,
+			// Just barely above 300ms + (20ms * 0.001) = 300.02ms.
+			networkRTT:     301 * time.Millisecond,
+			expectedPolicy: ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_EQUAL_OR_GREATER_THAN_300MS,
+		},
+		{
+			name:              "0.001 dampening - multi-bucket jump to higher latency",
+			dampeningFraction: 0.001,
+			oldPolicy:         ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_40MS,
+			networkRTT:        100 * time.Millisecond,
+			expectedPolicy:    ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_120MS,
+		},
+		{
+			name:              "0.001 dampening - multi-bucket jump to lower latency",
+			dampeningFraction: 0.001,
+			oldPolicy:         ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_80MS,
+			// Above 40ms - (20ms * 0.001) = 39.98ms, but it is a multi-bucket jump.
+			networkRTT:     time.Duration(39.99 * float64(time.Millisecond)),
+			expectedPolicy: ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_40MS,
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			newPolicy := FindBucketBasedOnNetworkRTTWithDampening(tc.oldPolicy, tc.networkRTT, tc.dampeningFraction)
+			require.Equal(t, tc.expectedPolicy, newPolicy)
+		})
+	}
+}
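
The table above exercises single calls; the payoff of the dampening shows up
across a sequence of refreshes. Here is a minimal illustrative sketch (the RTT
sequence is invented; the 20ms bucket width is assumed from the diagram) of how
feeding each result back in as the next old policy suppresses flapping around
the 40ms boundary:

    // With a 0.2 dampening fraction, the policy only moves up once the RTT
    // reaches (2 + 0.2) * 20ms = 44ms, so jitter below 44ms causes no switch.
    policy := ctpb.LEAD_FOR_GLOBAL_READS_LATENCY_LESS_THAN_40MS
    for _, rtt := range []time.Duration{
        39 * time.Millisecond, // stays: same bucket
        41 * time.Millisecond, // stays: raw bucket is <60ms, but 41ms < 44ms
        39 * time.Millisecond, // stays: same bucket
        43 * time.Millisecond, // stays: 43ms < 44ms
        45 * time.Millisecond, // switches to <60ms: 45ms >= 44ms
    } {
        policy = FindBucketBasedOnNetworkRTTWithDampening(policy, rtt, 0.2)
    }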

pkg/kv/kvserver/closedts/setting.go

Lines changed: 13 additions & 0 deletions
@@ -85,3 +85,16 @@ var LeadForGlobalReadsAutoTuneEnabled = settings.RegisterBoolSetting(
 	metamorphic.ConstantWithTestBool("kv.closed_timestamp.lead_for_global_reads_auto_tune.enabled", false),
 	settings.WithPublic,
 )
+
+// PolicySwitchWhenLatencyExceedsBucketFraction determines the threshold for
+// changing the closed timestamp policy based on observed latency between
+// leaseholders and their furthest follower. This is used to prevent frequent
+// changes in the closed timestamp policy when the latency is close to the
+// boundary of the policy bucket. Setting this to 0 disables the dampening.
+var PolicySwitchWhenLatencyExceedsBucketFraction = settings.RegisterFloatSetting(
+	settings.SystemOnly,
+	"kv.closed_timestamp.policy_switch_latency_bucket_exceed_threshold",
+	"the fraction of the closed timestamp policy bucket width which needs to be "+
+		"exceeded before the closed timestamp policy will be changed",
+	0.2,
+)
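
As a usage sketch (Override and Get follow the general cluster-settings API
shape and are assumed here rather than shown in this diff), a test that wants
the undampened behavior could zero the setting and read it back:

    ctx := context.Background()
    st := cluster.MakeTestingClusterSettings()
    // 0 disables the dampening, making the dampened lookup degrade to the
    // plain bucket lookup (see FindBucketBasedOnNetworkRTTWithDampening).
    closedts.PolicySwitchWhenLatencyExceedsBucketFraction.Override(ctx, &st.SV, 0)
    frac := closedts.PolicySwitchWhenLatencyExceedsBucketFraction.Get(&st.SV) // 0
    _ = frac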

pkg/kv/kvserver/replica.go

Lines changed: 5 additions & 1 deletion
@@ -1373,7 +1373,11 @@ func (r *Replica) RefreshPolicy(latencies map[roachpb.NodeID]time.Duration) {
 			}
 			maxLatency = max(maxLatency, peerLatency)
 		}
-		return closedts.FindBucketBasedOnNetworkRTT(maxLatency)
+		return closedts.FindBucketBasedOnNetworkRTTWithDampening(
+			ctpb.RangeClosedTimestampPolicy(r.cachedClosedTimestampPolicy.Load()),
+			maxLatency,
+			closedts.PolicySwitchWhenLatencyExceedsBucketFraction.Get(&r.store.GetStoreConfig().Settings.SV),
+		)
 	}
 	r.cachedClosedTimestampPolicy.Store(int32(policy()))
 }
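
Note the feedback loop here: the value most recently stored in
r.cachedClosedTimestampPolicy is loaded back as the old policy on the next
refresh, which is what gives the dampening its hysteresis. With the setting at
its default of 0.2, a range whose measured RTT hovers near a bucket edge keeps
its current policy until the RTT moves a fifth of a bucket width past the
boundary.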
