server: make TestNodeStatusWritten robust against background activity

tbg · tbg · commit b4a64c198edd · 2025-12-17T14:46:17.000Z
The test was flaky because it asserted exact metric values which could be affected by background system activity (GC, intent resolution, migrations). Rewrite the test to use a simpler, more robust approach: - Use testutils.SucceedsSoon with unique keys per attempt to verify that livecount increases after writes - Disable the split queue to ensure replica count only changes due to our explicit AdminSplit - Remove the complex compareNodeStatus helper in favor of direct assertions Fixes: #159662 Release note: None
diff --git a/pkg/server/node_test.go b/pkg/server/node_test.go
@@ -340,126 +340,6 @@ func TestCorruptedClusterID(t *testing.T) {
 	}
 }
 
-// compareNodeStatus ensures that the actual node status for the passed in
-// node is updated correctly. It checks that the Node Descriptor, StoreIDs,
-// RangeCount, StartedAt, ReplicatedRangeCount and are exactly correct and that
-// the bytes and counts for Live, Key and Val are at least the expected value.
-// And that UpdatedAt has increased.
-// The latest actual stats are returned.
-func compareNodeStatus(
-	t *testing.T,
-	ts serverutils.TestServerInterface,
-	expectedNodeStatus *statuspb.NodeStatus,
-	testNumber int,
-) *statuspb.NodeStatus {
-	// ========================================
-	// Read NodeStatus from server and validate top-level fields.
-	// ========================================
-	nodeStatusKey := keys.NodeStatusKey(ts.NodeID())
-	nodeStatus := &statuspb.NodeStatus{}
-	if err := ts.DB().GetProto(context.Background(), nodeStatusKey, nodeStatus); err != nil {
-		t.Fatalf("%d: failure getting node status: %s", testNumber, err)
-	}
-
-	// Descriptor values should be exactly equal to expected.
-	if a, e := nodeStatus.Desc, expectedNodeStatus.Desc; !reflect.DeepEqual(a, e) {
-		t.Errorf("%d: Descriptor does not match expected.\nexpected: %s\nactual: %s", testNumber, &e, &a)
-	}
-
-	// ========================================
-	// Ensure all expected stores are represented in the node status.
-	// ========================================
-	storesToMap := func(ns *statuspb.NodeStatus) map[roachpb.StoreID]statuspb.StoreStatus {
-		strMap := make(map[roachpb.StoreID]statuspb.StoreStatus, len(ns.StoreStatuses))
-		for _, str := range ns.StoreStatuses {
-			strMap[str.Desc.StoreID] = str
-		}
-		return strMap
-	}
-	actualStores := storesToMap(nodeStatus)
-	expectedStores := storesToMap(expectedNodeStatus)
-
-	if a, e := len(actualStores), len(expectedStores); a != e {
-		t.Errorf("%d: actual status contained %d stores, expected %d", testNumber, a, e)
-	}
-	for key := range expectedStores {
-		if _, ok := actualStores[key]; !ok {
-			t.Errorf("%d: actual node status did not contain expected store %d", testNumber, key)
-		}
-	}
-	if t.Failed() {
-		t.FailNow()
-	}
-
-	// ========================================
-	// Ensure all metric sets (node and store level) are consistent with
-	// expected status.
-	// ========================================
-
-	// CompareMetricMaps accepts an actual and expected metric maps, along with
-	// two lists of string keys. For metrics with keys in the 'equal' map, the
-	// actual value must be equal to the expected value. For keys in the
-	// 'greater' map, the actual value must be greater than or equal to the
-	// expected value.
-	compareMetricMaps := func(actual, expected map[string]float64, equal, greater []string) {
-		// Make sure the actual value map contains all values in expected map.
-		for key := range expected {
-			if _, ok := actual[key]; !ok {
-				t.Errorf("%d: actual node status did not contain expected metric %s", testNumber, key)
-			}
-		}
-		if t.Failed() {
-			return
-		}
-
-		// For each equal key, ensure that the actual value is equal to expected
-		// key.
-		for _, key := range equal {
-			if _, ok := actual[key]; !ok {
-				t.Errorf("%d, actual node status did not contain expected 'equal' metric key %s", testNumber, key)
-				continue
-			}
-			if a, e := actual[key], expected[key]; a != e {
-				t.Errorf("%d: %s does not match expected value.\nExpected %f, Actual %f", testNumber, key, e, a)
-			}
-		}
-		for _, key := range greater {
-			if _, ok := actual[key]; !ok {
-				t.Errorf("%d: actual node status did not contain expected 'greater' metric key %s", testNumber, key)
-				continue
-			}
-			if a, e := actual[key], expected[key]; a < e {
-				t.Errorf("%d: %s is not greater than or equal to expected value.\nExpected %f, Actual %f", testNumber, key, e, a)
-			}
-		}
-	}
-
-	compareMetricMaps(nodeStatus.Metrics, expectedNodeStatus.Metrics, nil, []string{
-		"exec.success",
-		"exec.error",
-	})
-
-	for key := range actualStores {
-		// Directly verify a subset of metrics which have predictable output.
-		compareMetricMaps(actualStores[key].Metrics, expectedStores[key].Metrics,
-			[]string{
-				"replicas",
-				"replicas.leaseholders",
-			},
-			[]string{
-				"livecount",
-				"keycount",
-				"valcount",
-			})
-	}
-
-	if t.Failed() {
-		t.FailNow()
-	}
-
-	return nodeStatus
-}
-
 // TestNodeEmitsDiskSlowEvents verifies that disk slow events are emitted for
 // each store that is slow.
 func TestNodeEmitsDiskSlowEvents(t *testing.T) {
@@ -612,6 +492,11 @@ func TestNodeStatusWritten(t *testing.T) {
 	ts, _, kvDB := serverutils.StartServer(t, base.TestServerArgs{
 		DefaultTestTenant: base.TestIsSpecificToStorageLayerAndNeedsASystemTenant,
 		DisableEventLog:   true,
+		Knobs: base.TestingKnobs{
+			Store: &kvserver.StoreTestingKnobs{
+				DisableSplitQueue: true,
+			},
+		},
 	})
 	defer ts.Stopper().Stop(context.Background())
 	ctx := context.Background()
@@ -625,7 +510,6 @@ func TestNodeStatusWritten(t *testing.T) {
 	s.WaitForInit()
 
 	content := "junk"
-	leftKey := "a"
 
 	// Scan over all keys to "wake up" all replicas (force a lease holder election).
 	if _, err := kvDB.Scan(context.Background(), keys.MetaMax, keys.MaxKey, 0); err != nil {
@@ -648,50 +532,6 @@ func TestNodeStatusWritten(t *testing.T) {
 		return nil
 	})
 
-	// ========================================
-	// Construct an initial expectation for NodeStatus to compare to the first
-	// status produced by the server.
-	// ========================================
-	expectedNodeStatus := &statuspb.NodeStatus{
-		Desc:      ts.Node().(*Node).Descriptor,
-		StartedAt: 0,
-		UpdatedAt: 0,
-		Metrics: map[string]float64{
-			"exec.success": 0,
-			"exec.error":   0,
-		},
-	}
-
-	expectedStoreStatuses := make(map[roachpb.StoreID]statuspb.StoreStatus)
-	if err := ts.GetStores().(*kvserver.Stores).VisitStores(func(s *kvserver.Store) error {
-		desc, err := s.Descriptor(ctx, false /* useCached */)
-		if err != nil {
-			t.Fatal(err)
-		}
-		expectedReplicas := 0
-		if s.StoreID() == roachpb.StoreID(1) {
-			expectedReplicas = initialRanges
-		}
-		stat := statuspb.StoreStatus{
-			Desc: *desc,
-			Metrics: map[string]float64{
-				"replicas":              float64(expectedReplicas),
-				"replicas.leaseholders": float64(expectedReplicas),
-				"livebytes":             0,
-				"keybytes":              0,
-				"valbytes":              0,
-				"livecount":             0,
-				"keycount":              0,
-				"valcount":              0,
-			},
-		}
-		expectedNodeStatus.StoreStatuses = append(expectedNodeStatus.StoreStatuses, stat)
-		expectedStoreStatuses[s.StoreID()] = stat
-		return nil
-	}); err != nil {
-		t.Fatal(err)
-	}
-
 	// Function to force summaries to be written synchronously, including all
 	// data currently in the event pipeline. Only one of the stores has
 	// replicas, so there are no concerns related to quorum writes; if there
@@ -707,82 +547,110 @@ func TestNodeStatusWritten(t *testing.T) {
 		}
 	}
 
-	// Verify initial status.
-	forceWriteStatus()
-	expectedNodeStatus = compareNodeStatus(t, ts, expectedNodeStatus, 1)
-	for _, s := range expectedNodeStatus.StoreStatuses {
-		expectedStoreStatuses[s.Desc.StoreID] = s
+	// Helper to read current node status.
+	readNodeStatus := func() *statuspb.NodeStatus {
+		nodeStatusKey := keys.NodeStatusKey(ts.NodeID())
+		nodeStatus := &statuspb.NodeStatus{}
+		if err := ts.DB().GetProto(ctx, nodeStatusKey, nodeStatus); err != nil {
+			t.Fatalf("failure getting node status: %s", err)
+		}
+		return nodeStatus
+	}
+
+	// Helper to get store metrics from node status.
+	getStoreMetrics := func(ns *statuspb.NodeStatus, storeID roachpb.StoreID) map[string]float64 {
+		for _, ss := range ns.StoreStatuses {
+			if ss.Desc.StoreID == storeID {
+				return ss.Metrics
+			}
+		}
+		t.Fatalf("store %d not found in node status", storeID)
+		return nil
 	}
 
 	// ========================================
-	// Put some data into the K/V store and confirm change to status.
+	// Verify initial status has expected structure.
 	// ========================================
+	forceWriteStatus()
+	initialStatus := readNodeStatus()
 
-	splitKey := "b"
-	rightKey := "c"
-
-	// Write some values left and right of the proposed split key.
-	if err := kvDB.Put(ctx, leftKey, content); err != nil {
-		t.Fatal(err)
+	// Verify node descriptor is present.
+	expectedDesc := ts.Node().(*Node).Descriptor
+	if !reflect.DeepEqual(initialStatus.Desc, expectedDesc) {
+		t.Errorf("Descriptor does not match expected.\nexpected: %s\nactual: %s", &expectedDesc, &initialStatus.Desc)
 	}
-	if err := kvDB.Put(ctx, rightKey, content); err != nil {
-		t.Fatal(err)
+
+	// Verify initial replica count.
+	store1Metrics := getStoreMetrics(initialStatus, roachpb.StoreID(1))
+	if int(store1Metrics["replicas"]) != initialRanges {
+		t.Errorf("expected %d replicas, got %v", initialRanges, store1Metrics["replicas"])
 	}
 
-	// Increment metrics on the node
-	expectedNodeStatus.Metrics["exec.success"] += 2
+	// ========================================
+	// Put some data into the K/V store and confirm livecount increases.
+	// ========================================
+	// We use SucceedsSoon with unique keys each attempt to handle rare cases
+	// where background activity (GC, intent resolution) might temporarily
+	// decrease livecount. By writing unique keys each attempt, we guarantee
+	// eventual progress.
+	attempt := 0
+	testutils.SucceedsSoon(t, func() error {
+		attempt++
+		forceWriteStatus()
+		beforeStatus := readNodeStatus()
+		beforeLivecount := getStoreMetrics(beforeStatus, roachpb.StoreID(1))["livecount"]
+
+		// Write unique keys each attempt to guarantee livecount increases.
+		leftKey := fmt.Sprintf("a-attempt-%d", attempt)
+		rightKey := fmt.Sprintf("c-attempt-%d", attempt)
+		if err := kvDB.Put(ctx, leftKey, content); err != nil {
+			return err
+		}
+		if err := kvDB.Put(ctx, rightKey, content); err != nil {
+			return err
+		}
 
-	// Increment metrics on the first store.
-	store1 := expectedStoreStatuses[roachpb.StoreID(1)].Metrics
-	store1["livecount"]++
-	store1["keycount"]++
-	store1["valcount"]++
-	store1["livebytes"]++
-	store1["keybytes"]++
-	store1["valbytes"]++
+		forceWriteStatus()
+		afterStatus := readNodeStatus()
+		afterLivecount := getStoreMetrics(afterStatus, roachpb.StoreID(1))["livecount"]
 
-	forceWriteStatus()
-	expectedNodeStatus = compareNodeStatus(t, ts, expectedNodeStatus, 2)
-	for _, s := range expectedNodeStatus.StoreStatuses {
-		expectedStoreStatuses[s.Desc.StoreID] = s
-	}
+		if afterLivecount < beforeLivecount+2 {
+			return errors.Errorf("livecount did not increase by at least 2 after writing keys: before=%v, after=%v (attempt %d)",
+				beforeLivecount, afterLivecount, attempt)
+		}
+		return nil
+	})
 
 	// ========================================
-	// Perform an admin split and verify that status is updated.
+	// Perform an admin split and verify that replica count increases.
 	// ========================================
+	forceWriteStatus()
+	beforeSplitStatus := readNodeStatus()
+	beforeReplicas := getStoreMetrics(beforeSplitStatus, roachpb.StoreID(1))["replicas"]
 
-	// Split the range.
-	if err := kvDB.AdminSplit(
-		context.Background(),
-		splitKey,
-		hlc.MaxTimestamp, /* expirationTime */
-	); err != nil {
+	// Split the range at "b".
+	splitKey := "b"
+	if err := kvDB.AdminSplit(ctx, splitKey, hlc.MaxTimestamp /* expirationTime */); err != nil {
 		t.Fatal(err)
 	}
 
-	// Write on both sides of the split to ensure that the raft machinery
-	// is running.
-	if err := kvDB.Put(ctx, leftKey, content); err != nil {
+	// Write on both sides of the split to ensure that the raft machinery is running.
+	if err := kvDB.Put(ctx, "a", content); err != nil {
 		t.Fatal(err)
 	}
-	if err := kvDB.Put(ctx, rightKey, content); err != nil {
+	if err := kvDB.Put(ctx, "c", content); err != nil {
 		t.Fatal(err)
 	}
 
-	// Increment metrics on the node
-	expectedNodeStatus.Metrics["exec.success"] += 2
-
-	// Increment metrics on the first store.
-	store1 = expectedStoreStatuses[roachpb.StoreID(1)].Metrics
-	store1["replicas"]++
-	store1["replicas.leaders"]++
-	store1["replicas.leaseholders"]++
-	store1["ranges"]++
-
+	// Verify replica count increased. Since the split queue is disabled,
+	// the only splits are those we explicitly perform, so this is stable.
 	forceWriteStatus()
-	expectedNodeStatus = compareNodeStatus(t, ts, expectedNodeStatus, 3)
-	for _, s := range expectedNodeStatus.StoreStatuses {
-		expectedStoreStatuses[s.Desc.StoreID] = s
+	afterSplitStatus := readNodeStatus()
+	afterReplicas := getStoreMetrics(afterSplitStatus, roachpb.StoreID(1))["replicas"]
+
+	if afterReplicas <= beforeReplicas {
+		t.Errorf("replica count did not increase after split: before=%v, after=%v",
+			beforeReplicas, afterReplicas)
 	}
 }