Skip to content

Commit b4a64c1

Browse files
committed
server: make TestNodeStatusWritten robust against background activity
The test was flaky because it asserted exact metric values which could be affected by background system activity (GC, intent resolution, migrations). Rewrite the test to use a simpler, more robust approach: - Use testutils.SucceedsSoon with unique keys per attempt to verify that livecount increases after writes - Disable the split queue to ensure replica count only changes due to our explicit AdminSplit - Remove the complex compareNodeStatus helper in favor of direct assertions Fixes: #159662 Release note: None
1 parent 6c30c24 commit b4a64c1

File tree

1 file changed

+86
-218
lines changed

1 file changed

+86
-218
lines changed

pkg/server/node_test.go

Lines changed: 86 additions & 218 deletions
Original file line numberDiff line numberDiff line change
@@ -340,126 +340,6 @@ func TestCorruptedClusterID(t *testing.T) {
340340
}
341341
}
342342

343-
// compareNodeStatus ensures that the actual node status for the passed in
344-
// node is updated correctly. It checks that the Node Descriptor, StoreIDs,
345-
// RangeCount, StartedAt, ReplicatedRangeCount and are exactly correct and that
346-
// the bytes and counts for Live, Key and Val are at least the expected value.
347-
// And that UpdatedAt has increased.
348-
// The latest actual stats are returned.
349-
func compareNodeStatus(
350-
t *testing.T,
351-
ts serverutils.TestServerInterface,
352-
expectedNodeStatus *statuspb.NodeStatus,
353-
testNumber int,
354-
) *statuspb.NodeStatus {
355-
// ========================================
356-
// Read NodeStatus from server and validate top-level fields.
357-
// ========================================
358-
nodeStatusKey := keys.NodeStatusKey(ts.NodeID())
359-
nodeStatus := &statuspb.NodeStatus{}
360-
if err := ts.DB().GetProto(context.Background(), nodeStatusKey, nodeStatus); err != nil {
361-
t.Fatalf("%d: failure getting node status: %s", testNumber, err)
362-
}
363-
364-
// Descriptor values should be exactly equal to expected.
365-
if a, e := nodeStatus.Desc, expectedNodeStatus.Desc; !reflect.DeepEqual(a, e) {
366-
t.Errorf("%d: Descriptor does not match expected.\nexpected: %s\nactual: %s", testNumber, &e, &a)
367-
}
368-
369-
// ========================================
370-
// Ensure all expected stores are represented in the node status.
371-
// ========================================
372-
storesToMap := func(ns *statuspb.NodeStatus) map[roachpb.StoreID]statuspb.StoreStatus {
373-
strMap := make(map[roachpb.StoreID]statuspb.StoreStatus, len(ns.StoreStatuses))
374-
for _, str := range ns.StoreStatuses {
375-
strMap[str.Desc.StoreID] = str
376-
}
377-
return strMap
378-
}
379-
actualStores := storesToMap(nodeStatus)
380-
expectedStores := storesToMap(expectedNodeStatus)
381-
382-
if a, e := len(actualStores), len(expectedStores); a != e {
383-
t.Errorf("%d: actual status contained %d stores, expected %d", testNumber, a, e)
384-
}
385-
for key := range expectedStores {
386-
if _, ok := actualStores[key]; !ok {
387-
t.Errorf("%d: actual node status did not contain expected store %d", testNumber, key)
388-
}
389-
}
390-
if t.Failed() {
391-
t.FailNow()
392-
}
393-
394-
// ========================================
395-
// Ensure all metric sets (node and store level) are consistent with
396-
// expected status.
397-
// ========================================
398-
399-
// CompareMetricMaps accepts an actual and expected metric maps, along with
400-
// two lists of string keys. For metrics with keys in the 'equal' map, the
401-
// actual value must be equal to the expected value. For keys in the
402-
// 'greater' map, the actual value must be greater than or equal to the
403-
// expected value.
404-
compareMetricMaps := func(actual, expected map[string]float64, equal, greater []string) {
405-
// Make sure the actual value map contains all values in expected map.
406-
for key := range expected {
407-
if _, ok := actual[key]; !ok {
408-
t.Errorf("%d: actual node status did not contain expected metric %s", testNumber, key)
409-
}
410-
}
411-
if t.Failed() {
412-
return
413-
}
414-
415-
// For each equal key, ensure that the actual value is equal to expected
416-
// key.
417-
for _, key := range equal {
418-
if _, ok := actual[key]; !ok {
419-
t.Errorf("%d, actual node status did not contain expected 'equal' metric key %s", testNumber, key)
420-
continue
421-
}
422-
if a, e := actual[key], expected[key]; a != e {
423-
t.Errorf("%d: %s does not match expected value.\nExpected %f, Actual %f", testNumber, key, e, a)
424-
}
425-
}
426-
for _, key := range greater {
427-
if _, ok := actual[key]; !ok {
428-
t.Errorf("%d: actual node status did not contain expected 'greater' metric key %s", testNumber, key)
429-
continue
430-
}
431-
if a, e := actual[key], expected[key]; a < e {
432-
t.Errorf("%d: %s is not greater than or equal to expected value.\nExpected %f, Actual %f", testNumber, key, e, a)
433-
}
434-
}
435-
}
436-
437-
compareMetricMaps(nodeStatus.Metrics, expectedNodeStatus.Metrics, nil, []string{
438-
"exec.success",
439-
"exec.error",
440-
})
441-
442-
for key := range actualStores {
443-
// Directly verify a subset of metrics which have predictable output.
444-
compareMetricMaps(actualStores[key].Metrics, expectedStores[key].Metrics,
445-
[]string{
446-
"replicas",
447-
"replicas.leaseholders",
448-
},
449-
[]string{
450-
"livecount",
451-
"keycount",
452-
"valcount",
453-
})
454-
}
455-
456-
if t.Failed() {
457-
t.FailNow()
458-
}
459-
460-
return nodeStatus
461-
}
462-
463343
// TestNodeEmitsDiskSlowEvents verifies that disk slow events are emitted for
464344
// each store that is slow.
465345
func TestNodeEmitsDiskSlowEvents(t *testing.T) {
@@ -612,6 +492,11 @@ func TestNodeStatusWritten(t *testing.T) {
612492
ts, _, kvDB := serverutils.StartServer(t, base.TestServerArgs{
613493
DefaultTestTenant: base.TestIsSpecificToStorageLayerAndNeedsASystemTenant,
614494
DisableEventLog: true,
495+
Knobs: base.TestingKnobs{
496+
Store: &kvserver.StoreTestingKnobs{
497+
DisableSplitQueue: true,
498+
},
499+
},
615500
})
616501
defer ts.Stopper().Stop(context.Background())
617502
ctx := context.Background()
@@ -625,7 +510,6 @@ func TestNodeStatusWritten(t *testing.T) {
625510
s.WaitForInit()
626511

627512
content := "junk"
628-
leftKey := "a"
629513

630514
// Scan over all keys to "wake up" all replicas (force a lease holder election).
631515
if _, err := kvDB.Scan(context.Background(), keys.MetaMax, keys.MaxKey, 0); err != nil {
@@ -648,50 +532,6 @@ func TestNodeStatusWritten(t *testing.T) {
648532
return nil
649533
})
650534

651-
// ========================================
652-
// Construct an initial expectation for NodeStatus to compare to the first
653-
// status produced by the server.
654-
// ========================================
655-
expectedNodeStatus := &statuspb.NodeStatus{
656-
Desc: ts.Node().(*Node).Descriptor,
657-
StartedAt: 0,
658-
UpdatedAt: 0,
659-
Metrics: map[string]float64{
660-
"exec.success": 0,
661-
"exec.error": 0,
662-
},
663-
}
664-
665-
expectedStoreStatuses := make(map[roachpb.StoreID]statuspb.StoreStatus)
666-
if err := ts.GetStores().(*kvserver.Stores).VisitStores(func(s *kvserver.Store) error {
667-
desc, err := s.Descriptor(ctx, false /* useCached */)
668-
if err != nil {
669-
t.Fatal(err)
670-
}
671-
expectedReplicas := 0
672-
if s.StoreID() == roachpb.StoreID(1) {
673-
expectedReplicas = initialRanges
674-
}
675-
stat := statuspb.StoreStatus{
676-
Desc: *desc,
677-
Metrics: map[string]float64{
678-
"replicas": float64(expectedReplicas),
679-
"replicas.leaseholders": float64(expectedReplicas),
680-
"livebytes": 0,
681-
"keybytes": 0,
682-
"valbytes": 0,
683-
"livecount": 0,
684-
"keycount": 0,
685-
"valcount": 0,
686-
},
687-
}
688-
expectedNodeStatus.StoreStatuses = append(expectedNodeStatus.StoreStatuses, stat)
689-
expectedStoreStatuses[s.StoreID()] = stat
690-
return nil
691-
}); err != nil {
692-
t.Fatal(err)
693-
}
694-
695535
// Function to force summaries to be written synchronously, including all
696536
// data currently in the event pipeline. Only one of the stores has
697537
// replicas, so there are no concerns related to quorum writes; if there
@@ -707,82 +547,110 @@ func TestNodeStatusWritten(t *testing.T) {
707547
}
708548
}
709549

710-
// Verify initial status.
711-
forceWriteStatus()
712-
expectedNodeStatus = compareNodeStatus(t, ts, expectedNodeStatus, 1)
713-
for _, s := range expectedNodeStatus.StoreStatuses {
714-
expectedStoreStatuses[s.Desc.StoreID] = s
550+
// Helper to read current node status.
551+
readNodeStatus := func() *statuspb.NodeStatus {
552+
nodeStatusKey := keys.NodeStatusKey(ts.NodeID())
553+
nodeStatus := &statuspb.NodeStatus{}
554+
if err := ts.DB().GetProto(ctx, nodeStatusKey, nodeStatus); err != nil {
555+
t.Fatalf("failure getting node status: %s", err)
556+
}
557+
return nodeStatus
558+
}
559+
560+
// Helper to get store metrics from node status.
561+
getStoreMetrics := func(ns *statuspb.NodeStatus, storeID roachpb.StoreID) map[string]float64 {
562+
for _, ss := range ns.StoreStatuses {
563+
if ss.Desc.StoreID == storeID {
564+
return ss.Metrics
565+
}
566+
}
567+
t.Fatalf("store %d not found in node status", storeID)
568+
return nil
715569
}
716570

717571
// ========================================
718-
// Put some data into the K/V store and confirm change to status.
572+
// Verify initial status has expected structure.
719573
// ========================================
574+
forceWriteStatus()
575+
initialStatus := readNodeStatus()
720576

721-
splitKey := "b"
722-
rightKey := "c"
723-
724-
// Write some values left and right of the proposed split key.
725-
if err := kvDB.Put(ctx, leftKey, content); err != nil {
726-
t.Fatal(err)
577+
// Verify node descriptor is present.
578+
expectedDesc := ts.Node().(*Node).Descriptor
579+
if !reflect.DeepEqual(initialStatus.Desc, expectedDesc) {
580+
t.Errorf("Descriptor does not match expected.\nexpected: %s\nactual: %s", &expectedDesc, &initialStatus.Desc)
727581
}
728-
if err := kvDB.Put(ctx, rightKey, content); err != nil {
729-
t.Fatal(err)
582+
583+
// Verify initial replica count.
584+
store1Metrics := getStoreMetrics(initialStatus, roachpb.StoreID(1))
585+
if int(store1Metrics["replicas"]) != initialRanges {
586+
t.Errorf("expected %d replicas, got %v", initialRanges, store1Metrics["replicas"])
730587
}
731588

732-
// Increment metrics on the node
733-
expectedNodeStatus.Metrics["exec.success"] += 2
589+
// ========================================
590+
// Put some data into the K/V store and confirm livecount increases.
591+
// ========================================
592+
// We use SucceedsSoon with unique keys each attempt to handle rare cases
593+
// where background activity (GC, intent resolution) might temporarily
594+
// decrease livecount. By writing unique keys each attempt, we guarantee
595+
// eventual progress.
596+
attempt := 0
597+
testutils.SucceedsSoon(t, func() error {
598+
attempt++
599+
forceWriteStatus()
600+
beforeStatus := readNodeStatus()
601+
beforeLivecount := getStoreMetrics(beforeStatus, roachpb.StoreID(1))["livecount"]
602+
603+
// Write unique keys each attempt to guarantee livecount increases.
604+
leftKey := fmt.Sprintf("a-attempt-%d", attempt)
605+
rightKey := fmt.Sprintf("c-attempt-%d", attempt)
606+
if err := kvDB.Put(ctx, leftKey, content); err != nil {
607+
return err
608+
}
609+
if err := kvDB.Put(ctx, rightKey, content); err != nil {
610+
return err
611+
}
734612

735-
// Increment metrics on the first store.
736-
store1 := expectedStoreStatuses[roachpb.StoreID(1)].Metrics
737-
store1["livecount"]++
738-
store1["keycount"]++
739-
store1["valcount"]++
740-
store1["livebytes"]++
741-
store1["keybytes"]++
742-
store1["valbytes"]++
613+
forceWriteStatus()
614+
afterStatus := readNodeStatus()
615+
afterLivecount := getStoreMetrics(afterStatus, roachpb.StoreID(1))["livecount"]
743616

744-
forceWriteStatus()
745-
expectedNodeStatus = compareNodeStatus(t, ts, expectedNodeStatus, 2)
746-
for _, s := range expectedNodeStatus.StoreStatuses {
747-
expectedStoreStatuses[s.Desc.StoreID] = s
748-
}
617+
if afterLivecount < beforeLivecount+2 {
618+
return errors.Errorf("livecount did not increase by at least 2 after writing keys: before=%v, after=%v (attempt %d)",
619+
beforeLivecount, afterLivecount, attempt)
620+
}
621+
return nil
622+
})
749623

750624
// ========================================
751-
// Perform an admin split and verify that status is updated.
625+
// Perform an admin split and verify that replica count increases.
752626
// ========================================
627+
forceWriteStatus()
628+
beforeSplitStatus := readNodeStatus()
629+
beforeReplicas := getStoreMetrics(beforeSplitStatus, roachpb.StoreID(1))["replicas"]
753630

754-
// Split the range.
755-
if err := kvDB.AdminSplit(
756-
context.Background(),
757-
splitKey,
758-
hlc.MaxTimestamp, /* expirationTime */
759-
); err != nil {
631+
// Split the range at "b".
632+
splitKey := "b"
633+
if err := kvDB.AdminSplit(ctx, splitKey, hlc.MaxTimestamp /* expirationTime */); err != nil {
760634
t.Fatal(err)
761635
}
762636

763-
// Write on both sides of the split to ensure that the raft machinery
764-
// is running.
765-
if err := kvDB.Put(ctx, leftKey, content); err != nil {
637+
// Write on both sides of the split to ensure that the raft machinery is running.
638+
if err := kvDB.Put(ctx, "a", content); err != nil {
766639
t.Fatal(err)
767640
}
768-
if err := kvDB.Put(ctx, rightKey, content); err != nil {
641+
if err := kvDB.Put(ctx, "c", content); err != nil {
769642
t.Fatal(err)
770643
}
771644

772-
// Increment metrics on the node
773-
expectedNodeStatus.Metrics["exec.success"] += 2
774-
775-
// Increment metrics on the first store.
776-
store1 = expectedStoreStatuses[roachpb.StoreID(1)].Metrics
777-
store1["replicas"]++
778-
store1["replicas.leaders"]++
779-
store1["replicas.leaseholders"]++
780-
store1["ranges"]++
781-
645+
// Verify replica count increased. Since the split queue is disabled,
646+
// the only splits are those we explicitly perform, so this is stable.
782647
forceWriteStatus()
783-
expectedNodeStatus = compareNodeStatus(t, ts, expectedNodeStatus, 3)
784-
for _, s := range expectedNodeStatus.StoreStatuses {
785-
expectedStoreStatuses[s.Desc.StoreID] = s
648+
afterSplitStatus := readNodeStatus()
649+
afterReplicas := getStoreMetrics(afterSplitStatus, roachpb.StoreID(1))["replicas"]
650+
651+
if afterReplicas <= beforeReplicas {
652+
t.Errorf("replica count did not increase after split: before=%v, after=%v",
653+
beforeReplicas, afterReplicas)
786654
}
787655
}
788656

0 commit comments

Comments
 (0)