Commit f519a08
Merge pull request #159889 from tbg/blathers/backport-release-26.1-159696
release-26.1: server: make TestNodeStatusWritten robust against background activity
2 parents 9d0571b + b4a64c1 commit f519a08

1 file changed: +86 -218 lines changed
pkg/server/node_test.go

Lines changed: 86 additions & 218 deletions
@@ -340,126 +340,6 @@ func TestCorruptedClusterID(t *testing.T) {
 	}
 }
 
-// compareNodeStatus ensures that the actual node status for the passed in
-// node is updated correctly. It checks that the Node Descriptor, StoreIDs,
-// RangeCount, StartedAt, ReplicatedRangeCount and are exactly correct and that
-// the bytes and counts for Live, Key and Val are at least the expected value.
-// And that UpdatedAt has increased.
-// The latest actual stats are returned.
-func compareNodeStatus(
-	t *testing.T,
-	ts serverutils.TestServerInterface,
-	expectedNodeStatus *statuspb.NodeStatus,
-	testNumber int,
-) *statuspb.NodeStatus {
-	// ========================================
-	// Read NodeStatus from server and validate top-level fields.
-	// ========================================
-	nodeStatusKey := keys.NodeStatusKey(ts.NodeID())
-	nodeStatus := &statuspb.NodeStatus{}
-	if err := ts.DB().GetProto(context.Background(), nodeStatusKey, nodeStatus); err != nil {
-		t.Fatalf("%d: failure getting node status: %s", testNumber, err)
-	}
-
-	// Descriptor values should be exactly equal to expected.
-	if a, e := nodeStatus.Desc, expectedNodeStatus.Desc; !reflect.DeepEqual(a, e) {
-		t.Errorf("%d: Descriptor does not match expected.\nexpected: %s\nactual: %s", testNumber, &e, &a)
-	}
-
-	// ========================================
-	// Ensure all expected stores are represented in the node status.
-	// ========================================
-	storesToMap := func(ns *statuspb.NodeStatus) map[roachpb.StoreID]statuspb.StoreStatus {
-		strMap := make(map[roachpb.StoreID]statuspb.StoreStatus, len(ns.StoreStatuses))
-		for _, str := range ns.StoreStatuses {
-			strMap[str.Desc.StoreID] = str
-		}
-		return strMap
-	}
-	actualStores := storesToMap(nodeStatus)
-	expectedStores := storesToMap(expectedNodeStatus)
-
-	if a, e := len(actualStores), len(expectedStores); a != e {
-		t.Errorf("%d: actual status contained %d stores, expected %d", testNumber, a, e)
-	}
-	for key := range expectedStores {
-		if _, ok := actualStores[key]; !ok {
-			t.Errorf("%d: actual node status did not contain expected store %d", testNumber, key)
-		}
-	}
-	if t.Failed() {
-		t.FailNow()
-	}
-
-	// ========================================
-	// Ensure all metric sets (node and store level) are consistent with
-	// expected status.
-	// ========================================
-
-	// CompareMetricMaps accepts an actual and expected metric maps, along with
-	// two lists of string keys. For metrics with keys in the 'equal' map, the
-	// actual value must be equal to the expected value. For keys in the
-	// 'greater' map, the actual value must be greater than or equal to the
-	// expected value.
-	compareMetricMaps := func(actual, expected map[string]float64, equal, greater []string) {
-		// Make sure the actual value map contains all values in expected map.
-		for key := range expected {
-			if _, ok := actual[key]; !ok {
-				t.Errorf("%d: actual node status did not contain expected metric %s", testNumber, key)
-			}
-		}
-		if t.Failed() {
-			return
-		}
-
-		// For each equal key, ensure that the actual value is equal to expected
-		// key.
-		for _, key := range equal {
-			if _, ok := actual[key]; !ok {
-				t.Errorf("%d, actual node status did not contain expected 'equal' metric key %s", testNumber, key)
-				continue
-			}
-			if a, e := actual[key], expected[key]; a != e {
-				t.Errorf("%d: %s does not match expected value.\nExpected %f, Actual %f", testNumber, key, e, a)
-			}
-		}
-		for _, key := range greater {
-			if _, ok := actual[key]; !ok {
-				t.Errorf("%d: actual node status did not contain expected 'greater' metric key %s", testNumber, key)
-				continue
-			}
-			if a, e := actual[key], expected[key]; a < e {
-				t.Errorf("%d: %s is not greater than or equal to expected value.\nExpected %f, Actual %f", testNumber, key, e, a)
-			}
-		}
-	}
-
-	compareMetricMaps(nodeStatus.Metrics, expectedNodeStatus.Metrics, nil, []string{
-		"exec.success",
-		"exec.error",
-	})
-
-	for key := range actualStores {
-		// Directly verify a subset of metrics which have predictable output.
-		compareMetricMaps(actualStores[key].Metrics, expectedStores[key].Metrics,
-			[]string{
-				"replicas",
-				"replicas.leaseholders",
-			},
-			[]string{
-				"livecount",
-				"keycount",
-				"valcount",
-			})
-	}
-
-	if t.Failed() {
-		t.FailNow()
-	}
-
-	return nodeStatus
-}
-
 // TestNodeEmitsDiskSlowEvents verifies that disk slow events are emitted for
 // each store that is slow.
 func TestNodeEmitsDiskSlowEvents(t *testing.T) {
@@ -612,6 +492,11 @@ func TestNodeStatusWritten(t *testing.T) {
 	ts, _, kvDB := serverutils.StartServer(t, base.TestServerArgs{
 		DefaultTestTenant: base.TestIsSpecificToStorageLayerAndNeedsASystemTenant,
 		DisableEventLog:   true,
+		Knobs: base.TestingKnobs{
+			Store: &kvserver.StoreTestingKnobs{
+				DisableSplitQueue: true,
+			},
+		},
 	})
 	defer ts.Stopper().Stop(context.Background())
 	ctx := context.Background()
@@ -625,7 +510,6 @@ func TestNodeStatusWritten(t *testing.T) {
 	s.WaitForInit()
 
 	content := "junk"
-	leftKey := "a"
 
 	// Scan over all keys to "wake up" all replicas (force a lease holder election).
 	if _, err := kvDB.Scan(context.Background(), keys.MetaMax, keys.MaxKey, 0); err != nil {
@@ -648,50 +532,6 @@ func TestNodeStatusWritten(t *testing.T) {
 		return nil
 	})
 
-	// ========================================
-	// Construct an initial expectation for NodeStatus to compare to the first
-	// status produced by the server.
-	// ========================================
-	expectedNodeStatus := &statuspb.NodeStatus{
-		Desc:      ts.Node().(*Node).Descriptor,
-		StartedAt: 0,
-		UpdatedAt: 0,
-		Metrics: map[string]float64{
-			"exec.success": 0,
-			"exec.error":   0,
-		},
-	}
-
-	expectedStoreStatuses := make(map[roachpb.StoreID]statuspb.StoreStatus)
-	if err := ts.GetStores().(*kvserver.Stores).VisitStores(func(s *kvserver.Store) error {
-		desc, err := s.Descriptor(ctx, false /* useCached */)
-		if err != nil {
-			t.Fatal(err)
-		}
-		expectedReplicas := 0
-		if s.StoreID() == roachpb.StoreID(1) {
-			expectedReplicas = initialRanges
-		}
-		stat := statuspb.StoreStatus{
-			Desc: *desc,
-			Metrics: map[string]float64{
-				"replicas":              float64(expectedReplicas),
-				"replicas.leaseholders": float64(expectedReplicas),
-				"livebytes":             0,
-				"keybytes":              0,
-				"valbytes":              0,
-				"livecount":             0,
-				"keycount":              0,
-				"valcount":              0,
-			},
-		}
-		expectedNodeStatus.StoreStatuses = append(expectedNodeStatus.StoreStatuses, stat)
-		expectedStoreStatuses[s.StoreID()] = stat
-		return nil
-	}); err != nil {
-		t.Fatal(err)
-	}
-
 	// Function to force summaries to be written synchronously, including all
 	// data currently in the event pipeline. Only one of the stores has
 	// replicas, so there are no concerns related to quorum writes; if there
@@ -707,82 +547,110 @@ func TestNodeStatusWritten(t *testing.T) {
 		}
 	}
 
-	// Verify initial status.
-	forceWriteStatus()
-	expectedNodeStatus = compareNodeStatus(t, ts, expectedNodeStatus, 1)
-	for _, s := range expectedNodeStatus.StoreStatuses {
-		expectedStoreStatuses[s.Desc.StoreID] = s
+	// Helper to read current node status.
+	readNodeStatus := func() *statuspb.NodeStatus {
+		nodeStatusKey := keys.NodeStatusKey(ts.NodeID())
+		nodeStatus := &statuspb.NodeStatus{}
+		if err := ts.DB().GetProto(ctx, nodeStatusKey, nodeStatus); err != nil {
+			t.Fatalf("failure getting node status: %s", err)
+		}
+		return nodeStatus
+	}
+
+	// Helper to get store metrics from node status.
+	getStoreMetrics := func(ns *statuspb.NodeStatus, storeID roachpb.StoreID) map[string]float64 {
+		for _, ss := range ns.StoreStatuses {
+			if ss.Desc.StoreID == storeID {
+				return ss.Metrics
+			}
+		}
+		t.Fatalf("store %d not found in node status", storeID)
+		return nil
 	}
 
 	// ========================================
-	// Put some data into the K/V store and confirm change to status.
+	// Verify initial status has expected structure.
 	// ========================================
+	forceWriteStatus()
+	initialStatus := readNodeStatus()
 
-	splitKey := "b"
-	rightKey := "c"
-
-	// Write some values left and right of the proposed split key.
-	if err := kvDB.Put(ctx, leftKey, content); err != nil {
-		t.Fatal(err)
+	// Verify node descriptor is present.
+	expectedDesc := ts.Node().(*Node).Descriptor
+	if !reflect.DeepEqual(initialStatus.Desc, expectedDesc) {
+		t.Errorf("Descriptor does not match expected.\nexpected: %s\nactual: %s", &expectedDesc, &initialStatus.Desc)
 	}
-	if err := kvDB.Put(ctx, rightKey, content); err != nil {
-		t.Fatal(err)
+
+	// Verify initial replica count.
+	store1Metrics := getStoreMetrics(initialStatus, roachpb.StoreID(1))
+	if int(store1Metrics["replicas"]) != initialRanges {
+		t.Errorf("expected %d replicas, got %v", initialRanges, store1Metrics["replicas"])
 	}
 
-	// Increment metrics on the node
-	expectedNodeStatus.Metrics["exec.success"] += 2
+	// ========================================
+	// Put some data into the K/V store and confirm livecount increases.
+	// ========================================
+	// We use SucceedsSoon with unique keys each attempt to handle rare cases
+	// where background activity (GC, intent resolution) might temporarily
+	// decrease livecount. By writing unique keys each attempt, we guarantee
+	// eventual progress.
+	attempt := 0
+	testutils.SucceedsSoon(t, func() error {
+		attempt++
+		forceWriteStatus()
+		beforeStatus := readNodeStatus()
+		beforeLivecount := getStoreMetrics(beforeStatus, roachpb.StoreID(1))["livecount"]
+
+		// Write unique keys each attempt to guarantee livecount increases.
+		leftKey := fmt.Sprintf("a-attempt-%d", attempt)
+		rightKey := fmt.Sprintf("c-attempt-%d", attempt)
+		if err := kvDB.Put(ctx, leftKey, content); err != nil {
+			return err
+		}
+		if err := kvDB.Put(ctx, rightKey, content); err != nil {
+			return err
+		}
 
-	// Increment metrics on the first store.
-	store1 := expectedStoreStatuses[roachpb.StoreID(1)].Metrics
-	store1["livecount"]++
-	store1["keycount"]++
-	store1["valcount"]++
-	store1["livebytes"]++
-	store1["keybytes"]++
-	store1["valbytes"]++
+		forceWriteStatus()
+		afterStatus := readNodeStatus()
+		afterLivecount := getStoreMetrics(afterStatus, roachpb.StoreID(1))["livecount"]
 
-	forceWriteStatus()
-	expectedNodeStatus = compareNodeStatus(t, ts, expectedNodeStatus, 2)
-	for _, s := range expectedNodeStatus.StoreStatuses {
-		expectedStoreStatuses[s.Desc.StoreID] = s
-	}
+		if afterLivecount < beforeLivecount+2 {
+			return errors.Errorf("livecount did not increase by at least 2 after writing keys: before=%v, after=%v (attempt %d)",
+				beforeLivecount, afterLivecount, attempt)
+		}
+		return nil
+	})
 
 	// ========================================
-	// Perform an admin split and verify that status is updated.
+	// Perform an admin split and verify that replica count increases.
 	// ========================================
+	forceWriteStatus()
+	beforeSplitStatus := readNodeStatus()
+	beforeReplicas := getStoreMetrics(beforeSplitStatus, roachpb.StoreID(1))["replicas"]
 
-	// Split the range.
-	if err := kvDB.AdminSplit(
-		context.Background(),
-		splitKey,
-		hlc.MaxTimestamp, /* expirationTime */
-	); err != nil {
+	// Split the range at "b".
+	splitKey := "b"
+	if err := kvDB.AdminSplit(ctx, splitKey, hlc.MaxTimestamp /* expirationTime */); err != nil {
 		t.Fatal(err)
 	}
 
-	// Write on both sides of the split to ensure that the raft machinery
-	// is running.
-	if err := kvDB.Put(ctx, leftKey, content); err != nil {
+	// Write on both sides of the split to ensure that the raft machinery is running.
+	if err := kvDB.Put(ctx, "a", content); err != nil {
 		t.Fatal(err)
 	}
-	if err := kvDB.Put(ctx, rightKey, content); err != nil {
+	if err := kvDB.Put(ctx, "c", content); err != nil {
 		t.Fatal(err)
 	}
 
-	// Increment metrics on the node
-	expectedNodeStatus.Metrics["exec.success"] += 2
-
-	// Increment metrics on the first store.
-	store1 = expectedStoreStatuses[roachpb.StoreID(1)].Metrics
-	store1["replicas"]++
-	store1["replicas.leaders"]++
-	store1["replicas.leaseholders"]++
-	store1["ranges"]++
-
+	// Verify replica count increased. Since the split queue is disabled,
+	// the only splits are those we explicitly perform, so this is stable.
 	forceWriteStatus()
-	expectedNodeStatus = compareNodeStatus(t, ts, expectedNodeStatus, 3)
-	for _, s := range expectedNodeStatus.StoreStatuses {
-		expectedStoreStatuses[s.Desc.StoreID] = s
+	afterSplitStatus := readNodeStatus()
+	afterReplicas := getStoreMetrics(afterSplitStatus, roachpb.StoreID(1))["replicas"]
+
+	if afterReplicas <= beforeReplicas {
+		t.Errorf("replica count did not increase after split: before=%v, after=%v",
+			beforeReplicas, afterReplicas)
 	}
 }
 