@@ -340,126 +340,6 @@ func TestCorruptedClusterID(t *testing.T) {
340340 }
341341}
342342
343- // compareNodeStatus ensures that the actual node status for the passed in
344- // node is updated correctly. It checks that the Node Descriptor, StoreIDs,
345- // RangeCount, StartedAt, ReplicatedRangeCount and are exactly correct and that
346- // the bytes and counts for Live, Key and Val are at least the expected value.
347- // And that UpdatedAt has increased.
348- // The latest actual stats are returned.
349- func compareNodeStatus (
350- t * testing.T ,
351- ts serverutils.TestServerInterface ,
352- expectedNodeStatus * statuspb.NodeStatus ,
353- testNumber int ,
354- ) * statuspb.NodeStatus {
355- // ========================================
356- // Read NodeStatus from server and validate top-level fields.
357- // ========================================
358- nodeStatusKey := keys .NodeStatusKey (ts .NodeID ())
359- nodeStatus := & statuspb.NodeStatus {}
360- if err := ts .DB ().GetProto (context .Background (), nodeStatusKey , nodeStatus ); err != nil {
361- t .Fatalf ("%d: failure getting node status: %s" , testNumber , err )
362- }
363-
364- // Descriptor values should be exactly equal to expected.
365- if a , e := nodeStatus .Desc , expectedNodeStatus .Desc ; ! reflect .DeepEqual (a , e ) {
366- t .Errorf ("%d: Descriptor does not match expected.\n expected: %s\n actual: %s" , testNumber , & e , & a )
367- }
368-
369- // ========================================
370- // Ensure all expected stores are represented in the node status.
371- // ========================================
372- storesToMap := func (ns * statuspb.NodeStatus ) map [roachpb.StoreID ]statuspb.StoreStatus {
373- strMap := make (map [roachpb.StoreID ]statuspb.StoreStatus , len (ns .StoreStatuses ))
374- for _ , str := range ns .StoreStatuses {
375- strMap [str .Desc .StoreID ] = str
376- }
377- return strMap
378- }
379- actualStores := storesToMap (nodeStatus )
380- expectedStores := storesToMap (expectedNodeStatus )
381-
382- if a , e := len (actualStores ), len (expectedStores ); a != e {
383- t .Errorf ("%d: actual status contained %d stores, expected %d" , testNumber , a , e )
384- }
385- for key := range expectedStores {
386- if _ , ok := actualStores [key ]; ! ok {
387- t .Errorf ("%d: actual node status did not contain expected store %d" , testNumber , key )
388- }
389- }
390- if t .Failed () {
391- t .FailNow ()
392- }
393-
394- // ========================================
395- // Ensure all metric sets (node and store level) are consistent with
396- // expected status.
397- // ========================================
398-
399- // CompareMetricMaps accepts an actual and expected metric maps, along with
400- // two lists of string keys. For metrics with keys in the 'equal' map, the
401- // actual value must be equal to the expected value. For keys in the
402- // 'greater' map, the actual value must be greater than or equal to the
403- // expected value.
404- compareMetricMaps := func (actual , expected map [string ]float64 , equal , greater []string ) {
405- // Make sure the actual value map contains all values in expected map.
406- for key := range expected {
407- if _ , ok := actual [key ]; ! ok {
408- t .Errorf ("%d: actual node status did not contain expected metric %s" , testNumber , key )
409- }
410- }
411- if t .Failed () {
412- return
413- }
414-
415- // For each equal key, ensure that the actual value is equal to expected
416- // key.
417- for _ , key := range equal {
418- if _ , ok := actual [key ]; ! ok {
419- t .Errorf ("%d, actual node status did not contain expected 'equal' metric key %s" , testNumber , key )
420- continue
421- }
422- if a , e := actual [key ], expected [key ]; a != e {
423- t .Errorf ("%d: %s does not match expected value.\n Expected %f, Actual %f" , testNumber , key , e , a )
424- }
425- }
426- for _ , key := range greater {
427- if _ , ok := actual [key ]; ! ok {
428- t .Errorf ("%d: actual node status did not contain expected 'greater' metric key %s" , testNumber , key )
429- continue
430- }
431- if a , e := actual [key ], expected [key ]; a < e {
432- t .Errorf ("%d: %s is not greater than or equal to expected value.\n Expected %f, Actual %f" , testNumber , key , e , a )
433- }
434- }
435- }
436-
437- compareMetricMaps (nodeStatus .Metrics , expectedNodeStatus .Metrics , nil , []string {
438- "exec.success" ,
439- "exec.error" ,
440- })
441-
442- for key := range actualStores {
443- // Directly verify a subset of metrics which have predictable output.
444- compareMetricMaps (actualStores [key ].Metrics , expectedStores [key ].Metrics ,
445- []string {
446- "replicas" ,
447- "replicas.leaseholders" ,
448- },
449- []string {
450- "livecount" ,
451- "keycount" ,
452- "valcount" ,
453- })
454- }
455-
456- if t .Failed () {
457- t .FailNow ()
458- }
459-
460- return nodeStatus
461- }
462-
463343// TestNodeEmitsDiskSlowEvents verifies that disk slow events are emitted for
464344// each store that is slow.
465345func TestNodeEmitsDiskSlowEvents (t * testing.T ) {
@@ -612,6 +492,11 @@ func TestNodeStatusWritten(t *testing.T) {
612492 ts , _ , kvDB := serverutils .StartServer (t , base.TestServerArgs {
613493 DefaultTestTenant : base .TestIsSpecificToStorageLayerAndNeedsASystemTenant ,
614494 DisableEventLog : true ,
495+ Knobs : base.TestingKnobs {
496+ Store : & kvserver.StoreTestingKnobs {
497+ DisableSplitQueue : true ,
498+ },
499+ },
615500 })
616501 defer ts .Stopper ().Stop (context .Background ())
617502 ctx := context .Background ()
@@ -625,7 +510,6 @@ func TestNodeStatusWritten(t *testing.T) {
625510 s .WaitForInit ()
626511
627512 content := "junk"
628- leftKey := "a"
629513
630514 // Scan over all keys to "wake up" all replicas (force a lease holder election).
631515 if _ , err := kvDB .Scan (context .Background (), keys .MetaMax , keys .MaxKey , 0 ); err != nil {
@@ -648,50 +532,6 @@ func TestNodeStatusWritten(t *testing.T) {
648532 return nil
649533 })
650534
651- // ========================================
652- // Construct an initial expectation for NodeStatus to compare to the first
653- // status produced by the server.
654- // ========================================
655- expectedNodeStatus := & statuspb.NodeStatus {
656- Desc : ts .Node ().(* Node ).Descriptor ,
657- StartedAt : 0 ,
658- UpdatedAt : 0 ,
659- Metrics : map [string ]float64 {
660- "exec.success" : 0 ,
661- "exec.error" : 0 ,
662- },
663- }
664-
665- expectedStoreStatuses := make (map [roachpb.StoreID ]statuspb.StoreStatus )
666- if err := ts .GetStores ().(* kvserver.Stores ).VisitStores (func (s * kvserver.Store ) error {
667- desc , err := s .Descriptor (ctx , false /* useCached */ )
668- if err != nil {
669- t .Fatal (err )
670- }
671- expectedReplicas := 0
672- if s .StoreID () == roachpb .StoreID (1 ) {
673- expectedReplicas = initialRanges
674- }
675- stat := statuspb.StoreStatus {
676- Desc : * desc ,
677- Metrics : map [string ]float64 {
678- "replicas" : float64 (expectedReplicas ),
679- "replicas.leaseholders" : float64 (expectedReplicas ),
680- "livebytes" : 0 ,
681- "keybytes" : 0 ,
682- "valbytes" : 0 ,
683- "livecount" : 0 ,
684- "keycount" : 0 ,
685- "valcount" : 0 ,
686- },
687- }
688- expectedNodeStatus .StoreStatuses = append (expectedNodeStatus .StoreStatuses , stat )
689- expectedStoreStatuses [s .StoreID ()] = stat
690- return nil
691- }); err != nil {
692- t .Fatal (err )
693- }
694-
695535 // Function to force summaries to be written synchronously, including all
696536 // data currently in the event pipeline. Only one of the stores has
697537 // replicas, so there are no concerns related to quorum writes; if there
@@ -707,82 +547,110 @@ func TestNodeStatusWritten(t *testing.T) {
707547 }
708548 }
709549
710- // Verify initial status.
711- forceWriteStatus ()
712- expectedNodeStatus = compareNodeStatus (t , ts , expectedNodeStatus , 1 )
713- for _ , s := range expectedNodeStatus .StoreStatuses {
714- expectedStoreStatuses [s .Desc .StoreID ] = s
550+ // Helper to read current node status.
551+ readNodeStatus := func () * statuspb.NodeStatus {
552+ nodeStatusKey := keys .NodeStatusKey (ts .NodeID ())
553+ nodeStatus := & statuspb.NodeStatus {}
554+ if err := ts .DB ().GetProto (ctx , nodeStatusKey , nodeStatus ); err != nil {
555+ t .Fatalf ("failure getting node status: %s" , err )
556+ }
557+ return nodeStatus
558+ }
559+
560+ // Helper to get store metrics from node status.
561+ getStoreMetrics := func (ns * statuspb.NodeStatus , storeID roachpb.StoreID ) map [string ]float64 {
562+ for _ , ss := range ns .StoreStatuses {
563+ if ss .Desc .StoreID == storeID {
564+ return ss .Metrics
565+ }
566+ }
567+ t .Fatalf ("store %d not found in node status" , storeID )
568+ return nil
715569 }
716570
717571 // ========================================
718- // Put some data into the K/V store and confirm change to status .
572+ // Verify initial status has expected structure .
719573 // ========================================
574+ forceWriteStatus ()
575+ initialStatus := readNodeStatus ()
720576
721- splitKey := "b"
722- rightKey := "c"
723-
724- // Write some values left and right of the proposed split key.
725- if err := kvDB .Put (ctx , leftKey , content ); err != nil {
726- t .Fatal (err )
577+ // Verify node descriptor is present.
578+ expectedDesc := ts .Node ().(* Node ).Descriptor
579+ if ! reflect .DeepEqual (initialStatus .Desc , expectedDesc ) {
580+ t .Errorf ("Descriptor does not match expected.\n expected: %s\n actual: %s" , & expectedDesc , & initialStatus .Desc )
727581 }
728- if err := kvDB .Put (ctx , rightKey , content ); err != nil {
729- t .Fatal (err )
582+
583+ // Verify initial replica count.
584+ store1Metrics := getStoreMetrics (initialStatus , roachpb .StoreID (1 ))
585+ if int (store1Metrics ["replicas" ]) != initialRanges {
586+ t .Errorf ("expected %d replicas, got %v" , initialRanges , store1Metrics ["replicas" ])
730587 }
731588
732- // Increment metrics on the node
733- expectedNodeStatus .Metrics ["exec.success" ] += 2
589+ // ========================================
590+ // Put some data into the K/V store and confirm livecount increases.
591+ // ========================================
592+ // We use SucceedsSoon with unique keys each attempt to handle rare cases
593+ // where background activity (GC, intent resolution) might temporarily
594+ // decrease livecount. By writing unique keys each attempt, we guarantee
595+ // eventual progress.
596+ attempt := 0
597+ testutils .SucceedsSoon (t , func () error {
598+ attempt ++
599+ forceWriteStatus ()
600+ beforeStatus := readNodeStatus ()
601+ beforeLivecount := getStoreMetrics (beforeStatus , roachpb .StoreID (1 ))["livecount" ]
602+
603+ // Write unique keys each attempt to guarantee livecount increases.
604+ leftKey := fmt .Sprintf ("a-attempt-%d" , attempt )
605+ rightKey := fmt .Sprintf ("c-attempt-%d" , attempt )
606+ if err := kvDB .Put (ctx , leftKey , content ); err != nil {
607+ return err
608+ }
609+ if err := kvDB .Put (ctx , rightKey , content ); err != nil {
610+ return err
611+ }
734612
735- // Increment metrics on the first store.
736- store1 := expectedStoreStatuses [roachpb .StoreID (1 )].Metrics
737- store1 ["livecount" ]++
738- store1 ["keycount" ]++
739- store1 ["valcount" ]++
740- store1 ["livebytes" ]++
741- store1 ["keybytes" ]++
742- store1 ["valbytes" ]++
613+ forceWriteStatus ()
614+ afterStatus := readNodeStatus ()
615+ afterLivecount := getStoreMetrics (afterStatus , roachpb .StoreID (1 ))["livecount" ]
743616
744- forceWriteStatus ()
745- expectedNodeStatus = compareNodeStatus (t , ts , expectedNodeStatus , 2 )
746- for _ , s := range expectedNodeStatus .StoreStatuses {
747- expectedStoreStatuses [s .Desc .StoreID ] = s
748- }
617+ if afterLivecount < beforeLivecount + 2 {
618+ return errors .Errorf ("livecount did not increase by at least 2 after writing keys: before=%v, after=%v (attempt %d)" ,
619+ beforeLivecount , afterLivecount , attempt )
620+ }
621+ return nil
622+ })
749623
750624 // ========================================
751- // Perform an admin split and verify that status is updated .
625+ // Perform an admin split and verify that replica count increases .
752626 // ========================================
627+ forceWriteStatus ()
628+ beforeSplitStatus := readNodeStatus ()
629+ beforeReplicas := getStoreMetrics (beforeSplitStatus , roachpb .StoreID (1 ))["replicas" ]
753630
754- // Split the range.
755- if err := kvDB .AdminSplit (
756- context .Background (),
757- splitKey ,
758- hlc .MaxTimestamp , /* expirationTime */
759- ); err != nil {
631+ // Split the range at "b".
632+ splitKey := "b"
633+ if err := kvDB .AdminSplit (ctx , splitKey , hlc .MaxTimestamp /* expirationTime */ ); err != nil {
760634 t .Fatal (err )
761635 }
762636
763- // Write on both sides of the split to ensure that the raft machinery
764- // is running.
765- if err := kvDB .Put (ctx , leftKey , content ); err != nil {
637+ // Write on both sides of the split to ensure that the raft machinery is running.
638+ if err := kvDB .Put (ctx , "a" , content ); err != nil {
766639 t .Fatal (err )
767640 }
768- if err := kvDB .Put (ctx , rightKey , content ); err != nil {
641+ if err := kvDB .Put (ctx , "c" , content ); err != nil {
769642 t .Fatal (err )
770643 }
771644
772- // Increment metrics on the node
773- expectedNodeStatus .Metrics ["exec.success" ] += 2
774-
775- // Increment metrics on the first store.
776- store1 = expectedStoreStatuses [roachpb .StoreID (1 )].Metrics
777- store1 ["replicas" ]++
778- store1 ["replicas.leaders" ]++
779- store1 ["replicas.leaseholders" ]++
780- store1 ["ranges" ]++
781-
645+ // Verify replica count increased. Since the split queue is disabled,
646+ // the only splits are those we explicitly perform, so this is stable.
782647 forceWriteStatus ()
783- expectedNodeStatus = compareNodeStatus (t , ts , expectedNodeStatus , 3 )
784- for _ , s := range expectedNodeStatus .StoreStatuses {
785- expectedStoreStatuses [s .Desc .StoreID ] = s
648+ afterSplitStatus := readNodeStatus ()
649+ afterReplicas := getStoreMetrics (afterSplitStatus , roachpb .StoreID (1 ))["replicas" ]
650+
651+ if afterReplicas <= beforeReplicas {
652+ t .Errorf ("replica count did not increase after split: before=%v, after=%v" ,
653+ beforeReplicas , afterReplicas )
786654 }
787655}
788656
0 commit comments