
Commit bc1faf2

storage_api: fix TestDecommissionSelf flake
TestDecommissionSelf was flaking with timeouts waiting for decommissioned nodes to observe their own DECOMMISSIONED status. The test would fail when node 4 (one of the decommissioned nodes) never saw its liveness record update to DECOMMISSIONED within the 5-second timeout.

The likely root cause is a race condition in how decommission status propagates: when a node is marked as decommissioned, the updated liveness record is gossiped to all nodes. However, if other nodes receive this gossip update before the decommissioned node does, they will write a tombstone to local storage and subsequently reject all RPCs from the decommissioned node, including gossip messages. This can prevent the decommissioned node from ever learning about its own status change.

This commit fixes the test by only verifying the cluster state from the perspective of non-decommissioned nodes. We now assert that:

1. Active nodes see themselves as ACTIVE.
2. Active nodes see the decommissioned nodes as DECOMMISSIONED.

We no longer attempt to verify that decommissioned nodes observe their own status, since this is not guaranteed due to the gossip/tombstone race.

Fixes cockroachdb#156402.
Fixes cockroachdb#156104.
Fixes cockroachdb#154474.

Release note: None
Epic: None
1 parent e00f221 commit bc1faf2
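
To make the failure mode concrete, below is a minimal, self-contained toy model of the race described in the commit message. None of these types or helpers exist in CockroachDB (node, learn, and pull are made up for illustration); the sketch only captures the ordering problem: once a peer tombstones the decommissioned node, that node can no longer receive the very update that would tell it about its own status.

// raceillustration.go: a toy, hypothetical model of the gossip/tombstone race.
// Nothing here is CockroachDB code; it only demonstrates the ordering problem.
package main

import "fmt"

type membership string

const (
	active         membership = "ACTIVE"
	decommissioned membership = "DECOMMISSIONED"
)

// node keeps a local view of each node's membership plus a tombstone set of
// peers whose RPCs it refuses once it knows they are decommissioned.
type node struct {
	id         int
	view       map[int]membership
	tombstoned map[int]bool
}

func newNode(id int, peers ...int) *node {
	n := &node{id: id, view: map[int]membership{id: active}, tombstoned: map[int]bool{}}
	for _, p := range peers {
		n.view[p] = active
	}
	return n
}

// learn records a gossiped liveness update and tombstones decommissioned peers.
func (n *node) learn(target int, status membership) {
	n.view[target] = status
	if status == decommissioned && target != n.id {
		n.tombstoned[target] = true
	}
}

// pull models n asking peer for peer's view of target. The peer refuses the
// RPC outright if it has already tombstoned n.
func (n *node) pull(peer *node, target int) bool {
	if peer.tombstoned[n.id] {
		return false
	}
	n.view[target] = peer.view[target]
	return true
}

func main() {
	n1 := newNode(1, 4) // stays active
	n4 := newNode(4, 1) // gets decommissioned

	// n1 hears via gossip that n4 was decommissioned before n4 itself does,
	// and immediately writes a tombstone for n4.
	n1.learn(4, decommissioned)

	// n4 now tries to refresh its liveness view from n1, but n1 rejects the
	// RPC, so n4 never observes its own DECOMMISSIONED status.
	if ok := n4.pull(n1, 4); !ok {
		fmt.Printf("n1 rejected n4's request; n4 still sees itself as %s\n", n4.view[4])
	}
	fmt.Printf("n1 sees n4 as %s\n", n1.view[4])
}

Running this prints n1's view of n4 as DECOMMISSIONED while n4 still believes it is ACTIVE, which is the situation the reworked test no longer asserts against.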


pkg/server/storage_api/decommission_test.go

Lines changed: 27 additions & 8 deletions
@@ -702,20 +702,39 @@ func TestDecommissionSelf(t *testing.T) {
 	require.NoError(t, err)
 	require.Empty(t, resp.Status)
 
-	// The nodes should now have been (or soon become) decommissioned.
+	// The nodes should now have been (or soon become) decommissioned. In a cruel
+	// twist of fate, the decommissioned nodes may not find out about that,
+	// however. This is because the other nodes may learn that (say) n4 is
+	// decommissioned before n4 does, and will block all communication with it,
+	// which includes receiving an updated liveness record. So we only verify that
+	// the non-decommissioned nodes see the decommissioned nodes as such, but
+	// don't verify the decommissioned nodes' own view of their (or anyone's, really)
+	// liveness.
 	for i := 0; i < tc.NumServers(); i++ {
 		srv := tc.Server(i)
-		expect := livenesspb.MembershipStatus_ACTIVE
+		var omit bool
 		for _, nodeID := range decomNodeIDs {
 			if srv.NodeID() == nodeID {
-				expect = livenesspb.MembershipStatus_DECOMMISSIONED
-				break
+				omit = true
 			}
 		}
-		require.Eventually(t, func() bool {
-			liveness, ok := srv.NodeLiveness().(*liveness.NodeLiveness).GetLiveness(srv.NodeID())
-			return ok && liveness.Membership == expect
-		}, 5*time.Second, 100*time.Millisecond, "timed out waiting for node %v status %v", i, expect)
+		if omit {
+			continue
+		}
+		nl := srv.NodeLiveness().(*liveness.NodeLiveness)
+		testutils.SucceedsSoon(t, func() error {
+			entry, ok := nl.GetLiveness(srv.NodeID())
+			if !ok || entry.Membership != livenesspb.MembershipStatus_ACTIVE {
+				return errors.Errorf("n%d not ACTIVE: %v", srv.NodeID(), entry.Membership)
+			}
+			for _, nodeID := range decomNodeIDs {
+				entry, ok := nl.GetLiveness(nodeID)
+				if !ok || entry.Membership != livenesspb.MembershipStatus_DECOMMISSIONED {
+					return errors.Errorf("n%d not DECOMMISSIONED: %v", srv.NodeID(), entry.Membership)
+				}
+			}
+			return nil
+		})
 	}
 }
 
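A note on the design choice in the new assertion: the old check used require.Eventually with a hard 5-second deadline and a boolean condition, while the new one uses testutils.SucceedsSoon, which retries a func() error until it returns nil and surfaces the last error if it gives up, with a longer default deadline. Below is a rough, standalone sketch of that retry-until-nil idiom, assuming made-up timeout and polling values; it is not CockroachDB's actual helper.

// retry_sketch.go: a hypothetical sketch of the retry-until-nil idiom that
// testutils.SucceedsSoon provides. The real helper lives in pkg/testutils and
// has its own backoff and default timeout; the values below are made up.
package main

import (
	"fmt"
	"time"
)

// succeedsSoon retries fn until it returns nil or the deadline expires, and
// returns the last error if the condition was never met.
func succeedsSoon(deadline, tick time.Duration, fn func() error) error {
	var lastErr error
	for end := time.Now().Add(deadline); time.Now().Before(end); time.Sleep(tick) {
		if lastErr = fn(); lastErr == nil {
			return nil
		}
	}
	return fmt.Errorf("condition never met within %s: %w", deadline, lastErr)
}

func main() {
	start := time.Now()
	// Simulate a condition (e.g. "all active nodes see n4 as DECOMMISSIONED")
	// that only becomes true after half a second.
	err := succeedsSoon(2*time.Second, 100*time.Millisecond, func() error {
		if time.Since(start) < 500*time.Millisecond {
			return fmt.Errorf("cluster state not converged yet")
		}
		return nil
	})
	fmt.Println("result:", err) // result: <nil>
}

Returning an error rather than a bool also means each failed attempt can say which node's view is lagging, which is what the errors.Errorf messages in the diff provide.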