Commit e837db7

craig[bot], yuzefovich, rickystewart, pav-kv, sumeerbhola
committed
147168: colexecdisk: propagate DiskFull errors as expected r=yuzefovich a=yuzefovich

We just saw a sentry report that was issued due to an InternalError raised after Dequeue'ing from a disk queue. It's not clear what the error was (since it was redacted), but it might have been a DiskFull error. We already have special handling for it on the Enqueue path, but the Dequeue path can also trigger this error (on the first call to Dequeue after some Enqueue calls, in order to flush the buffered batches), so this commit audits all disk queue methods to use the helper for error propagation. The only place where we do disk usage accounting is `diskQueue.writeFooterAndFlush`, so I traced which methods could end up calling it (both Enqueue and Dequeue, but also Close) and their call sites; this is how the affected places were chosen. Additionally, I didn't want to introduce error propagation via panics where it wasn't there already, so one spot wasn't modified.

Fixes: #147132.

Release note: None

157847: bench/rttanalysis: shard TestBenchmarkExpectation to avoid timeouts r=rafiss a=rickystewart

Re-apply `9fecc53b0b3cde307b5379ee5b88fae0fc8f34e2`, but add a `skip` if the test is running under `s390x`.

Release note: none
Epic: none

157868: testcluster: assign unique ClusterName r=RaduBerinde,stevendanna a=pav-kv

This commit makes `TestCluster` assign a unique cluster name in the `TestServerArgs` by default. The cluster name is shared by all participating or added nodes, unless overridden in the per-node args. This helps prevent accidental message exchange between `TestCluster`s in the same environment that use the same TCP port in close proximity to each other.

Addresses #157838

157930: mma: fix a couple of todos related to changes r=wenyihu6 a=sumeerbhola

- subsumesChanges no longer needs the prev state as a parameter, since subsumption is only a function of the observed state and the expected next state. The existing comparison with prev.IsLeaseholder was unnecessary and flawed.
- Removed the todo around the panic in applyReplicaChange, since the expectation is that the range and store must exist; the callers are responsible for ensuring that. There was an old, flawed idea that we would try to add the range at this point in the code, which we had long since abandoned but never removed from the code comment.

Epic: CRDB-55052
Release note: None

Co-authored-by: Yahor Yuzefovich <[email protected]>
Co-authored-by: Ricky Stewart <[email protected]>
Co-authored-by: Pavel Kalinnikov <[email protected]>
Co-authored-by: sumeerbhola <[email protected]>
5 parents: f0afff0 + a077113 + 3889649 + 1757367 + 426ce7f
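The DiskFull handling described in 147168 boils down to routing every error that a disk queue method can surface through one classification helper, so that a disk-budget error is reported as an expected, user-facing error rather than as an internal assertion failure. Below is a minimal, self-contained sketch of that pattern; the identifiers are hypothetical stand-ins, not the actual colexecdisk or colexecerror names.

// Sketch only: errDiskFull, expectedError, internalError, and
// handleDiskQueueError are illustrative stand-ins, not real cockroach identifiers.
package main

import (
	"errors"
	"fmt"
)

// errDiskFull stands in for the disk-budget-exceeded error.
var errDiskFull = errors.New("disk budget exceeded")

type expectedError struct{ err error }
type internalError struct{ err error }

func (e expectedError) Error() string { return "expected: " + e.err.Error() }
func (e internalError) Error() string { return "internal: " + e.err.Error() }

// handleDiskQueueError classifies errors coming out of Enqueue, Dequeue, and
// Close, all of which can end up flushing buffered batches and hence hitting
// the disk budget.
func handleDiskQueueError(err error) error {
	if errors.Is(err, errDiskFull) {
		return expectedError{err}
	}
	return internalError{err}
}

func main() {
	// A Dequeue that performs the first flush of buffered batches can also
	// report a full disk, so it must go through the same helper as Enqueue.
	fmt.Println(handleDiskQueueError(errDiskFull))
	fmt.Println(handleDiskQueueError(errors.New("checksum mismatch")))
}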

File tree

17 files changed: +203 −88 lines changed

pkg/base/test_server_args.go

Lines changed: 8 additions & 1 deletion
@@ -122,7 +122,14 @@ type TestServerArgs struct {
 	UseDatabase string

 	// If set, this will be configured in the test server to check connections
-	// from other test servers and to report in the SQL introspection.
+	// from other test servers and to report in the SQL introspection. It is
+	// advised to make the name sufficiently unique, in order to prevent a
+	// TestCluster from accidentally getting messages from unrelated clusters in
+	// the same environment that used the same TCP ports recently (e.g. see
+	// https://github.com/cockroachdb/cockroach/issues/157838).
+	//
+	// If empty (most cases), a unique ClusterName is generated automatically, or
+	// a higher-level default is used (e.g. taken from TestClusterArgs).
 	ClusterName string

 	// Stopper can be used to stop the server. If not set, a stopper will be
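As a rough illustration of the 157868 behavior the new comment describes, a default can be filled in whenever ClusterName is left empty. The helper name and argument shape below are assumptions made for the sketch, not the actual testcluster code.

package main

import (
	"fmt"
	"math/rand"
)

type serverArgs struct {
	ClusterName string
}

// genUniqueClusterName returns a name that is unlikely to collide with other
// test clusters running in the same environment.
func genUniqueClusterName() string {
	return fmt.Sprintf("testcluster-%d", rand.Int63())
}

// startTestCluster fills in a default when the caller leaves ClusterName
// empty, mirroring the behavior described in the comment above.
func startTestCluster(args serverArgs) serverArgs {
	if args.ClusterName == "" {
		args.ClusterName = genUniqueClusterName()
	}
	// Every node in the cluster would then share args.ClusterName unless a
	// per-node override is supplied.
	return args
}

func main() {
	fmt.Println(startTestCluster(serverArgs{}).ClusterName)
}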

pkg/bench/rttanalysis/BUILD.bazel

Lines changed: 4 additions & 1 deletion
@@ -13,6 +13,8 @@ go_library(
     visibility = ["//visibility:public"],
     deps = [
         "//pkg/base",
+        "//pkg/jobs",
+        "//pkg/jobs/jobspb",
         "//pkg/kv/kvclient/kvcoord",
         "//pkg/sql",
         "//pkg/sql/parser",
@@ -56,9 +58,9 @@ go_test(
     data = glob(["testdata/**"]),
     embed = [":rttanalysis"],
     exec_properties = {"test.Pool": "large"},
+    shard_count = 4,
     deps = [
         "//pkg/base",
-        "//pkg/jobs",
         "//pkg/jobs/jobspb",
         "//pkg/security/securityassets",
         "//pkg/security/securitytest",
@@ -70,6 +72,7 @@ go_test(
         "//pkg/testutils/serverutils",
         "//pkg/testutils/skip",
         "//pkg/testutils/testcluster",
+        "//pkg/util/envutil",
         "//pkg/util/protoutil",
         "//pkg/util/randutil",
     ],

pkg/bench/rttanalysis/registry.go

Lines changed: 63 additions & 6 deletions
@@ -6,9 +6,12 @@
 package rttanalysis

 import (
+	"runtime"
 	"strings"
 	"testing"

+	"github.com/cockroachdb/cockroach/pkg/jobs"
+	"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
 	"github.com/cockroachdb/cockroach/pkg/testutils/skip"
 	"github.com/cockroachdb/errors"
 	"github.com/stretchr/testify/require"
@@ -51,15 +54,69 @@ func (r *Registry) Run(b *testing.B) {
 // benchmarks can be filtered by passing the usual test filters underneath
 // this test's name.
 //
-// It takes a long time and thus is skipped under stress, race
-// and short.
+// It takes a long time and thus is skipped under duress and short.
 func (r *Registry) RunExpectations(t *testing.T) {
-	skip.UnderStress(t)
-	skip.UnderRace(t)
+	r.RunExpectationsSharded(t, 1, 1)
+}
+
+// RunExpectationsSharded runs all the benchmarks for one iteration
+// and validates that the number of RPCs meets the expectation. If run
+// with the --rewrite flag, it will rewrite the run benchmarks. The
+// benchmarks can be filtered by passing the usual test filters underneath
+// this test's name.
+//
+// It takes a long time and thus is skipped under duress and short.
+//
+// When shard and totalShards are provided (> 1), only a subset of benchmarks
+// assigned to the specific shard will be run, enabling parallel execution.
+// Test groups are distributed across shards using round-robin assignment.
+func (r *Registry) RunExpectationsSharded(t *testing.T, shard, totalShards int) {
+	defer jobs.TestingSetIDsToIgnore(map[jobspb.JobID]struct{}{3001: {}, 3002: {}})()
+	skip.UnderDuress(t)
 	skip.UnderShort(t)
-	skip.UnderDeadlock(t)
+	if runtime.GOARCH == "s390x" {
+		skip.IgnoreLint(t, "test prone to crashing under s390x (see #154317)")
+	}
+
+	// If totalShards is 1, run all tests; otherwise shard them
+	var registryToUse *Registry
+	if totalShards <= 1 {
+		// Run all test groups
+		registryToUse = r
+	} else {
+		// Create a registry with only the test groups assigned to this shard
+		shardRegistry := &Registry{
+			numNodes: r.numNodes,
+			cc:       r.cc,
+			r:        make(map[string][]RoundTripBenchTestCase),
+		}
+
+		// Distribute test groups across shards using round-robin assignment
+		// First, get all group names and sort them for consistent ordering
+		groupNames := make([]string, 0, len(r.r))
+		for groupName := range r.r {
+			groupNames = append(groupNames, groupName)
+		}
+		// Sort for deterministic assignment across runs
+		for i := 0; i < len(groupNames); i++ {
+			for j := i + 1; j < len(groupNames); j++ {
+				if groupNames[i] > groupNames[j] {
+					groupNames[i], groupNames[j] = groupNames[j], groupNames[i]
+				}
+			}
+		}

+		// Assign groups to shards using round-robin
+		for i, groupName := range groupNames {
+			assignedShard := (i % totalShards) + 1
+			if assignedShard == shard {
+				shardRegistry.r[groupName] = r.r[groupName]
+			}
+		}
+		registryToUse = shardRegistry
+	}

-	runBenchmarkExpectationTests(t, r)
+	runBenchmarkExpectationTests(t, registryToUse)
 }

 // Register registers a set of test cases to a given benchmark name. It is
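For intuition on the round-robin assignment above: group names are ordered deterministically, and group i lands on shard (i % totalShards) + 1. The stand-alone sketch below uses sort.Strings in place of the hand-rolled sort, with made-up group names.

package main

import (
	"fmt"
	"sort"
)

// assignShards mirrors the round-robin logic in RunExpectationsSharded:
// deterministic ordering first, then modulo assignment to 1-based shards.
func assignShards(groups []string, totalShards int) map[int][]string {
	sort.Strings(groups)
	byShard := make(map[int][]string)
	for i, g := range groups {
		shard := (i % totalShards) + 1
		byShard[shard] = append(byShard[shard], g)
	}
	return byShard
}

func main() {
	// Hypothetical group names; with 4 shards, indices 0 and 4 go to shard 1,
	// indices 1 and 5 to shard 2, and so on.
	groups := []string{"AlterRole", "CreateRole", "DropRole", "Grant", "Revoke", "Truncate"}
	byShard := assignShards(groups, 4)
	for shard := 1; shard <= 4; shard++ {
		fmt.Println(shard, byShard[shard])
	}
}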

pkg/bench/rttanalysis/validate_benchmark_data_test.go

Lines changed: 36 additions & 5 deletions
@@ -6,13 +6,44 @@
 package rttanalysis

 import (
+	"strconv"
 	"testing"

-	"github.com/cockroachdb/cockroach/pkg/jobs"
-	"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
+	"github.com/cockroachdb/cockroach/pkg/util/envutil"
 )

-func TestBenchmarkExpectation(t *testing.T) {
-	defer jobs.TestingSetIDsToIgnore(map[jobspb.JobID]struct{}{3001: {}, 3002: {}})()
-	reg.RunExpectations(t)
+// NOTE: If you change the number of shards, you must also update the
+// shard_count in BUILD.bazel to match.
+const shardCount = 4
+
+// Validate that shardCount matches TEST_TOTAL_SHARDS environment variable at init time
+var _ = func() int {
+	totalShardsStr, found := envutil.ExternalEnvString("TEST_TOTAL_SHARDS", 1)
+	if totalShardsStr == "" || !found {
+		return 0
+	}
+	totalShards, err := strconv.Atoi(totalShardsStr)
+	if err != nil {
+		return 0
+	}
+	if totalShards != shardCount {
+		panic("shardCount mismatch: update shard_count in pkg/bench/rttanalysis/BUILD.bazel to match shardCount constant")
+	}
+	return 0
+}()
+
+func TestBenchmarkExpectationShard1(t *testing.T) {
+	reg.RunExpectationsSharded(t, 1, shardCount)
+}
+
+func TestBenchmarkExpectationShard2(t *testing.T) {
+	reg.RunExpectationsSharded(t, 2, shardCount)
+}
+
+func TestBenchmarkExpectationShard3(t *testing.T) {
+	reg.RunExpectationsSharded(t, 3, shardCount)
+}
+
+func TestBenchmarkExpectationShard4(t *testing.T) {
+	reg.RunExpectationsSharded(t, 4, shardCount)
 }
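The init-time guard above only fires when Bazel sharding is active. Below is a stand-alone version of the same check written against the standard library rather than envutil (which is what the real test uses), assuming Bazel's convention of exporting TEST_TOTAL_SHARDS to sharded test binaries.

package main

import (
	"fmt"
	"os"
	"strconv"
)

// shardCount must match shard_count in BUILD.bazel.
const shardCount = 4

// checkShardCount returns an error if Bazel sharding is active but configured
// with a different number of shards than the Go constant.
func checkShardCount() error {
	v, ok := os.LookupEnv("TEST_TOTAL_SHARDS") // exported by Bazel when sharding
	if !ok || v == "" {
		return nil // not running under Bazel sharding; nothing to check
	}
	total, err := strconv.Atoi(v)
	if err != nil {
		return nil // unparsable value; skip the check, as the real guard does
	}
	if total != shardCount {
		return fmt.Errorf("BUILD.bazel declares %d shards but shardCount is %d", total, shardCount)
	}
	return nil
}

func main() {
	if err := checkShardCount(); err != nil {
		panic(err)
	}
	fmt.Println("shard configuration is consistent")
}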

pkg/cli/debug_recover_loss_of_quorum_test.go

Lines changed: 6 additions & 2 deletions
@@ -137,8 +137,8 @@ func TestCollectInfoFromOnlineCluster(t *testing.T) {
 		"recover",
 		"collect-info",
 		"--insecure",
-		"--host",
-		tc.Server(2).AdvRPCAddr(),
+		"--host", tc.Server(2).AdvRPCAddr(),
+		"--cluster-name", tc.ClusterName(),
 		replicaInfoFileName,
 	})

@@ -554,6 +554,7 @@ func TestHalfOnlineLossOfQuorumRecovery(t *testing.T) {
 		"--confirm=y",
 		"--certs-dir=test_certs",
 		"--host=" + tc.Server(0).AdvRPCAddr(),
+		"--cluster-name=" + tc.ClusterName(),
 		"--plan=" + planFile,
 	})
 	require.NoError(t, err, "failed to run make-plan")
@@ -577,6 +578,7 @@ func TestHalfOnlineLossOfQuorumRecovery(t *testing.T) {
 		"debug", "recover", "apply-plan",
 		"--certs-dir=test_certs",
 		"--host=" + tc.Server(0).AdvRPCAddr(),
+		"--cluster-name=" + tc.ClusterName(),
 		"--confirm=y", planFile,
 	})
 	require.NoError(t, err, "failed to run apply plan")
@@ -592,6 +594,7 @@ func TestHalfOnlineLossOfQuorumRecovery(t *testing.T) {
 		"debug", "recover", "verify",
 		"--certs-dir=test_certs",
 		"--host=" + tc.Server(0).AdvRPCAddr(),
+		"--cluster-name=" + tc.ClusterName(),
 		planFile,
 	})
 	require.NoError(t, err, "failed to run verify plan")
@@ -641,6 +644,7 @@ func TestHalfOnlineLossOfQuorumRecovery(t *testing.T) {
 		"debug", "recover", "verify",
 		"--certs-dir=test_certs",
 		"--host=" + tc.Server(0).AdvRPCAddr(),
+		"--cluster-name=" + tc.ClusterName(),
 		planFile,
 	})
 	require.NoError(t, err, "failed to run verify plan")

pkg/cli/testdata/zip/partial1

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 zip
 ----
-debug zip --concurrency=1 --cpu-profile-duration=0s --validate-zip-file=false /dev/null
+debug zip --concurrency=1 --cpu-profile-duration=0s --validate-zip-file=false --cluster-name=<cluster-name> /dev/null
 [cluster] discovering virtual clusters... done
 [cluster] creating output file /dev/null... done
 [cluster] establishing RPC connection to ...

pkg/cli/testdata/zip/partial1_excluded

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 zip
 ----
-debug zip /dev/null --concurrency=1 --exclude-nodes=2 --cpu-profile-duration=0 --validate-zip-file=false
+debug zip --concurrency=1 --exclude-nodes=2 --cpu-profile-duration=0 --validate-zip-file=false --cluster-name=<cluster-name> /dev/null
 [cluster] discovering virtual clusters... done
 [cluster] creating output file /dev/null... done
 [cluster] establishing RPC connection to ...

pkg/cli/testdata/zip/partial2

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 zip
 ----
-debug zip --concurrency=1 --cpu-profile-duration=0 --validate-zip-file=false /dev/null
+debug zip --concurrency=1 --cpu-profile-duration=0 --validate-zip-file=false --cluster-name=<cluster-name> /dev/null
 [cluster] discovering virtual clusters... done
 [cluster] creating output file /dev/null... done
 [cluster] establishing RPC connection to ...

pkg/cli/testdata/zip/testzip_concurrent

Lines changed: 1 addition & 1 deletion
@@ -227,4 +227,4 @@ zip
 [node ?] ? log files found
 [node ?] ? log files found
 [node ?] ? log files found
-debug zip --timeout=30s --cpu-profile-duration=0s --validate-zip-file=false /dev/null
+debug zip --timeout=30s --cpu-profile-duration=0s --validate-zip-file=false --cluster-name=<cluster-name> /dev/null

pkg/cli/zip_test.go

Lines changed: 40 additions & 30 deletions
@@ -421,10 +421,11 @@ func TestConcurrentZip(t *testing.T) {
 	defer func(prevStderr *os.File) { stderr = prevStderr }(stderr)
 	stderr = os.Stdout

-	out, err := c.RunWithCapture("debug zip --timeout=30s --cpu-profile-duration=0s --validate-zip-file=false " + os.DevNull)
-	if err != nil {
-		t.Fatal(err)
-	}
+	out, err := c.RunWithCapture(fmt.Sprintf(
+		"debug zip --timeout=30s --cpu-profile-duration=0s --validate-zip-file=false --cluster-name=%s %s",
+		tc.ClusterName(), os.DevNull,
+	))
+	require.NoError(t, err)

 	// Strip any non-deterministic messages.
 	out = eraseNonDeterministicZipOutput(out)
@@ -437,6 +438,8 @@ func TestConcurrentZip(t *testing.T) {
 	// which the original messages interleve with other messages mean the number
 	// of them after each series is collapsed is also non-derministic.
 	out = regexp.MustCompile(`<dumping SQL tables>\n`).ReplaceAllString(out, "")
+	// Replace the non-deterministic cluster name with a placeholder.
+	out = eraseClusterName(out, tc.ClusterName())

 	// We use datadriven simply to read the golden output file; we don't actually
 	// run any commands. Using datadriven allows TESTFLAGS=-rewrite.
@@ -541,9 +544,8 @@ func TestUnavailableZip(t *testing.T) {
 	tc := testcluster.StartTestCluster(t, 3,
 		base.TestClusterArgs{ServerArgs: base.TestServerArgs{
 			DefaultTestTenant: base.TestIsSpecificToStorageLayerAndNeedsASystemTenant,
-
-			Insecure: true,
-			Knobs:    base.TestingKnobs{Store: knobs},
+			Insecure:          true,
+			Knobs:             base.TestingKnobs{Store: knobs},
 		}})
 	defer tc.Stopper().Stop(context.Background())

@@ -559,9 +561,10 @@ func TestUnavailableZip(t *testing.T) {
 	defer close(ch)

 	// Run debug zip against node 1.
-	debugZipCommand :=
-		"debug zip --concurrency=1 --cpu-profile-duration=0 " + os.
-			DevNull + " --timeout=.5s"
+	debugZipCommand := fmt.Sprintf(
+		"debug zip --concurrency=1 --cpu-profile-duration=0 --timeout=.5s --cluster-name=%s %s",
+		tc.ClusterName(), os.DevNull,
+	)

 	t.Run("server 1", func(t *testing.T) {
 		c := TestCLI{
@@ -651,6 +654,10 @@ func baseZipOutput(nodeId int) []string {
 	return output
 }

+func eraseClusterName(str, name string) string {
+	return strings.ReplaceAll(str, name, "<cluster-name>")
+}
+
 func eraseNonDeterministicZipOutput(out string) string {
 	re := regexp.MustCompile(`(?m)postgresql://.*$`)
 	out = re.ReplaceAllString(out, `postgresql://...`)
@@ -736,13 +743,15 @@ func TestPartialZip(t *testing.T) {
 	defer func(prevStderr *os.File) { stderr = prevStderr }(stderr)
 	stderr = os.Stdout

-	out, err := c.RunWithCapture("debug zip --concurrency=1 --cpu-profile-duration=0s --validate-zip-file=false " + os.DevNull)
-	if err != nil {
-		t.Fatal(err)
-	}
+	out, err := c.RunWithCapture(fmt.Sprintf(
+		"debug zip --concurrency=1 --cpu-profile-duration=0s --validate-zip-file=false --cluster-name=%s %s",
+		tc.ClusterName(), os.DevNull,
+	))
+	require.NoError(t, err)

 	// Strip any non-deterministic messages.
 	t.Log(out)
+	out = eraseClusterName(out, tc.ClusterName())
 	out = eraseNonDeterministicZipOutput(out)

 	datadriven.RunTest(t, datapathutils.TestDataPath(t, "zip", "partial1"),
@@ -751,12 +760,13 @@ func TestPartialZip(t *testing.T) {
 		})

 	// Now do it again and exclude the down node explicitly.
-	out, err = c.RunWithCapture("debug zip " + os.DevNull + " --concurrency=1 --exclude-nodes=2 --cpu-profile-duration=0" +
-		" --validate-zip-file=false")
-	if err != nil {
-		t.Fatal(err)
-	}
+	out, err = c.RunWithCapture(fmt.Sprintf(
+		"debug zip --concurrency=1 --exclude-nodes=2 --cpu-profile-duration=0 --validate-zip-file=false --cluster-name=%s %s",
+		tc.ClusterName(), os.DevNull,
+	))
+	require.NoError(t, err)

+	out = eraseClusterName(out, tc.ClusterName())
 	out = eraseNonDeterministicZipOutput(out)
 	datadriven.RunTest(t, datapathutils.TestDataPath(t, "zip", "partial1_excluded"),
 		func(t *testing.T, td *datadriven.TestData) string {
@@ -767,12 +777,11 @@ func TestPartialZip(t *testing.T) {
 	// skips over it automatically. We specifically use --wait=none because
 	// we're decommissioning a node in a 3-node cluster, so there's no node to
 	// up-replicate the under-replicated ranges to.
-	{
-		_, err := c.RunWithCapture(fmt.Sprintf("node decommission --checks=skip --wait=none %d", 2))
-		if err != nil {
-			t.Fatal(err)
-		}
-	}
+	_, err = c.RunWithCapture(fmt.Sprintf(
+		"node decommission --checks=skip --wait=none --cluster-name=%s %d",
+		tc.ClusterName(), 2,
+	))
+	require.NoError(t, err)

 	// We use .Override() here instead of SET CLUSTER SETTING in SQL to
 	// override the 1m15s minimum placed on the cluster setting. There
@@ -787,12 +796,13 @@ func TestPartialZip(t *testing.T) {
 	datadriven.RunTest(t, datapathutils.TestDataPath(t, "zip", "partial2"),
 		func(t *testing.T, td *datadriven.TestData) string {
 			f := func() string {
-				out, err := c.RunWithCapture("debug zip --concurrency=1 --cpu-profile-duration=0 --validate-zip-file=false " + os.DevNull)
-				if err != nil {
-					t.Fatal(err)
-				}
-
+				out, err := c.RunWithCapture(fmt.Sprintf(
+					"debug zip --concurrency=1 --cpu-profile-duration=0 --validate-zip-file=false --cluster-name=%s %s",
+					tc.ClusterName(), os.DevNull,
+				))
+				require.NoError(t, err)
 				// Strip any non-deterministic messages.
+				out = eraseClusterName(out, tc.ClusterName())
 				return eraseNonDeterministicZipOutput(out)
 			}
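The eraseClusterName helper added to zip_test.go keeps the golden testdata comparable even though each run now uses a unique cluster name. A tiny usage sketch, with a made-up cluster name:

package main

import (
	"fmt"
	"strings"
)

// eraseClusterName mirrors the helper added in zip_test.go: it swaps the
// run-specific cluster name for the placeholder used by the testdata files.
func eraseClusterName(out, name string) string {
	return strings.ReplaceAll(out, name, "<cluster-name>")
}

func main() {
	name := "testcluster-3f9a1c2d" // made-up example name
	out := "debug zip --concurrency=1 --cluster-name=" + name + " /dev/null"
	// Prints: debug zip --concurrency=1 --cluster-name=<cluster-name> /dev/null
	fmt.Println(eraseClusterName(out, name))
}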