asim: trace mma.ComputeChanges and save to generated artifacts

tbg · tbg · commit c24cb04fa900 · 2025-09-19T10:06:58.000+02:00
diff --git a/pkg/kv/kvserver/asim/BUILD.bazel b/pkg/kv/kvserver/asim/BUILD.bazel
@@ -18,6 +18,8 @@ go_library(
         "//pkg/kv/kvserver/asim/storerebalancer",
         "//pkg/kv/kvserver/asim/workload",
         "//pkg/util/log",
+        "//pkg/util/tracing",
+        "//pkg/util/tracing/tracingpb",
     ],
 )
 
diff --git a/pkg/kv/kvserver/asim/asim.go b/pkg/kv/kvserver/asim/asim.go
@@ -21,12 +21,16 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/asim/storerebalancer"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/asim/workload"
 	"github.com/cockroachdb/cockroach/pkg/util/log"
+	"github.com/cockroachdb/cockroach/pkg/util/tracing"
+	"github.com/cockroachdb/cockroach/pkg/util/tracing/tracingpb"
 )
 
 // Simulator simulates an entire cluster, and runs the allocator of each store
 // in that cluster.
 type Simulator struct {
 	log.AmbientContext
+	onRecording func(storeID state.StoreID, rec tracingpb.Recording)
+
 	curr time.Time
 	end  time.Time
 	// interval is the step between ticks for active simulation components, such
@@ -96,22 +100,27 @@ func NewSimulator(
 
 	s := &Simulator{
 		AmbientContext: log.MakeTestingAmbientCtxWithNewTracer(),
-		curr:           settings.StartTime,
-		end:            settings.StartTime.Add(duration),
-		interval:       settings.TickInterval,
-		generators:     wgs,
-		state:          initialState,
-		changer:        changer,
-		rqs:            rqs,
-		lqs:            lqs,
-		sqs:            sqs,
-		controllers:    controllers,
-		srs:            srs,
-		mmSRs:          mmSRs,
-		pacers:         pacers,
-		gossip:         gossip.NewGossip(initialState, settings),
-		metrics:        m,
-		shuffler:       state.NewShuffler(settings.Seed),
+		onRecording: func(storeID state.StoreID, rec tracingpb.Recording) {
+			if fn := settings.OnRecording; fn != nil {
+				fn(int64(storeID), rec)
+			}
+		},
+		curr:        settings.StartTime,
+		end:         settings.StartTime.Add(duration),
+		interval:    settings.TickInterval,
+		generators:  wgs,
+		state:       initialState,
+		changer:     changer,
+		rqs:         rqs,
+		lqs:         lqs,
+		sqs:         sqs,
+		controllers: controllers,
+		srs:         srs,
+		mmSRs:       mmSRs,
+		pacers:      pacers,
+		gossip:      gossip.NewGossip(initialState, settings),
+		metrics:     m,
+		shuffler:    state.NewShuffler(settings.Seed),
 		// TODO(kvoli): Keeping the state around is a bit hacky, find a better
 		// method of reporting the ranges.
 		history:       history.History{Recorded: [][]metrics.StoreMetrics{}, S: initialState},
@@ -380,7 +389,14 @@ func (s *Simulator) tickMMStoreRebalancers(ctx context.Context, tick time.Time,
 	stores := s.state.Stores()
 	s.shuffler(len(stores), func(i, j int) { stores[i], stores[j] = stores[j], stores[i] })
 	for _, store := range stores {
+		var finishAndGetRecording func() tracingpb.Recording
+		if s.onRecording != nil {
+			ctx, finishAndGetRecording = tracing.ContextWithRecordingSpan(ctx, s.Tracer, "mma.ComputeChanges")
+		}
 		s.mmSRs[store.StoreID()].Tick(ctx, tick, state)
+		if finishAndGetRecording != nil {
+			s.onRecording(store.StoreID(), finishAndGetRecording())
+		}
 	}
 }
 
diff --git a/pkg/kv/kvserver/asim/config/BUILD.bazel b/pkg/kv/kvserver/asim/config/BUILD.bazel
@@ -5,5 +5,8 @@ go_library(
     srcs = ["settings.go"],
     importpath = "github.com/cockroachdb/cockroach/pkg/kv/kvserver/asim/config",
     visibility = ["//visibility:public"],
-    deps = ["//pkg/settings/cluster"],
+    deps = [
+        "//pkg/settings/cluster",
+        "//pkg/util/tracing/tracingpb",
+    ],
 )
diff --git a/pkg/kv/kvserver/asim/config/settings.go b/pkg/kv/kvserver/asim/config/settings.go
@@ -9,6 +9,7 @@ import (
 	"time"
 
 	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
+	"github.com/cockroachdb/cockroach/pkg/util/tracing/tracingpb"
 )
 
 const (
@@ -116,6 +117,9 @@ type SimulationSettings struct {
 	// TODO(wenyihu6): Remove any non-simulation settings from this struct and
 	// instead override the settings below.
 	ST *cluster.Settings
+	// OnRecording is called with trace spans obtained by recording the allocator.
+	// NB: we can't use state.StoreID here since that causes an import cycle.
+	OnRecording func(storeID int64, rec tracingpb.Recording)
 }
 
 // DefaultSimulationSettings returns a set of default settings for simulation.
diff --git a/pkg/kv/kvserver/asim/tests/BUILD.bazel b/pkg/kv/kvserver/asim/tests/BUILD.bazel
@@ -58,8 +58,10 @@ go_test(
         "//pkg/util/humanizeutil",
         "//pkg/util/leaktest",
         "//pkg/util/log",
+        "//pkg/util/tracing/tracingpb",
         "@com_github_cockroachdb_datadriven//:datadriven",
         "@com_github_cockroachdb_logtags//:logtags",
+        "@com_github_stretchr_testify//assert",
         "@com_github_stretchr_testify//require",
     ],
 )
diff --git a/pkg/kv/kvserver/asim/tests/datadriven_simulation_test.go b/pkg/kv/kvserver/asim/tests/datadriven_simulation_test.go
@@ -13,6 +13,7 @@ import (
 	"math/rand"
 	"os"
 	"path/filepath"
+	"regexp"
 	"strings"
 	"testing"
 	"time"
@@ -35,8 +36,10 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
 	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
 	"github.com/cockroachdb/cockroach/pkg/util/log"
+	"github.com/cockroachdb/cockroach/pkg/util/tracing/tracingpb"
 	"github.com/cockroachdb/datadriven"
 	"github.com/cockroachdb/logtags"
+	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 )
 
@@ -165,6 +168,12 @@ var runAsimTests = envutil.EnvOrDefaultBool("COCKROACH_RUN_ASIM_TESTS", false)
 //     random number generator that creates the seed used to generate each
 //     simulation sample. The default values are: duration=30m (30 minutes)
 //     samples=1 seed=random.
+//
+// To run all tests and rewrite the testdata files as well as generate the
+// artifacts in `testdata/generated`, you can use:
+/*
+./dev test pkg/kv/kvserver/asim/tests --ignore-cache --rewrite -v -f TestDataDriven -- --test_env COCKROACH_RUN_ASIM_TESTS=true --test_env COCKROACH_ALWAYS_KEEP_TEST_LOGS=true
+*/
 func TestDataDriven(t *testing.T) {
 	skip.UnderDuressWithIssue(t, 149875)
 	leakTestAfter := leaktest.AfterTest(t)
@@ -531,7 +540,34 @@ func TestDataDriven(t *testing.T) {
 						require.NotNil(t, set, "unknown mode value: %s", mv)
 						set(&eventGen)
 
+						// TODO(tbg): need to decide whether multiple evals in a single file
+						// is a feature or an anti-pattern. If it's a feature, we should let
+						// the `name` part below be adjustable (but not the plotDir) via a
+						// parameter to the `eval` command.
+						testName := name + "_" + mv
+
 						for sample := 0; sample < samples; sample++ {
+							recIdx := map[int64]int{}
+							settingsGen.Settings.OnRecording = func(storeID int64, rec tracingpb.Recording) {
+								if !rewrite || len(rec[0].Logs) == 0 {
+									return
+								}
+								traceDir := filepath.Join(plotDir, "traces", fmt.Sprintf("s%d", storeID))
+								if recIdx[storeID] == 0 {
+									require.NoError(t, os.MkdirAll(traceDir, 0755))
+								}
+								re := regexp.MustCompile(`[^a-zA-Z0-9]+`)
+								outName := fmt.Sprintf("%s_%s_s%d", mv, re.ReplaceAllString(rec[0].Operation, "_"), storeID)
+								if sample > 0 {
+									outName += fmt.Sprintf("_sample%d", sample+1)
+								}
+								outName += "_" + fmt.Sprintf("%03d.txt", recIdx[storeID])
+								assert.NoError(t, os.WriteFile(
+									filepath.Join(traceDir, outName),
+									[]byte(rec.String()), 0644))
+								recIdx[storeID] += 1
+							}
+
 							assertionFailures := []string{}
 							var tmpStrB *strings.Builder = nil
 							if stateStrForOnce == "" {
@@ -561,11 +597,7 @@ func TestDataDriven(t *testing.T) {
 						// Generate artifacts. Hash artifact input data to ensure they are
 						// up to date.
 						hasher := fnv.New64a()
-						// TODO(tbg): need to decide whether multiple evals in a single file
-						// is a feature or an anti-pattern. If it's a feature, we should let
-						// the `name` part below be adjustable (but not the plotDir) via a
-						// parameter to the `eval` command.
-						testName := name + "_" + mv
+
 						for sample, h := range run.hs {
 							printStatsAndGenerateJSON(t, &buf, h, testName, sample+1, plotDir, hasher, rewrite,
 								settingsGen.Settings.TickInterval, metricsMap)
diff --git a/pkg/kv/kvserver/asim/tests/testdata/generated/example_rebalancing/example_rebalancing_setup.txt b/pkg/kv/kvserver/asim/tests/testdata/generated/example_rebalancing/example_rebalancing_setup.txt
@@ -13,4 +13,5 @@ Event
 Workload Set Up
 	[1,10000): 95%r large-block [128-256B/op, 7000ops/s]
 Changed Settings
-	StateExchangeDelay: 20s (default: 500ms)
+	StateExchangeDelay: 20s (default: 500ms)
+	OnRecording: 0x10487ef50 (default: <nil>)
diff --git a/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma_skewed_cpu_skewed_write.txt b/pkg/kv/kvserver/asim/tests/testdata/non_rand/mma_skewed_cpu_skewed_write.txt
@@ -73,4 +73,5 @@ Workload Set Up
 	[10001,20000): write-only large-block [0.00cpu-us/write(raft), 1000B/op, 20000ops/s]
 Changed Settings
 	SplitQueueEnabled: false (default: true)
+	OnRecording: 0x10487ef50 (default: <nil>)
 ==========================

Original file line number	Diff line number	Diff line change
`@@ -18,6 +18,8 @@ go_library(`
`18`	`18`	`"//pkg/kv/kvserver/asim/storerebalancer",`
`19`	`19`	`"//pkg/kv/kvserver/asim/workload",`
`20`	`20`	`"//pkg/util/log",`
	`21`	`+ "//pkg/util/tracing",`
	`22`	`+ "//pkg/util/tracing/tracingpb",`
`21`	`23`	`],`
`22`	`24`	`)`
`23`	`25`
Original file line number	Diff line number	Diff line change
`@@ -9,6 +9,7 @@ import (`
`9`	`9`	`"time"`
`10`	`10`
`11`	`11`	`"github.com/cockroachdb/cockroach/pkg/settings/cluster"`
	`12`	`+ "github.com/cockroachdb/cockroach/pkg/util/tracing/tracingpb"`
`12`	`13`	`)`
`13`	`14`
`14`	`15`	`const (`
`@@ -116,6 +117,9 @@ type SimulationSettings struct {`
`116`	`117`	`// TODO(wenyihu6): Remove any non-simulation settings from this struct and`
`117`	`118`	`// instead override the settings below.`
`118`	`119`	`ST *cluster.Settings`
	`120`	`+ // OnRecording is called with trace spans obtained by recording the allocator.`
	`121`	`+ // NB: we can't use state.StoreID here since that causes an import cycle.`
	`122`	`+ OnRecording func(storeID int64, rec tracingpb.Recording)`
`119`	`123`	`}`
`120`	`124`
`121`	`125`	`// DefaultSimulationSettings returns a set of default settings for simulation.`