Skip to content

Commit c34c845

Browse files
craig[bot]tbg
andcommitted
Merge #153584
153584: asim: trace mma.ComputeChanges and save to generated artifacts r=tbg a=tbg This makes it much easier to figure out what a particular store's MMA was doing. Traces will be in `generated/<testname>/traces/<store>`. Epic: CRDB-49117 Co-authored-by: Tobias Grieger <[email protected]>
2 parents a4e63be + 7d20fcc commit c34c845

37 files changed

+140
-98
lines changed

pkg/kv/kvserver/asim/BUILD.bazel

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ go_library(
1818
"//pkg/kv/kvserver/asim/storerebalancer",
1919
"//pkg/kv/kvserver/asim/workload",
2020
"//pkg/util/log",
21+
"//pkg/util/tracing",
22+
"//pkg/util/tracing/tracingpb",
2123
],
2224
)
2325

pkg/kv/kvserver/asim/asim.go

Lines changed: 32 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,16 @@ import (
2121
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/asim/storerebalancer"
2222
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/asim/workload"
2323
"github.com/cockroachdb/cockroach/pkg/util/log"
24+
"github.com/cockroachdb/cockroach/pkg/util/tracing"
25+
"github.com/cockroachdb/cockroach/pkg/util/tracing/tracingpb"
2426
)
2527

2628
// Simulator simulates an entire cluster, and runs the allocator of each store
2729
// in that cluster.
2830
type Simulator struct {
2931
log.AmbientContext
32+
onRecording func(storeID state.StoreID, rec tracingpb.Recording)
33+
3034
curr time.Time
3135
end time.Time
3236
// interval is the step between ticks for active simulaton components, such
@@ -96,22 +100,27 @@ func NewSimulator(
96100

97101
s := &Simulator{
98102
AmbientContext: log.MakeTestingAmbientCtxWithNewTracer(),
99-
curr: settings.StartTime,
100-
end: settings.StartTime.Add(duration),
101-
interval: settings.TickInterval,
102-
generators: wgs,
103-
state: initialState,
104-
changer: changer,
105-
rqs: rqs,
106-
lqs: lqs,
107-
sqs: sqs,
108-
controllers: controllers,
109-
srs: srs,
110-
mmSRs: mmSRs,
111-
pacers: pacers,
112-
gossip: gossip.NewGossip(initialState, settings),
113-
metrics: m,
114-
shuffler: state.NewShuffler(settings.Seed),
103+
onRecording: func(storeID state.StoreID, rec tracingpb.Recording) {
104+
if fn := settings.OnRecording; fn != nil {
105+
fn(int64(storeID), rec)
106+
}
107+
},
108+
curr: settings.StartTime,
109+
end: settings.StartTime.Add(duration),
110+
interval: settings.TickInterval,
111+
generators: wgs,
112+
state: initialState,
113+
changer: changer,
114+
rqs: rqs,
115+
lqs: lqs,
116+
sqs: sqs,
117+
controllers: controllers,
118+
srs: srs,
119+
mmSRs: mmSRs,
120+
pacers: pacers,
121+
gossip: gossip.NewGossip(initialState, settings),
122+
metrics: m,
123+
shuffler: state.NewShuffler(settings.Seed),
115124
// TODO(kvoli): Keeping the state around is a bit hacky, find a better
116125
// method of reporting the ranges.
117126
history: history.History{Recorded: [][]metrics.StoreMetrics{}, S: initialState},
@@ -380,7 +389,14 @@ func (s *Simulator) tickMMStoreRebalancers(ctx context.Context, tick time.Time,
380389
stores := s.state.Stores()
381390
s.shuffler(len(stores), func(i, j int) { stores[i], stores[j] = stores[j], stores[i] })
382391
for _, store := range stores {
392+
var finishAndGetRecording func() tracingpb.Recording
393+
if s.onRecording != nil {
394+
ctx, finishAndGetRecording = tracing.ContextWithRecordingSpan(ctx, s.Tracer, "mma.ComputeChanges")
395+
}
383396
s.mmSRs[store.StoreID()].Tick(ctx, tick, state)
397+
if finishAndGetRecording != nil {
398+
s.onRecording(store.StoreID(), finishAndGetRecording())
399+
}
384400
}
385401
}
386402

pkg/kv/kvserver/asim/config/BUILD.bazel

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,8 @@ go_library(
55
srcs = ["settings.go"],
66
importpath = "github.com/cockroachdb/cockroach/pkg/kv/kvserver/asim/config",
77
visibility = ["//visibility:public"],
8-
deps = ["//pkg/settings/cluster"],
8+
deps = [
9+
"//pkg/settings/cluster",
10+
"//pkg/util/tracing/tracingpb",
11+
],
912
)

pkg/kv/kvserver/asim/config/settings.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"time"
1010

1111
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
12+
"github.com/cockroachdb/cockroach/pkg/util/tracing/tracingpb"
1213
)
1314

1415
const (
@@ -116,6 +117,9 @@ type SimulationSettings struct {
116117
// TODO(wenyihu6): Remove any non-simulation settings from this struct and
117118
// instead override the settings below.
118119
ST *cluster.Settings
120+
// OnRecording is called with trace spans obtained by recording the allocator.
121+
// NB: we can't use state.StoreID here since that causes an import cycle.
122+
OnRecording func(storeID int64, rec tracingpb.Recording)
119123
}
120124

121125
// DefaultSimulationSettings returns a set of default settings for simulation.

pkg/kv/kvserver/asim/gen/printer.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,8 @@ func compareSettingsToDefault(settings config.SimulationSettings) string {
3333
// Skip cluster setting ST and seed. The simulation seed is derived from
3434
// rand.New(rand.NewSource(42)).Int63() by default, while the default
3535
// simulation setting itself uses 42.
36-
if field.Name == "ST" || field.Name == "Seed" {
36+
// We also skip the OnRecording callback.
37+
if field.Name == "ST" || field.Name == "Seed" || field.Name == "OnRecording" {
3738
continue
3839
}
3940

pkg/kv/kvserver/asim/tests/BUILD.bazel

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,10 @@ go_test(
5858
"//pkg/util/humanizeutil",
5959
"//pkg/util/leaktest",
6060
"//pkg/util/log",
61+
"//pkg/util/tracing/tracingpb",
6162
"@com_github_cockroachdb_datadriven//:datadriven",
6263
"@com_github_cockroachdb_logtags//:logtags",
64+
"@com_github_stretchr_testify//assert",
6365
"@com_github_stretchr_testify//require",
6466
],
6567
)

pkg/kv/kvserver/asim/tests/datadriven_simulation_test.go

Lines changed: 38 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,11 @@ package tests
88
import (
99
"context"
1010
"fmt"
11-
"hash"
1211
"hash/fnv"
1312
"math/rand"
1413
"os"
1514
"path/filepath"
15+
"regexp"
1616
"strings"
1717
"testing"
1818
"time"
@@ -35,8 +35,10 @@ import (
3535
"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
3636
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
3737
"github.com/cockroachdb/cockroach/pkg/util/log"
38+
"github.com/cockroachdb/cockroach/pkg/util/tracing/tracingpb"
3839
"github.com/cockroachdb/datadriven"
3940
"github.com/cockroachdb/logtags"
41+
"github.com/stretchr/testify/assert"
4042
"github.com/stretchr/testify/require"
4143
)
4244

@@ -165,6 +167,12 @@ var runAsimTests = envutil.EnvOrDefaultBool("COCKROACH_RUN_ASIM_TESTS", false)
165167
// random number generator that creates the seed used to generate each
166168
// simulation sample. The default values are: duration=30m (30 minutes)
167169
// samples=1 seed=random.
170+
//
171+
// To run all tests and rewrite the testdata files as well as generate the
172+
// artifacts in `testdata/generated`, you can use:
173+
/*
174+
./dev test pkg/kv/kvserver/asim/tests --ignore-cache --rewrite -v -f TestDataDriven -- --test_env COCKROACH_RUN_ASIM_TESTS=true --test_env COCKROACH_ALWAYS_KEEP_TEST_LOGS=true
175+
*/
168176
func TestDataDriven(t *testing.T) {
169177
skip.UnderDuressWithIssue(t, 149875)
170178
leakTestAfter := leaktest.AfterTest(t)
@@ -531,7 +539,34 @@ func TestDataDriven(t *testing.T) {
531539
require.NotNil(t, set, "unknown mode value: %s", mv)
532540
set(&eventGen)
533541

542+
// TODO(tbg): need to decide whether multiple evals in a single file
543+
// is a feature or an anti-pattern. If it's a feature, we should let
544+
// the `name` part below be adjustable (but not the plotDir) via a
545+
// parameter to the `eval` command.
546+
testName := name + "_" + mv
547+
534548
for sample := 0; sample < samples; sample++ {
549+
recIdx := map[int64]int{}
550+
settingsGen.Settings.OnRecording = func(storeID int64, rec tracingpb.Recording) {
551+
if !rewrite || len(rec[0].Logs) == 0 {
552+
return
553+
}
554+
traceDir := filepath.Join(plotDir, "traces", fmt.Sprintf("s%d", storeID))
555+
if recIdx[storeID] == 0 {
556+
require.NoError(t, os.MkdirAll(traceDir, 0755))
557+
}
558+
re := regexp.MustCompile(`[^a-zA-Z0-9]+`)
559+
outName := fmt.Sprintf("%s_%s_s%d", mv, re.ReplaceAllString(rec[0].Operation, "_"), storeID)
560+
if sample > 0 {
561+
outName += fmt.Sprintf("_sample%d", sample+1)
562+
}
563+
outName += "_" + fmt.Sprintf("%03d.txt", recIdx[storeID])
564+
assert.NoError(t, os.WriteFile(
565+
filepath.Join(traceDir, outName),
566+
[]byte(rec.String()), 0644))
567+
recIdx[storeID] += 1
568+
}
569+
535570
assertionFailures := []string{}
536571
var tmpStrB *strings.Builder = nil
537572
if stateStrForOnce == "" {
@@ -561,17 +596,10 @@ func TestDataDriven(t *testing.T) {
561596
// Generate artifacts. Hash artifact input data to ensure they are
562597
// up to date.
563598
hasher := fnv.New64a()
564-
// TODO(tbg): need to decide whether multiple evals in a single file
565-
// is a feature or an anti-pattern. If it's a feature, we should let
566-
// the `name` part below be adjustable (but not the plotDir) via a
567-
// parameter to the `eval` command.
568-
testName := name + "_" + mv
599+
569600
for sample, h := range run.hs {
570-
generateAllPlots(t, &buf, h, testName, sample+1, plotDir, hasher, rewrite,
601+
printStatsAndGenerateJSON(t, &buf, h, testName, sample+1, plotDir, hasher, rewrite,
571602
settingsGen.Settings.TickInterval, metricsMap)
572-
generateTopology(t, h,
573-
filepath.Join(plotDir, fmt.Sprintf("%s_%d_topology.txt", testName, sample+1)),
574-
hasher, rewrite)
575603
}
576604
artifactsHash := hasher.Sum64()
577605

@@ -707,22 +735,6 @@ type modeHistory struct {
707735
hs []history.History
708736
}
709737

710-
func generateTopology(
711-
t *testing.T, h history.History, topFile string, hasher hash.Hash, rewrite bool,
712-
) {
713-
// TODO(tbg): this can in principle be printed without even
714-
// evaluating the test, and in particular it's independent of
715-
// settings. It seems like an artifact of the implementation
716-
// that we can only access the structured topology after the
717-
// simulation has run.
718-
top := h.S.Topology()
719-
s := top.String()
720-
_, _ = fmt.Fprint(hasher, s)
721-
if rewrite {
722-
require.NoError(t, os.WriteFile(topFile, []byte(s), 0644))
723-
}
724-
}
725-
726738
// writeStateStrToFile writes the state string to the given file.
727739
func writeStateStrToFile(t *testing.T, topFile string, stateStr string, rewrite bool) {
728740
if rewrite {

pkg/kv/kvserver/asim/tests/plotting_test.go

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,14 +21,13 @@ import (
2121
"github.com/stretchr/testify/require"
2222
)
2323

24-
// generateAllPlots creates plots for all available metrics from simulation history.
24+
// printStatsAndGenerateJSON creates plots for all available metrics from simulation history.
2525
// All plot files are hashed directly to the provided hasher.
2626
// If rewrite is false, plots are generated but not saved to disk.
2727
// Returns a slice of filenames for all generated plots.
2828
//
2929
// TODO(tbg): introduce a SimulationEnv and make this a method on it.
30-
// TODO(tbg): rename to generateArtifacts.
31-
func generateAllPlots(
30+
func printStatsAndGenerateJSON(
3231
t *testing.T,
3332
buf *strings.Builder,
3433
h history.History,

pkg/kv/kvserver/asim/tests/testdata/generated/.gitignore

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44
!*/
55

66
# Except this specific directory (relative to .gitignore)
7+
# Note that this intentionally does not include subdirectories (such as traces)
8+
# which are not deterministic.
79
!example_rebalancing/**
10+
**/traces
811
# And the readme file.
9-
!README.md
12+
!README.md

pkg/kv/kvserver/asim/tests/testdata/non_rand/decommission_conformance.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ assertion type=conformance under=0 over=0 unavailable=0 violating=0
2828

2929
eval duration=20m cfgs=(sma-count,mma-only)
3030
----
31-
artifacts[sma-count]: 19128202c9e746ab
31+
artifacts[sma-count]: 8e9d6ada04cb9561
3232
==========================
33-
artifacts[mma-only]: d80019be362b72f7
33+
artifacts[mma-only]: 239144d63ffded95
3434
==========================

0 commit comments

Comments
 (0)