Skip to content

Commit fe68152

Browse files
tbgwenyihu6
authored andcommitted
asim: further improve tracing
Declutter the main datadriven test by extracting out a tracing helper. Revamp the file format so that it includes the seconds elapsed since simulation start, which facilitates lining traces up with asimview. We now also trace lease and replica count rebalancing decisions, as well as any constraint repair. Epic: CRDB-49117
1 parent 66ad1d1 commit fe68152

File tree

5 files changed

+112
-36
lines changed

5 files changed

+112
-36
lines changed

pkg/kv/kvserver/asim/asim.go

Lines changed: 33 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ import (
2929
// in that cluster.
3030
type Simulator struct {
3131
log.AmbientContext
32-
onRecording func(storeID state.StoreID, rec tracingpb.Recording)
32+
onRecording func(storeID state.StoreID, atDuration time.Duration, rec tracingpb.Recording)
3333

3434
curr time.Time
3535
end time.Time
@@ -100,9 +100,9 @@ func NewSimulator(
100100

101101
s := &Simulator{
102102
AmbientContext: log.MakeTestingAmbientCtxWithNewTracer(),
103-
onRecording: func(storeID state.StoreID, rec tracingpb.Recording) {
103+
onRecording: func(storeID state.StoreID, atDuration time.Duration, rec tracingpb.Recording) {
104104
if fn := settings.OnRecording; fn != nil {
105-
fn(int64(storeID), rec)
105+
fn(int64(storeID), atDuration, rec)
106106
}
107107
},
108108
curr: settings.StartTime,
@@ -323,6 +323,25 @@ func (s *Simulator) tickStoreClocks(tick time.Time) {
323323
s.state.TickClock(tick)
324324
}
325325

326+
func (s *Simulator) doAndMaybeTrace(
327+
ctx context.Context,
328+
storeID state.StoreID,
329+
tick time.Time,
330+
op string,
331+
f func(ctx context.Context),
332+
) {
333+
atDuration := tick.Sub(s.settings.StartTime)
334+
335+
var finishAndGetRecording func() tracingpb.Recording
336+
if s.onRecording != nil {
337+
ctx, finishAndGetRecording = tracing.ContextWithRecordingSpan(ctx, s.Tracer, op)
338+
}
339+
f(ctx)
340+
if finishAndGetRecording != nil {
341+
s.onRecording(storeID, atDuration, finishAndGetRecording())
342+
}
343+
}
344+
326345
// tickQueues iterates over the next replicas for each store to
327346
// consider. It then enqueues each of these and ticks the replicate queue for
328347
// processing.
@@ -334,10 +353,16 @@ func (s *Simulator) tickQueues(ctx context.Context, tick time.Time, state state.
334353

335354
// Tick the split queue.
336355
s.sqs[storeID].Tick(ctx, tick, state)
356+
337357
// Tick the replicate queue.
338-
s.rqs[storeID].Tick(ctx, tick, state)
358+
s.doAndMaybeTrace(ctx, storeID, tick, "replicateQueue.PlanOneChange", func(ctx context.Context) {
359+
s.rqs[storeID].Tick(ctx, tick, state)
360+
})
361+
339362
// Tick the lease queue.
340-
s.lqs[storeID].Tick(ctx, tick, state)
363+
s.doAndMaybeTrace(ctx, storeID, tick, "leaseQueue.PlanOneChange", func(ctx context.Context) {
364+
s.lqs[storeID].Tick(ctx, tick, state)
365+
})
341366

342367
// Tick changes that may have been enqueued with a lower completion
343368
// than the current tick, from the queues.
@@ -389,14 +414,9 @@ func (s *Simulator) tickMMStoreRebalancers(ctx context.Context, tick time.Time,
389414
stores := s.state.Stores()
390415
s.shuffler(len(stores), func(i, j int) { stores[i], stores[j] = stores[j], stores[i] })
391416
for _, store := range stores {
392-
var finishAndGetRecording func() tracingpb.Recording
393-
if s.onRecording != nil {
394-
ctx, finishAndGetRecording = tracing.ContextWithRecordingSpan(ctx, s.Tracer, "mma.ComputeChanges")
395-
}
396-
s.mmSRs[store.StoreID()].Tick(ctx, tick, state)
397-
if finishAndGetRecording != nil {
398-
s.onRecording(store.StoreID(), finishAndGetRecording())
399-
}
417+
s.doAndMaybeTrace(ctx, store.StoreID(), tick, "mma.ComputeChanges", func(ctx context.Context) {
418+
s.mmSRs[store.StoreID()].Tick(ctx, tick, state)
419+
})
400420
}
401421
}
402422

pkg/kv/kvserver/asim/config/settings.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ type SimulationSettings struct {
119119
ST *cluster.Settings
120120
// OnRecording is called with trace spans obtained by recording the allocator.
121121
// NB: we can't use state.StoreID here since that causes an import cycle.
122-
OnRecording func(storeID int64, rec tracingpb.Recording)
122+
OnRecording func(storeID int64, atDuration time.Duration, rec tracingpb.Recording)
123123
}
124124

125125
// DefaultSimulationSettings returns a set of default settings for simulation.

pkg/kv/kvserver/asim/tests/BUILD.bazel

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ go_library(
88
"rand_framework.go",
99
"rand_gen.go",
1010
"rand_util.go",
11+
"tracer_helper.go",
1112
],
1213
importpath = "github.com/cockroachdb/cockroach/pkg/kv/kvserver/asim/tests",
1314
visibility = ["//visibility:public"],
@@ -25,6 +26,9 @@ go_library(
2526
"//pkg/sql/catalog/catpb",
2627
"//pkg/sql/catalog/descpb",
2728
"//pkg/sql/catalog/multiregion",
29+
"//pkg/util/tracing/tracingpb",
30+
"@com_github_stretchr_testify//assert",
31+
"@com_github_stretchr_testify//require",
2832
],
2933
)
3034

@@ -62,7 +66,6 @@ go_test(
6266
"//pkg/util/tracing/tracingpb",
6367
"@com_github_cockroachdb_datadriven//:datadriven",
6468
"@com_github_cockroachdb_logtags//:logtags",
65-
"@com_github_stretchr_testify//assert",
6669
"@com_github_stretchr_testify//require",
6770
],
6871
)

pkg/kv/kvserver/asim/tests/datadriven_simulation_test.go

Lines changed: 3 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ import (
1212
"math/rand"
1313
"os"
1414
"path/filepath"
15-
"regexp"
1615
"strings"
1716
"testing"
1817
"time"
@@ -38,7 +37,6 @@ import (
3837
"github.com/cockroachdb/cockroach/pkg/util/tracing/tracingpb"
3938
"github.com/cockroachdb/datadriven"
4039
"github.com/cockroachdb/logtags"
41-
"github.com/stretchr/testify/assert"
4240
"github.com/stretchr/testify/require"
4341
)
4442

@@ -556,25 +554,9 @@ func TestDataDriven(t *testing.T) {
556554

557555
seedGen := rand.New(rand.NewSource(seed))
558556
for sample := 0; sample < samples; sample++ {
559-
recIdx := map[int64]int{}
560-
settingsGen.Settings.OnRecording = func(storeID int64, rec tracingpb.Recording) {
561-
if !rewrite || len(rec[0].Logs) == 0 {
562-
return
563-
}
564-
traceDir := filepath.Join(plotDir, "traces", fmt.Sprintf("s%d", storeID))
565-
if recIdx[storeID] == 0 {
566-
require.NoError(t, os.MkdirAll(traceDir, 0755))
567-
}
568-
re := regexp.MustCompile(`[^a-zA-Z0-9]+`)
569-
outName := fmt.Sprintf("%s_%s_s%d", mv, re.ReplaceAllString(rec[0].Operation, "_"), storeID)
570-
if sample > 0 {
571-
outName += fmt.Sprintf("_sample%d", sample+1)
572-
}
573-
outName += "_" + fmt.Sprintf("%03d.txt", recIdx[storeID])
574-
assert.NoError(t, os.WriteFile(
575-
filepath.Join(traceDir, outName),
576-
[]byte(rec.String()), 0644))
577-
recIdx[storeID] += 1
557+
tr := makeTraceHelper(rewrite, plotDir, testName, sample+1, duration)
558+
settingsGen.Settings.OnRecording = func(storeID int64, atDuration time.Duration, rec tracingpb.Recording) {
559+
tr.OnRecording(t, storeID, atDuration, rec)
578560
}
579561

580562
assertionFailures := []string{}
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
// Copyright 2025 The Cockroach Authors.
2+
//
3+
// Use of this software is governed by the CockroachDB Software License
4+
// included in the /LICENSE file.
5+
6+
package tests
7+
8+
import (
9+
"fmt"
10+
"os"
11+
"path/filepath"
12+
"regexp"
13+
"strconv"
14+
"testing"
15+
"time"
16+
17+
"github.com/cockroachdb/cockroach/pkg/util/tracing/tracingpb"
18+
"github.com/stretchr/testify/assert"
19+
"github.com/stretchr/testify/require"
20+
)
21+
22+
type traceHelper struct {
23+
enabled bool
24+
testName string
25+
plotDir string
26+
sample int
27+
28+
traceDirCreated map[int64]struct{} // by storeID
29+
atDurationFormat string
30+
}
31+
32+
func (tr traceHelper) OnRecording(
33+
t *testing.T, storeID int64, atDuration time.Duration, rec tracingpb.Recording,
34+
) {
35+
if !tr.enabled || len(rec[0].Logs) == 0 {
36+
return
37+
}
38+
39+
traceDir := filepath.Join(tr.plotDir, "traces", fmt.Sprintf("s%d", storeID))
40+
if _, ok := tr.traceDirCreated[storeID]; !ok {
41+
tr.traceDirCreated[storeID] = struct{}{}
42+
require.NoError(t, os.MkdirAll(traceDir, 0755))
43+
}
44+
re := regexp.MustCompile(`[^a-zA-Z0-9.]+`)
45+
46+
var sampleS string
47+
if tr.sample > 0 {
48+
sampleS = strconv.Itoa(tr.sample)
49+
}
50+
outName := fmt.Sprintf("%s%s_%s_%s_s%d", tr.testName, sampleS, fmt.Sprintf(tr.atDurationFormat, atDuration.Seconds()),
51+
re.ReplaceAllString(rec[0].Operation, "_"), storeID)
52+
assert.NoError(t, os.WriteFile(
53+
filepath.Join(traceDir, outName),
54+
[]byte(rec.String()), 0644))
55+
}
56+
57+
func makeTraceHelper(
58+
enabled bool, plotDir string, testName string, sample int, duration time.Duration,
59+
) traceHelper {
60+
secondsSinceBeginningWidth := len(fmt.Sprintf("%d", int64(duration.Seconds()+1)))
61+
return traceHelper{
62+
enabled: enabled,
63+
testName: testName,
64+
plotDir: plotDir,
65+
sample: sample,
66+
traceDirCreated: map[int64]struct{}{},
67+
// Print seconds with two decimal places. The decimal point and two decimal
68+
// places need three more characters.
69+
atDurationFormat: fmt.Sprintf("%%0%d.2fs", secondsSinceBeginningWidth+3),
70+
}
71+
}

0 commit comments

Comments
 (0)