Skip to content

Commit 7b66342

Browse files
craig[bot]dhartunian
andcommitted
Merge #149373
149373: tracing: add background execution tracer r=dhartunian a=dhartunian This commit adds a background go execution trace capture mechanism to CRDB. The execution traces are captured with a duration and interval set by cluster settings. When the `obs.execution_tracer.interval` cluster setting is set to a positive value, the execution tracer will be enabled on every node and begin capturing traces with duration set by the `obs.execution_tracer.duration` cluster setting (10s by default). The traces will be captured in the log directory much like cpu and heap profiles in a direcory named `executiontrace_dump`. The trace capture will automatically garbage collect the oldest traces if the total file size consumed by the execution traces exceeds the value of the `obs.execution_tracer.total_dump_size_limit` cluster setting (4 GiB by default). It is encouraged to leave the dump size limit high, and the trace capture duration low in order to allow for files that are more easily consumable in external tools. One gigantic file will be much more challenging to parse and navigate. Please note that enabling execution trace capture will incur a performance penalty. This is currently estimated at 1-2% (https://go.dev/blog/execution-traces-2024#low-overhead-tracing). No specific measurements have been made with CRDB at the time of this commit. Based on work in #146873 Part of: #97215 Release note (observability change, ops change): A new feature that captures go execution traces on an automatic schedule is now available. This feature will incur a performance penalty and customers will typically use this under the guidance of Cockroach Labs support. There are 3 cluster settings which manipulate this feature: - `obs.execution_tracer.interval`: set to >0 to enable the tracer and capture traces at the given interval. - `obs.execution_tracer.duration`: sets the time over which each trace captures data. - `obs.execution_tracer.total_dump_size_limit` : sets the total amount of disk space to allow execution traces to take up before the oldest files are garbage collected automatically. Co-authored-by: David Hartunian <[email protected]>
2 parents 1206bb5 + 57f7a68 commit 7b66342

17 files changed

+391
-30
lines changed

pkg/BUILD.bazel

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -803,6 +803,7 @@ ALL_TESTS = [
803803
"//pkg/util/timeutil:timeutil_test",
804804
"//pkg/util/tochar:tochar_test",
805805
"//pkg/util/tracing/collector:collector_test",
806+
"//pkg/util/tracing/goexectrace:goexectrace_test",
806807
"//pkg/util/tracing/grpcinterceptor:grpcinterceptor_test",
807808
"//pkg/util/tracing/service:service_test",
808809
"//pkg/util/tracing:tracing_test",
@@ -2747,6 +2748,8 @@ GO_TARGETS = [
27472748
"//pkg/util/tochar:tochar_test",
27482749
"//pkg/util/tracing/collector:collector",
27492750
"//pkg/util/tracing/collector:collector_test",
2751+
"//pkg/util/tracing/goexectrace:goexectrace",
2752+
"//pkg/util/tracing/goexectrace:goexectrace_test",
27502753
"//pkg/util/tracing/grpcinterceptor:grpcinterceptor",
27512754
"//pkg/util/tracing/grpcinterceptor:grpcinterceptor_test",
27522755
"//pkg/util/tracing/service:service",

pkg/base/constants.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,10 @@ const (
4141
// stores profiles when the periodic CPU profile dump is enabled.
4242
CPUProfileDir = "pprof_dump"
4343

44+
// ExecutionTraceDir is the directory name that holds Go execution traces
45+
// when the execution trace dumper is enabled.
46+
ExecutionTraceDir = "executiontrace_dump"
47+
4448
// InflightTraceDir is the directory name where the job trace dumper stores traces
4549
// when a job opts in to dumping its execution traces.
4650
InflightTraceDir = "inflight_trace_dump"

pkg/cli/interactive_tests/test_log_flags.tcl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ send "$argv start-single-node --insecure --store=path=logs/mystore2 --log-dir=\r
2727
eexpect "node starting"
2828
interrupt
2929
eexpect ":/# "
30-
send "ls logs/mystore2/logs 2>/dev/null | grep -vE 'heap_profiler|goroutine_dump|inflight_trace_dump|pprof_dump' | wc -l\r"
30+
send "ls logs/mystore2/logs 2>/dev/null | grep -vE 'executiontrace_dump|heap_profiler|goroutine_dump|inflight_trace_dump|pprof_dump' | wc -l\r"
3131
eexpect "0"
3232
eexpect ":/# "
3333
end_test

pkg/cli/log_flags.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,7 @@ func setupLogging(ctx context.Context, cmd *cobra.Command, isServerCmd, applyCon
240240
serverCfg.GoroutineDumpDirName = filepath.Join(outputDirectory, base.GoroutineDumpDir)
241241
serverCfg.HeapProfileDirName = filepath.Join(outputDirectory, base.HeapProfileDir)
242242
serverCfg.CPUProfileDirName = filepath.Join(outputDirectory, base.CPUProfileDir)
243+
serverCfg.ExecutionTraceDirName = filepath.Join(outputDirectory, base.ExecutionTraceDir)
243244
serverCfg.InflightTraceDirName = filepath.Join(outputDirectory, base.InflightTraceDir)
244245

245246
return nil

pkg/server/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -342,6 +342,7 @@ go_library(
342342
"//pkg/util/timeutil/ptp",
343343
"//pkg/util/tracing",
344344
"//pkg/util/tracing/collector",
345+
"//pkg/util/tracing/goexectrace",
345346
"//pkg/util/tracing/service",
346347
"//pkg/util/tracing/tracingpb",
347348
"//pkg/util/tracing/tracingservicepb",

pkg/server/config.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,9 @@ type BaseConfig struct {
183183
// Only used if DisableRuntimeStatsMonitor is false.
184184
CPUProfileDirName string
185185

186+
// ExecutionTraceDirName is the directory name for Go execution traces.
187+
ExecutionTraceDirName string
188+
186189
// InflightTraceDirName is the directory name for job traces.
187190
InflightTraceDirName string
188191

pkg/server/env_sampler.go

Lines changed: 32 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"github.com/cockroachdb/cockroach/pkg/util/mon"
2222
"github.com/cockroachdb/cockroach/pkg/util/stop"
2323
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
24+
"github.com/cockroachdb/cockroach/pkg/util/tracing/goexectrace"
2425
"github.com/cockroachdb/errors"
2526
)
2627

@@ -40,16 +41,17 @@ var jemallocPurgePeriod = settings.RegisterDurationSettingWithExplicitUnit(
4041
)
4142

4243
type sampleEnvironmentCfg struct {
43-
st *cluster.Settings
44-
stopper *stop.Stopper
45-
minSampleInterval time.Duration
46-
goroutineDumpDirName string
47-
heapProfileDirName string
48-
cpuProfileDirName string
49-
runtime *status.RuntimeStatSampler
50-
sessionRegistry *sql.SessionRegistry
51-
rootMemMonitor *mon.BytesMonitor
52-
cgoMemTarget uint64
44+
st *cluster.Settings
45+
stopper *stop.Stopper
46+
minSampleInterval time.Duration
47+
goroutineDumpDirName string
48+
heapProfileDirName string
49+
cpuProfileDirName string
50+
executionTraceDirName string
51+
runtime *status.RuntimeStatSampler
52+
sessionRegistry *sql.SessionRegistry
53+
rootMemMonitor *mon.BytesMonitor
54+
cgoMemTarget uint64
5355
}
5456

5557
// startSampleEnvironment starts a periodic loop that samples the environment and,
@@ -70,16 +72,17 @@ func startSampleEnvironment(
7072
metricsSampleInterval = p.EnvironmentSampleInterval
7173
}
7274
cfg := sampleEnvironmentCfg{
73-
st: srvCfg.Settings,
74-
stopper: stopper,
75-
minSampleInterval: metricsSampleInterval,
76-
goroutineDumpDirName: srvCfg.GoroutineDumpDirName,
77-
heapProfileDirName: srvCfg.HeapProfileDirName,
78-
cpuProfileDirName: srvCfg.CPUProfileDirName,
79-
runtime: runtimeSampler,
80-
sessionRegistry: sessionRegistry,
81-
rootMemMonitor: rootMemMonitor,
82-
cgoMemTarget: max(uint64(pebbleCacheSize), 128*1024*1024),
75+
st: srvCfg.Settings,
76+
stopper: stopper,
77+
minSampleInterval: metricsSampleInterval,
78+
goroutineDumpDirName: srvCfg.GoroutineDumpDirName,
79+
heapProfileDirName: srvCfg.HeapProfileDirName,
80+
cpuProfileDirName: srvCfg.CPUProfileDirName,
81+
executionTraceDirName: srvCfg.ExecutionTraceDirName,
82+
runtime: runtimeSampler,
83+
sessionRegistry: sessionRegistry,
84+
rootMemMonitor: rootMemMonitor,
85+
cgoMemTarget: max(uint64(pebbleCacheSize), 128*1024*1024),
8386
}
8487
// Immediately record summaries once on server startup.
8588

@@ -155,6 +158,15 @@ func startSampleEnvironment(
155158
}
156159
}
157160

161+
simpleFlightRecorder, err := goexectrace.NewFlightRecorder(cfg.st, 10*time.Second, cfg.executionTraceDirName)
162+
if err != nil {
163+
log.Warningf(ctx, "failed to initialize flight recorder: %v", err)
164+
}
165+
err = simpleFlightRecorder.Start(ctx, cfg.stopper)
166+
if err != nil {
167+
log.Warningf(ctx, "failed to start flight recorder: %v", err)
168+
}
169+
158170
return cfg.stopper.RunAsyncTaskEx(ctx,
159171
stop.TaskOpts{TaskName: "mem-logger", SpanOpt: stop.SterileRootSpan},
160172
func(ctx context.Context) {

pkg/server/get_local_files_test.go

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -41,21 +41,28 @@ func TestGetLocalFiles(t *testing.T) {
4141
if err := os.MkdirAll(testCPUDir, os.ModePerm); err != nil {
4242
t.Fatal(err)
4343
}
44+
testExecutionTraceDir := filepath.Join(tempDir, "execution-trace")
45+
if err := os.MkdirAll(testExecutionTraceDir, os.ModePerm); err != nil {
46+
t.Fatal(err)
47+
}
4448

4549
t.Run("read files successfully", func(t *testing.T) {
46-
for _, test := range []string{"heap", "cpu"} {
50+
for _, test := range []string{"heap", "cpu", "execution-trace"} {
4751
t.Run(test, func(t *testing.T) {
4852
testDir, typ := testHeapDir, serverpb.FileType_HEAP
4953
if test == "cpu" {
5054
testDir, typ = testCPUDir, serverpb.FileType_CPU
5155
}
56+
if test == "execution-trace" {
57+
testDir, typ = testExecutionTraceDir, serverpb.FileType_EXECUTIONTRACE
58+
}
5259
fileName := fmt.Sprintf("im-in-the-%s-dir", test)
5360
testFile := filepath.Join(testDir, fileName)
5461
if err := os.WriteFile(testFile, []byte(fmt.Sprintf("%s stuff", test)), 0644); err != nil {
5562
t.Fatal(err)
5663
}
5764
req := &serverpb.GetFilesRequest{NodeId: "local", Type: typ, Patterns: []string{"*"}}
58-
res, err := getLocalFiles(req, testHeapDir, "", testCPUDir, os.Stat, os.ReadFile)
65+
res, err := getLocalFiles(req, testHeapDir, "", testCPUDir, testExecutionTraceDir, os.Stat, os.ReadFile)
5966
require.NoError(t, err)
6067
require.Equal(t, 1, len(res.Files))
6168
require.Equal(t, fileName, res.Files[0].Name)
@@ -74,7 +81,7 @@ func TestGetLocalFiles(t *testing.T) {
7481
}
7582
req := &serverpb.GetFilesRequest{
7683
NodeId: "local", Type: serverpb.FileType_HEAP, Patterns: []string{"*"}}
77-
_, err := getLocalFiles(req, testHeapDir, "", "", statFileWithErr, os.ReadFile)
84+
_, err := getLocalFiles(req, testHeapDir, "", "", "", statFileWithErr, os.ReadFile)
7885
require.ErrorContains(t, err, "stat error")
7986
})
8087

@@ -89,24 +96,29 @@ func TestGetLocalFiles(t *testing.T) {
8996
}
9097
req := &serverpb.GetFilesRequest{
9198
NodeId: "local", Type: serverpb.FileType_HEAP, Patterns: []string{"*"}}
92-
_, err := getLocalFiles(req, testHeapDir, "", "", os.Stat, readFileWithErr)
99+
_, err := getLocalFiles(req, testHeapDir, "", "", "", os.Stat, readFileWithErr)
93100
require.ErrorContains(t, err, "read error")
94101
})
95102

96103
t.Run("dirs not implemented", func(t *testing.T) {
97104
req := &serverpb.GetFilesRequest{
98105
NodeId: "local", Type: serverpb.FileType_HEAP, Patterns: []string{"*"}}
99-
_, err := getLocalFiles(req, "", "nonexistent", "nonexistent", os.Stat, os.ReadFile)
106+
_, err := getLocalFiles(req, "", "nonexistent", "nonexistent", "", os.Stat, os.ReadFile)
100107
require.ErrorContains(t, err, "dump directory not configured")
101108

102109
req = &serverpb.GetFilesRequest{
103110
NodeId: "local", Type: serverpb.FileType_GOROUTINES, Patterns: []string{"*"}}
104-
_, err = getLocalFiles(req, "nonexistent", "", "nonexistent", os.Stat, os.ReadFile)
111+
_, err = getLocalFiles(req, "nonexistent", "", "nonexistent", "", os.Stat, os.ReadFile)
105112
require.ErrorContains(t, err, "dump directory not configured")
106113

107114
req = &serverpb.GetFilesRequest{
108115
NodeId: "local", Type: serverpb.FileType_CPU, Patterns: []string{"*"}}
109-
_, err = getLocalFiles(req, "nonexistent", "nonexistent", "", os.Stat, os.ReadFile)
116+
_, err = getLocalFiles(req, "nonexistent", "nonexistent", "", "", os.Stat, os.ReadFile)
117+
require.ErrorContains(t, err, "dump directory not configured")
118+
119+
req = &serverpb.GetFilesRequest{
120+
NodeId: "local", Type: serverpb.FileType_EXECUTIONTRACE, Patterns: []string{"*"}}
121+
_, err = getLocalFiles(req, "nonexistent", "nonexistent", "nonexistent", "", os.Stat, os.ReadFile)
110122
require.ErrorContains(t, err, "dump directory not configured")
111123
})
112124
}

pkg/server/server_controller_new_server.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,7 @@ func makeSharedProcessTenantServerConfig(
239239
baseCfg.DefaultZoneConfig = kvServerCfg.BaseConfig.DefaultZoneConfig
240240
baseCfg.HeapProfileDirName = kvServerCfg.BaseConfig.HeapProfileDirName
241241
baseCfg.CPUProfileDirName = kvServerCfg.BaseConfig.CPUProfileDirName
242+
baseCfg.ExecutionTraceDirName = kvServerCfg.BaseConfig.ExecutionTraceDirName
242243
baseCfg.GoroutineDumpDirName = kvServerCfg.BaseConfig.GoroutineDumpDirName
243244

244245
// The ListenerFactory allows us to dynamically choose a
@@ -294,6 +295,7 @@ func makeSharedProcessTenantServerConfig(
294295
baseCfg.GoroutineDumpDirName = ""
295296
baseCfg.HeapProfileDirName = ""
296297
baseCfg.CPUProfileDirName = ""
298+
baseCfg.ExecutionTraceDirName = ""
297299

298300
// Expose the process-wide runtime metrics to the tenant's metric
299301
// collector. Since they are process-wide, all tenants can see them.

pkg/server/serverpb/status.proto

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -794,6 +794,7 @@ enum FileType {
794794
HEAP = 0;
795795
GOROUTINES = 1;
796796
CPU = 2;
797+
EXECUTIONTRACE = 3;
797798
}
798799

799800
message File {

0 commit comments

Comments
 (0)