Skip to content

Commit dd5bc70

Browse files
craig[bot]tbgdt
committed
157132: tracing: add minimal format suitable for datadriven tests r=wenyihu6 a=tbg I'd like to make the trace events emitted from deterministic multi-metric allocator tests part of the expected test outputs. I realized there wasn't a standardized deterministic and barebones output format, so I added one. In the test I have in mind I'll probably condense the file:line prefix further, but it seemed unsavory to slap regexp transformations into the tracing packages, and unfortunately that is required to solve the problem if we want to avoid changing the `log` package. Related to #157133. Epic: none 157133: mmaprototype: add tracing to testing for rebalanceStores r=tbg a=tbg Stacked on #157132. Don't review tracing pkg code here. ---- This PR adds the basic capability to exercise rebalanceStores through TestClusterState. It does this by moving rebalanceStores from allocatorState to clusterState, which is a small and mostly mechanical refactor. The immediate motivation behind this change is being able to exercise the functionality for #156776 while it is being added, while being able to see the internal tracepoints as well. Informs #156776. Epic: CRDB-55052 157150: cloud/azure: add optional azure client caching r=dt a=dt This adds a signle-slot, process-wide in-memory cache of the most recently used azure SDK storage client allowing subsequent attempts to open clients with the same configuration to reuse the already open client. This can allow the SDK's internal caching of things like IMDS tokens to be used across separate CRDB operations on separate files which currently each open their own client. This can be particularly impactful for avoiding hitting the IMDS rate limit. Release note: none. Epic: none. Co-authored-by: Tobias Grieger <[email protected]> Co-authored-by: David Taylor <[email protected]>
4 parents e9e92b1 + 7cc5bc9 + 754bacd + f2c51bf commit dd5bc70

File tree

19 files changed

+489
-49
lines changed

19 files changed

+489
-49
lines changed

pkg/BUILD.bazel

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -835,6 +835,7 @@ ALL_TESTS = [
835835
"//pkg/util/tracing/goexectrace:goexectrace_test",
836836
"//pkg/util/tracing/grpcinterceptor:grpcinterceptor_test",
837837
"//pkg/util/tracing/service:service_test",
838+
"//pkg/util/tracing/tracingpb:tracingpb_test",
838839
"//pkg/util/tracing:tracing_test",
839840
"//pkg/util/treeprinter:treeprinter_test",
840841
"//pkg/util/trigram:trigram_test",
@@ -2842,6 +2843,7 @@ GO_TARGETS = [
28422843
"//pkg/util/tracing/service:service",
28432844
"//pkg/util/tracing/service:service_test",
28442845
"//pkg/util/tracing/tracingpb:tracingpb",
2846+
"//pkg/util/tracing/tracingpb:tracingpb_test",
28452847
"//pkg/util/tracing/tracingservicepb:tracingservicepb",
28462848
"//pkg/util/tracing/tracingui:tracingui",
28472849
"//pkg/util/tracing/tracingutil:tracingutil",

pkg/cloud/azure/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ go_library(
2424
"//pkg/util/envutil",
2525
"//pkg/util/ioctx",
2626
"//pkg/util/log",
27+
"//pkg/util/syncutil",
2728
"//pkg/util/timeutil",
2829
"//pkg/util/tracing",
2930
"@com_github_azure_azure_sdk_for_go_sdk_azcore//:azcore",

pkg/cloud/azure/azure_storage.go

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ import (
2929
"github.com/cockroachdb/cockroach/pkg/settings"
3030
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
3131
"github.com/cockroachdb/cockroach/pkg/util/ioctx"
32+
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
3233
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
3334
"github.com/cockroachdb/cockroach/pkg/util/tracing"
3435
"github.com/cockroachdb/errors"
@@ -55,6 +56,13 @@ var tryTimeout = settings.RegisterDurationSetting(
5556
"the timeout for individual retry attempts in Azure operations",
5657
60*time.Second)
5758

59+
var reuseSession = settings.RegisterBoolSetting(
60+
settings.ApplicationLevel,
61+
"cloudstorage.azure.session_reuse.enabled",
62+
"persist the last opened azure client and re-use it when opening a new client with the same argument (some settings may take 2mins to take effect)",
63+
false,
64+
)
65+
5866
// A note on Azure authentication:
5967
//
6068
// The standardized way to authenticate a third-party identity to the Azure
@@ -214,6 +222,14 @@ type azureStorage struct {
214222
settings *cluster.Settings
215223
}
216224

225+
var azClientCache struct {
226+
syncutil.Mutex
227+
// TODO(dt): make this an >1 item cache e.g. add a FIFO ring.
228+
key cloudpb.ExternalStorage_Azure
229+
set time.Time
230+
client *service.Client
231+
}
232+
217233
var _ cloud.ExternalStorage = &azureStorage{}
218234

219235
func makeAzureStorage(
@@ -224,6 +240,7 @@ func makeAzureStorage(
224240
if conf == nil {
225241
return nil, errors.Errorf("azure upload requested but info missing")
226242
}
243+
227244
env, err := azure.EnvironmentFromName(conf.Environment)
228245
if err != nil {
229246
return nil, errors.Wrap(err, "azure environment")
@@ -256,6 +273,29 @@ func makeAzureStorage(
256273
opts.Retry.TryTimeout = tryTimeout.Get(&args.Settings.SV)
257274

258275
var azClient *service.Client
276+
277+
clientConf := *conf
278+
clientConf.Prefix = "" // Prefix is not part of the client identity.
279+
280+
if reuseSession.Get(&args.Settings.SV) {
281+
func() {
282+
azClientCache.Lock()
283+
defer azClientCache.Unlock()
284+
if cached := azClientCache.client; cached != nil && azClientCache.key == clientConf && timeutil.Since(azClientCache.set) < 2*time.Minute {
285+
azClient = cached
286+
}
287+
}()
288+
if azClient != nil {
289+
return &azureStorage{
290+
conf: conf,
291+
ioConf: args.IOConf,
292+
container: azClient.NewContainerClient(conf.Container),
293+
prefix: conf.Prefix,
294+
settings: args.Settings,
295+
}, nil
296+
}
297+
}
298+
259299
switch conf.Auth {
260300
case cloudpb.AzureAuth_LEGACY:
261301
credential, err := azblob.NewSharedKeyCredential(conf.AccountName, conf.AccountKey)
@@ -299,6 +339,14 @@ func makeAzureStorage(
299339
return nil, errors.Errorf("unsupported value %s for %s", conf.Auth, cloud.AuthParam)
300340
}
301341

342+
if reuseSession.Get(&args.Settings.SV) {
343+
azClientCache.Lock()
344+
defer azClientCache.Unlock()
345+
azClientCache.key = clientConf
346+
azClientCache.client = azClient
347+
azClientCache.set = timeutil.Now()
348+
}
349+
302350
return &azureStorage{
303351
conf: conf,
304352
ioConf: args.IOConf,

pkg/kv/kvserver/allocator/mmaprototype/BUILD.bazel

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,8 +53,10 @@ go_test(
5353
"//pkg/testutils/dd",
5454
"//pkg/util/randutil",
5555
"//pkg/util/timeutil",
56+
"//pkg/util/tracing",
5657
"@com_github_cockroachdb_datadriven//:datadriven",
5758
"@com_github_cockroachdb_errors//:errors",
59+
"@com_github_cockroachdb_redact//:redact",
5860
"@com_github_stretchr_testify//require",
5961
],
6062
)

pkg/kv/kvserver/allocator/mmaprototype/cluster_state_test.go

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,9 @@ import (
2121
"github.com/cockroachdb/cockroach/pkg/testutils/datapathutils"
2222
"github.com/cockroachdb/cockroach/pkg/testutils/dd"
2323
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
24+
"github.com/cockroachdb/cockroach/pkg/util/tracing"
2425
"github.com/cockroachdb/datadriven"
26+
"github.com/cockroachdb/redact"
2527
"github.com/stretchr/testify/require"
2628
)
2729

@@ -519,8 +521,15 @@ func TestClusterState(t *testing.T) {
519521
storeID := dd.ScanArg[roachpb.StoreID](t, d, "store-id")
520522
rng := rand.New(rand.NewSource(0))
521523
dsm := newDiversityScoringMemo()
522-
cs.rebalanceStores(context.Background(), storeID, rng, dsm)
523-
return printPendingChangesTest(testingGetPendingChanges(t, cs))
524+
tr := tracing.NewTracer()
525+
tr.SetRedactable(true)
526+
defer tr.Close()
527+
ctx, finishAndGet := tracing.ContextWithRecordingSpan(context.Background(), tr, "rebalance-stores")
528+
cs.rebalanceStores(ctx, storeID, rng, dsm)
529+
rec := finishAndGet()
530+
var sb redact.StringBuilder
531+
rec.SafeFormatMinimal(&sb)
532+
return sb.String() + printPendingChangesTest(testingGetPendingChanges(t, cs))
524533

525534
case "tick":
526535
seconds := dd.ScanArg[int](t, d, "seconds")

pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_basic

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,10 +55,15 @@ store-id=2 node-id=2 status=ok accepting all reported=[cpu:10, write-bandwidth:0
5555
store-id=3 node-id=3 status=ok accepting all reported=[cpu:10, write-bandwidth:0, byte-size:0] adjusted=[cpu:10, write-bandwidth:0, byte-size:0] node-reported-cpu=10 node-adjusted-cpu=10 seq=1
5656
top-k-ranges (local-store-id=1) dim=WriteBandwidth: r1
5757

58-
# TODO(tbg): maybe it'll be good to at least optionally be able to have the
59-
# trace printed in the output here.
58+
# s1 is overloaded and local, so rebalanceStores should try and succeed to shed
59+
# a lease from it.
6060
rebalance-stores store-id=1
6161
----
62+
[mmaid=1] cluster means: (stores-load [cpu:33, write-bandwidth:0, byte-size:0]) (stores-capacity [cpu:100, write-bandwidth:100, byte-size:100]) (nodes-cpu-load 33) (nodes-cpu-capacity 100)
63+
[mmaid=1] overload-continued s1 ((store=overloadUrgent worst=CPURate cpu=overloadUrgent writes=loadNormal bytes=loadNormal node=overloadUrgent high_disk=false frac_pending=0.00,0.00(true))) - within grace period
64+
[mmaid=1] start processing shedding store s1: cpu node load overloadUrgent, store load overloadUrgent, worst dim CPURate
65+
[mmaid=1] top-K[CPURate] ranges for s1 with lease on local s1: r1:[cpu:60, write-bandwidth:0, byte-size:0]
66+
[mmaid=1] result(success): shedding r1 lease from s1 to s2 [change:r1=[transfer_to=2 cids=1,2]] with resulting loads source:[cpu:40, write-bandwidth:0, byte-size:0] target:[cpu:54, write-bandwidth:0, byte-size:0] (means: [cpu:33, write-bandwidth:0, byte-size:0]) (frac_pending: (src:0.00,target:0.50) (src:4.40,target:0.00))
6267
pending(2)
6368
change-id=1 store-id=1 node-id=1 range-id=1 load-delta=[cpu:-40, write-bandwidth:0, byte-size:0] start=0s
6469
prev=(replica-id=1 type=VOTER_FULL leaseholder=true)

pkg/util/tracing/tracingpb/BUILD.bazel

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
load("@rules_proto//proto:defs.bzl", "proto_library")
22
load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library")
3-
load("@io_bazel_rules_go//go:def.bzl", "go_library")
3+
load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
44

55
go_library(
66
name = "tracingpb",
@@ -51,3 +51,20 @@ go_proto_library(
5151
"@com_github_gogo_protobuf//gogoproto",
5252
],
5353
)
54+
55+
go_test(
56+
name = "tracingpb_test",
57+
srcs = [
58+
"condense_test.go",
59+
"recording_test.go",
60+
],
61+
data = glob(["testdata/**"]),
62+
embed = [":tracingpb"],
63+
deps = [
64+
"//pkg/testutils/datapathutils",
65+
"//pkg/testutils/echotest",
66+
"//pkg/util/log",
67+
"//pkg/util/tracing",
68+
"@com_github_cockroachdb_redact//:redact",
69+
],
70+
)
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
// Copyright 2025 The Cockroach Authors.
2+
//
3+
// Use of this software is governed by the CockroachDB Software License
4+
// included in the /LICENSE file.
5+
6+
package tracingpb
7+
8+
import (
9+
"testing"
10+
11+
"github.com/cockroachdb/redact"
12+
)
13+
14+
func TestCondensePathLinePrefix(t *testing.T) {
15+
for _, test := range []struct {
16+
name string
17+
input string
18+
expected string
19+
}{
20+
{
21+
name: "no_directory",
22+
input: "foo.go:1234 message",
23+
expected: "message",
24+
},
25+
{
26+
name: "full_path",
27+
input: "pkg/foo/bar.go:123 message",
28+
expected: "message",
29+
},
30+
{
31+
// Won't redact because there are redaction markers in the path.
32+
// This doesn't happen in the real world but we want to make sure
33+
// condensePathLinePrefix doesn't mess with redaction at all.
34+
name: "redaction_in_path",
35+
input: "pkg/‹foo›/bar.go:123 message",
36+
expected: "pkg/‹foo›/bar.go:123 message",
37+
},
38+
{
39+
// Redaction in message is expected and okay, since
40+
// condensePathLinePrefix is written in a way to
41+
// preserve that.
42+
name: "redaction_in_message",
43+
input: "pkg/foo/bar.go:123 ‹sensitive› message",
44+
expected: "‹sensitive› message",
45+
},
46+
{
47+
// Not expecting it in the real world since we only call
48+
// condensePathLinePrefix in formatMinimal, but if we get formatFull, it
49+
// also works.
50+
name: "full_format",
51+
input: "event: pkg/foo/bar.go:1234 ‹sensitive› message",
52+
expected: "‹sensitive› message",
53+
},
54+
{
55+
name: "unredacted_trace",
56+
input: "‹pkg/foo/bar.go:1234 some message›",
57+
expected: "‹some message›",
58+
},
59+
{
60+
// Should never happen (there would be escaping). But either way, works
61+
// just the same.
62+
name: "unredacted_trace_nested_unescaped",
63+
input: "‹pkg/foo/bar.go:1234 some ‹message››",
64+
expected: "‹some ‹message››",
65+
},
66+
{
67+
name: "unredacted_trace_nested_escaped",
68+
input: "‹pkg/foo/bar.go:1234 " + string(redact.EscapeMarkers([]byte("some ‹message›"))) + "›",
69+
expected: "‹some ?message?›", // NB: this is verbatim the escaped string inserted above
70+
},
71+
} {
72+
t.Run(test.name, func(t *testing.T) {
73+
got := string(condensePathLinePrefix(redact.RedactableString(test.input)))
74+
if got != test.expected {
75+
t.Errorf("expected %q, got %q", test.expected, got)
76+
}
77+
})
78+
}
79+
}

0 commit comments

Comments
 (0)