
Commit 0fa914d

craig[bot] and herkolategan committed
Merge #149958 (2 parents: 9be69f8 + 897f33d)

149958: drtprod: add 300-node tpcc bench test r=shailendra-patel a=herkolategan

Previously, we used client-side partitions for the 300-node scale test. This change adds support for server-side partitions in the TPCC run script, which requires racks to be configured in the cluster at start-up. It also applies the same optimizations present in the automated run [1].

This PR also fixes a few issues we ran into while running the automation:

- artifacts resolution
- `signal: terminate` sent to gcloud CLI subprocesses on remote execution
- distribution of `PGURLs`

[1] https://github.com/cockroachdb/cockroach/blob/release-25.3/pkg/cmd/roachtest/tests/tpcc.go#L2448

Epic: None
Release note: None

Co-authored-by: Herko Lategan <[email protected]>
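Server-side TPCC partitioning keys off rack localities, which is why the run script now requires racks to be configured when the cluster starts. As a rough illustration only, assuming the `racks`, `binary`, and `sql-port` flags in the YAML config below map one-to-one onto `roachprod start` flags (the cluster name is the one defined in that config):

```sh
# Hedged sketch: start the CockroachDB cluster with one rack per TPCC partition
# so the workload's server-side partitions line up with rack localities.
roachprod start drt-scale-300 --binary ./cockroach --racks 300 --sql-port 26257
```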

File tree

4 files changed (+294, -20 lines)

pkg/cmd/drtprod/cli/commands/yamlprocessor.go

Lines changed: 11 additions & 1 deletion

```diff
@@ -283,7 +283,17 @@ func setupAndExecute(
 	// Move the drtprod binary to /usr/bin to ensure it is available system-wide on the cluster.
 	err := roachprodRun(ctx, logger, monitorClusterName, "", "", true,
 		os.Stdout, os.Stderr,
-		[]string{fmt.Sprintf("sudo mv %s /usr/bin", drtprodLocation)},
+		[]string{fmt.Sprintf("sudo cp %s /usr/bin", drtprodLocation)},
+		install.RunOptions{FailOption: install.FailSlow})
+	if err != nil {
+		return err
+	}
+
+	// Enable linger for the default user, so that the cloud subprocess is not
+	// killed when the user logs out.
+	err = roachprodRun(ctx, logger, monitorClusterName, "", "", true,
+		os.Stdout, os.Stderr,
+		[]string{fmt.Sprintf("sudo loginctl enable-linger %s", config.SharedUser)},
 		install.RunOptions{FailOption: install.FailSlow})
 	if err != nil {
 		return err
```
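Two things change here: the drtprod binary is now copied (rather than moved) into /usr/bin, and lingering is enabled for the shared user so that user-scope processes started over SSH, such as the gcloud CLI subprocesses called out in the commit message, are not terminated when the session that started them ends. A minimal sketch of the effect on one node, using an illustrative user name in place of `config.SharedUser`:

```sh
# Hedged sketch of the two remote commands added above, run on a single node.
sudo cp ./drtprod /usr/bin            # leave the staged binary in place
sudo loginctl enable-linger ubuntu    # user-scope processes survive logout
loginctl show-user ubuntu -p Linger   # expect: Linger=yes
```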

pkg/cmd/drtprod/cli/commands/yamlprocessor_test.go

Lines changed: 16 additions & 16 deletions

```diff
@@ -313,7 +313,7 @@ environment:
 	roachprodRun = func(ctx context.Context, l *logger.Logger, clusterName,
 		SSHOptions, processTag string, secure bool, stdout, stderr io.Writer,
 		cmdArray []string, options install.RunOptions) error {
-		if strings.HasPrefix(cmdArray[0], "sudo mv") {
+		if strings.HasPrefix(cmdArray[0], "sudo cp") {
 			return fmt.Errorf("move command failed")
 		}
 		return nil
@@ -419,11 +419,11 @@ environment:
 				runCmds["mkdir"] = make([]string, 0)
 			}
 			runCmds["mkdir"] = append(runCmds["mkdir"], cmdArray[0])
-		} else if strings.HasPrefix(cmdArray[0], "sudo mv") {
-			if _, ok := runCmds["mv"]; !ok {
-				runCmds["mv"] = make([]string, 0)
+		} else if strings.HasPrefix(cmdArray[0], "sudo cp") {
+			if _, ok := runCmds["cp"]; !ok {
+				runCmds["cp"] = make([]string, 0)
 			}
-			runCmds["mv"] = append(runCmds["mv"], cmdArray[0])
+			runCmds["cp"] = append(runCmds["cp"], cmdArray[0])
 		} else if strings.HasPrefix(cmdArray[0], "sudo systemd-run") {
 			if _, ok := runCmds["systemd"]; !ok {
 				runCmds["systemd"] = make([]string, 0)
@@ -458,7 +458,7 @@ environment:
 		t.Log(runCmds)
 		require.Equal(t, 3, len(runCmds))
 		require.Equal(t, 4, len(runCmds["mkdir"]))
-		require.Equal(t, 1, len(runCmds["mv"]))
+		require.Equal(t, 1, len(runCmds["cp"]))
 		require.Equal(t, 1, len(runCmds["systemd"]))
 		require.Equal(t, "sudo systemd-run --unit test-monitor --same-dir --uid $(id -u) --gid $(id -g) drtprod execute ./location/to/test.yaml",
 			runCmds["systemd"][0])
@@ -493,11 +493,11 @@ environment:
 				runCmds["mkdir"] = make([]string, 0)
 			}
 			runCmds["mkdir"] = append(runCmds["mkdir"], cmdArray[0])
-		} else if strings.HasPrefix(cmdArray[0], "sudo mv") {
-			if _, ok := runCmds["mv"]; !ok {
-				runCmds["mv"] = make([]string, 0)
+		} else if strings.HasPrefix(cmdArray[0], "sudo cp") {
+			if _, ok := runCmds["cp"]; !ok {
+				runCmds["cp"] = make([]string, 0)
 			}
-			runCmds["mv"] = append(runCmds["mv"], cmdArray[0])
+			runCmds["cp"] = append(runCmds["cp"], cmdArray[0])
 		} else if strings.HasPrefix(cmdArray[0], "sudo systemd-run") {
 			if _, ok := runCmds["systemd"]; !ok {
 				runCmds["systemd"] = make([]string, 0)
@@ -532,7 +532,7 @@ environment:
 		t.Log(runCmds)
 		require.Equal(t, 3, len(runCmds))
 		require.Equal(t, 4, len(runCmds["mkdir"]))
-		require.Equal(t, 1, len(runCmds["mv"]))
+		require.Equal(t, 1, len(runCmds["cp"]))
 		require.Equal(t, 1, len(runCmds["systemd"]))
 		require.Equal(t, "sudo systemd-run --unit test-monitor --same-dir --uid $(id -u) --gid $(id -g) --setenv=DD_API_KEY=the_secret drtprod execute ./location/to/test.yaml",
 			runCmds["systemd"][0])
@@ -563,11 +563,11 @@ environment:
 				runCmds["mkdir"] = make([]string, 0)
 			}
 			runCmds["mkdir"] = append(runCmds["mkdir"], cmdArray[0])
-		} else if strings.HasPrefix(cmdArray[0], "sudo mv") {
-			if _, ok := runCmds["mv"]; !ok {
-				runCmds["mv"] = make([]string, 0)
+		} else if strings.HasPrefix(cmdArray[0], "sudo cp") {
+			if _, ok := runCmds["cp"]; !ok {
+				runCmds["cp"] = make([]string, 0)
 			}
-			runCmds["mv"] = append(runCmds["mv"], cmdArray[0])
+			runCmds["cp"] = append(runCmds["cp"], cmdArray[0])
 		} else if strings.HasPrefix(cmdArray[0], "sudo systemd-run") {
 			if _, ok := runCmds["systemd"]; !ok {
 				runCmds["systemd"] = make([]string, 0)
@@ -602,7 +602,7 @@ environment:
 		t.Log(runCmds)
 		require.Equal(t, 3, len(runCmds))
 		require.Equal(t, 4, len(runCmds["mkdir"]))
-		require.Equal(t, 1, len(runCmds["mv"]))
+		require.Equal(t, 1, len(runCmds["cp"]))
 		require.Equal(t, 1, len(runCmds["systemd"]))
 		require.Equal(t, "sudo systemd-run --unit test-monitor --same-dir --uid $(id -u) --gid $(id -g) drtprod execute ./location/to/test.yaml -t target1",
 			runCmds["systemd"][0])
```
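These assertions pin the exact `systemd-run` invocation the processor uses to launch `drtprod execute` as a transient unit on the monitor node. For reference, a hedged sketch of inspecting such a unit on the host; the unit name `test-monitor` is the one hard-coded in these tests, so substitute whatever unit name a real run uses:

```sh
# Check on a transient unit started with `sudo systemd-run --unit <name> ...`.
systemctl status test-monitor       # is the drtprod execution still running?
journalctl -u test-monitor -f       # follow its output
sudo systemctl stop test-monitor    # stop the run
```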
Lines changed: 241 additions & 0 deletions (new file)

```yaml
# Yaml for creating and configuring the drt-scale cluster. This also configures Datadog.
# Build the drtprod and roachtest binaries (using --cross=linux) before running this script
environment:
  ROACHPROD_GCE_DEFAULT_SERVICE_ACCOUNT: [email protected]
  ROACHPROD_DNS: drt.crdb.io
  ROACHPROD_GCE_DNS_DOMAIN: drt.crdb.io
  ROACHPROD_GCE_DNS_ZONE: drt
  ROACHPROD_GCE_DEFAULT_PROJECT: cockroach-drt
  CLUSTER: drt-scale-300
  WORKLOAD_CLUSTER: workload-scale-300
  CLUSTER_NODES: 300
  RACKS: 300
  NODES_PER_ZONE: 100
  TOTAL_PARTITIONS: 300
  PARTITION_TYPE: partitions
  WORKLOAD_NODES: 12
  VERSION: v25.3.0-beta.3
  WAREHOUSES: 4000000

dependent_file_locations:
  - pkg/cmd/drtprod/scripts/setup_datadog_cluster
  - pkg/cmd/drtprod/scripts/setup_datadog_workload
  - pkg/cmd/drtprod/scripts/tpcc_init.sh
  - pkg/cmd/drtprod/scripts/generate_tpcc_run.sh
  - pkg/cmd/drtprod/scripts/populate_workload_keys.sh
  - artifacts/roachtest
  - artifacts/drtprod

targets:
  # crdb cluster specs
  - target_name: $CLUSTER
    steps:
      - command: create
        args:
          - $CLUSTER
        flags:
          clouds: gce
          gce-managed: true
          gce-enable-multiple-stores: true
          gce-zones: "us-central1-a:$NODES_PER_ZONE,us-central1-b:$NODES_PER_ZONE,us-central1-c:$NODES_PER_ZONE"
          nodes: $CLUSTER_NODES
          gce-machine-type: n2-standard-16
          local-ssd: false
          gce-pd-volume-size: 2048
          gce-pd-volume-type: pd-ssd
          gce-pd-volume-count: 2
          os-volume-size: 100
          username: drt
          lifetime: 8760h
          gce-image: "ubuntu-2204-jammy-v20250112"
      - command: sync
        flags:
          clouds: gce
      - command: stage
        args:
          - $CLUSTER
          - release
          - $VERSION
      - script: "pkg/cmd/drtprod/scripts/setup_datadog_cluster"
      - command: start
        args:
          - $CLUSTER
          - "--binary"
          - "./cockroach"
        flags:
          # add flag to set provisioned throughput on each store according to their cloud provider limits
          enable-fluent-sink: true
          store-count: 2
          args: --wal-failover=among-stores
          restart: false
          sql-port: 26257
          racks: $RACKS
      - command: sql
        args:
          - $CLUSTER:1
          - --
          - -e
          - "SET CLUSTER SETTING kv.snapshot_rebalance.max_rate='256 MB'"
      - command: sql
        args:
          - $CLUSTER:1
          - --
          - -e
          - "SET CLUSTER SETTING server.consistency_check.interval = '0s'"
      - command: sql
        args:
          - $CLUSTER:1
          - --
          - -e
          - "SET CLUSTER SETTING kv.range_merge.queue_enabled = false"
      - command: sql
        args:
          - $CLUSTER:1
          - --
          - -e
          - "SET CLUSTER SETTING sql.stats.automatic_collection.enabled = false"
      - command: sql
        args:
          - $CLUSTER:1
          - --
          - -e
          - "SET CLUSTER SETTING rocksdb.min_wal_sync_interval = '500us'"
      - command: sql
        args:
          - $CLUSTER:1
          - --
          - -e
          - "SET CLUSTER SETTING admission.kv.enabled = false"
      - command: sql
        args:
          - $CLUSTER:1
          - --
          - -e
          - "SET CLUSTER SETTING kv.replication_reports.interval = '0s'"
      - command: sql
        args:
          - $CLUSTER:1
          - --
          - -e
          - "ALTER RANGE default CONFIGURE ZONE USING gc.ttlseconds = 600"
      - command: sql
        args:
          - $CLUSTER:1
          - --
          - -e
          - "SET CLUSTER SETTING storage.columnar_blocks.enabled = true"
      - command: sql
        args:
          - $CLUSTER:1
          - --
          - -e
          - "SET CLUSTER SETTING server.goroutine_dump.num_goroutines_threshold = '10000000'"
      - command: sql
        args:
          - $CLUSTER:1
          - --
          - -e
          - "SET CLUSTER SETTING storage.max_sync_duration.fatal.enabled = false"
      - command: sql
        args:
          - $CLUSTER:1
          - --
          - -e
          - "ALTER RANGE default CONFIGURE ZONE USING num_replicas = 5"
      - command: sql
        args:
          - $CLUSTER:1
          - --
          - -e
          - "ALTER DATABASE system CONFIGURE ZONE USING num_replicas = 5"
      - command: sql
        args:
          - $CLUSTER:1
          - --
          - -e
          - "SET CLUSTER SETTING kv.transaction.write_buffering.enabled = true"
  # workload cluster specs
  - target_name: $WORKLOAD_CLUSTER
    steps:
      - command: create
        args:
          - $WORKLOAD_CLUSTER
        flags:
          clouds: gce
          gce-zones: "us-central1-a"
          nodes: $WORKLOAD_NODES
          gce-machine-type: n2-standard-8
          os-volume-size: 100
          username: workload
          lifetime: 8760h
          gce-image: "ubuntu-2204-jammy-v20250112"
        on_rollback:
          - command: destroy
            args:
              - $WORKLOAD_CLUSTER
      - command: sync
        flags:
          clouds: gce
      - command: stage
        args:
          - $WORKLOAD_CLUSTER
          - release
          - $VERSION
      - command: put
        args:
          - $WORKLOAD_CLUSTER
          - artifacts/roachtest
          - roachtest-operations
      - command: put
        args:
          - $WORKLOAD_CLUSTER
          - artifacts/drtprod
      - script: "pkg/cmd/drtprod/scripts/setup_datadog_workload"
  - target_name: post_tasks
    dependent_targets:
      - $CLUSTER
      - $WORKLOAD_CLUSTER
    steps:
      - script: rm
        args:
          - -rf
          - certs-$CLUSTER
      - command: fetch-certs
        args:
          - $CLUSTER:1
          - certs-$CLUSTER
      - command: put
        args:
          - $WORKLOAD_CLUSTER
          - certs-$CLUSTER
          - certs
      - script: "pkg/cmd/drtprod/scripts/tpcc_init.sh"
        args:
          - cct_tpcc
          - false
        flags:
          partitions: $TOTAL_PARTITIONS
          replicate-static-columns: true
          partition-strategy: leases
          warehouses: $WAREHOUSES
          db: cct_tpcc
      - script: pkg/cmd/drtprod/scripts/populate_workload_keys.sh
  - target_name: tpcc_run
    dependent_targets:
      - $CLUSTER
      - $WORKLOAD_CLUSTER
    steps:
      - script: "pkg/cmd/drtprod/scripts/generate_tpcc_run.sh"
        args:
          - cct_tpcc
          - false
        flags:
          db: cct_tpcc
          warehouses: $WAREHOUSES
          active-warehouses: 333333
          workers: 333333
          conns: 1000
          active-workers: 1000
          duration: 12h
          ramp: 5m
          wait: 0
```
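The header comment in this file asks for the drtprod and roachtest binaries to be built with --cross=linux before the config is run, which matches the `artifacts/roachtest` and `artifacts/drtprod` entries under `dependent_file_locations`. A hedged sketch of the end-to-end flow, assuming both are recognized `dev build` targets and that cross builds land in `artifacts/`; the YAML path below is a placeholder since the file's path is not shown here, and the `drtprod execute ... -t <target>` form is the one asserted in the tests above:

```sh
# Build linux binaries into artifacts/ (assumed dev-build behavior).
./dev build drtprod roachtest --cross=linux

# Run every target in the config, or a single target such as tpcc_run.
drtprod execute ./location/to/this-config.yaml
drtprod execute ./location/to/this-config.yaml -t tpcc_run
```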
