Skip to content

Commit 897f33d

Browse files
committed
drtprod: 300-node TPCC bench
This change adds the YAML automation required for the 300-node scale test for benchmarking TPCC for publishing results. It contains several cluster optimisations for the benchmark. It uses server side partitioning. It is tuned to the optimal parameters for the workload to achieve maximum tpmC. Epic: None Release note: None
1 parent a0ff08e commit 897f33d

File tree

1 file changed

+241
-0
lines changed

1 file changed

+241
-0
lines changed
Lines changed: 241 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,241 @@
1+
# YAML for creating and configuring the 300-node drt-scale-300 cluster and its workload cluster for the TPCC benchmark. This also configures Datadog.
2+
# Build the drtprod and roachtest binaries (using --cross=linux) before running this file; they are expected at artifacts/drtprod and artifacts/roachtest.
3+
# Variables for this run. The targets below reference them as $NAME.
environment:
4+
# NOTE(review): "[email protected]" looks like an e-mail redaction placeholder
# introduced when this text was copied from a web page -- restore the real
# service account before use.
ROACHPROD_GCE_DEFAULT_SERVICE_ACCOUNT: [email protected]
5+
ROACHPROD_DNS: drt.crdb.io
6+
ROACHPROD_GCE_DNS_DOMAIN: drt.crdb.io
7+
ROACHPROD_GCE_DNS_ZONE: drt
8+
ROACHPROD_GCE_DEFAULT_PROJECT: cockroach-drt
9+
# Name of the CockroachDB cluster; also used as the first target's name.
CLUSTER: drt-scale-300
10+
# Name of the cluster that runs the workload; also used as a target name.
WORKLOAD_CLUSTER: workload-scale-300
11+
# Passed as `nodes` when creating $CLUSTER.
CLUSTER_NODES: 300
12+
# Passed as `racks` when starting $CLUSTER.
RACKS: 300
13+
# Per-zone node count used in `gce-zones`; three zones x 100 = CLUSTER_NODES.
NODES_PER_ZONE: 100
14+
# Passed as `partitions` to tpcc_init.sh.
TOTAL_PARTITIONS: 300
15+
# NOTE(review): not referenced anywhere else in this file; presumably read by
# one of the scripts -- confirm, or remove if unused.
PARTITION_TYPE: partitions
16+
# Passed as `nodes` when creating $WORKLOAD_CLUSTER.
WORKLOAD_NODES: 12
17+
# Release staged on both clusters.
VERSION: v25.3.0-beta.3
18+
# Passed as `warehouses` to both tpcc_init.sh and generate_tpcc_run.sh.
WAREHOUSES: 4000000
19+
20+
# Files this configuration depends on: the five scripts invoked by the targets
# below, plus the roachtest and drtprod binaries that are copied to the
# workload cluster with `put`.
dependent_file_locations:
21+
- pkg/cmd/drtprod/scripts/setup_datadog_cluster
22+
- pkg/cmd/drtprod/scripts/setup_datadog_workload
23+
- pkg/cmd/drtprod/scripts/tpcc_init.sh
24+
- pkg/cmd/drtprod/scripts/generate_tpcc_run.sh
25+
- pkg/cmd/drtprod/scripts/populate_workload_keys.sh
26+
- artifacts/roachtest
27+
- artifacts/drtprod
28+
29+
targets:
30+
# crdb cluster specs
# Target that creates $CLUSTER, stages the $VERSION release on it, runs the
# Datadog cluster setup script, starts the nodes, and then applies a series of
# cluster settings and zone configurations through SQL on node 1.
31+
- target_name: $CLUSTER
32+
steps:
33+
# Create the cluster VMs on GCE.
# NOTE(review): unlike the workload cluster's create step further down, this
# step has no `on_rollback` destroy -- confirm that is intentional.
- command: create
34+
args:
35+
- $CLUSTER
36+
flags:
37+
clouds: gce
38+
gce-managed: true
39+
gce-enable-multiple-stores: true
40+
# $NODES_PER_ZONE nodes in each of three us-central1 zones.
gce-zones: "us-central1-a:$NODES_PER_ZONE,us-central1-b:$NODES_PER_ZONE,us-central1-c:$NODES_PER_ZONE"
41+
nodes: $CLUSTER_NODES
42+
gce-machine-type: n2-standard-16
43+
# No local SSDs; storage comes from the pd-ssd volumes configured below.
local-ssd: false
44+
gce-pd-volume-size: 2048
45+
gce-pd-volume-type: pd-ssd
46+
# Two volumes per node; `store-count: 2` in the start step matches this.
gce-pd-volume-count: 2
47+
os-volume-size: 100
48+
username: drt
49+
# 8760h = 365 days.
lifetime: 8760h
50+
gce-image: "ubuntu-2204-jammy-v20250112"
51+
- command: sync
52+
flags:
53+
clouds: gce
54+
# Stage the $VERSION release on every node of $CLUSTER.
- command: stage
55+
args:
56+
- $CLUSTER
57+
- release
58+
- $VERSION
59+
- script: "pkg/cmd/drtprod/scripts/setup_datadog_cluster"
60+
# Start the cluster using the ./cockroach binary.
- command: start
61+
args:
62+
- $CLUSTER
63+
- "--binary"
64+
- "./cockroach"
65+
flags:
66+
# TODO: none of the flags below sets provisioned throughput; add a flag to set
# it on each store according to the cloud provider's limits.
enable-fluent-sink: true
67+
store-count: 2
68+
# Extra argument passed through to the started process: WAL failover among stores.
args: --wal-failover=among-stores
69+
restart: false
70+
sql-port: 26257
71+
racks: $RACKS
72+
# Each `sql` step below runs a single statement (-e) against node 1 of $CLUSTER.
# Set the snapshot rebalance max rate to 256 MB.
- command: sql
73+
args:
74+
- $CLUSTER:1
75+
- --
76+
- -e
77+
- "SET CLUSTER SETTING kv.snapshot_rebalance.max_rate='256 MB'"
78+
# Set the consistency check interval to 0s.
- command: sql
79+
args:
80+
- $CLUSTER:1
81+
- --
82+
- -e
83+
- "SET CLUSTER SETTING server.consistency_check.interval = '0s'"
84+
# Turn the range merge queue off.
- command: sql
85+
args:
86+
- $CLUSTER:1
87+
- --
88+
- -e
89+
- "SET CLUSTER SETTING kv.range_merge.queue_enabled = false"
90+
# Turn automatic SQL statistics collection off.
- command: sql
91+
args:
92+
- $CLUSTER:1
93+
- --
94+
- -e
95+
- "SET CLUSTER SETTING sql.stats.automatic_collection.enabled = false"
96+
# Set the minimum WAL sync interval to 500us.
- command: sql
97+
args:
98+
- $CLUSTER:1
99+
- --
100+
- -e
101+
- "SET CLUSTER SETTING rocksdb.min_wal_sync_interval = '500us'"
102+
# Turn KV admission control off.
- command: sql
103+
args:
104+
- $CLUSTER:1
105+
- --
106+
- -e
107+
- "SET CLUSTER SETTING admission.kv.enabled = false"
108+
# Set the replication reports interval to 0s.
- command: sql
109+
args:
110+
- $CLUSTER:1
111+
- --
112+
- -e
113+
- "SET CLUSTER SETTING kv.replication_reports.interval = '0s'"
114+
# Set gc.ttlseconds to 600 (10 minutes) on the default range zone config.
- command: sql
115+
args:
116+
- $CLUSTER:1
117+
- --
118+
- -e
119+
- "ALTER RANGE default CONFIGURE ZONE USING gc.ttlseconds = 600"
120+
# Turn storage columnar blocks on.
- command: sql
121+
args:
122+
- $CLUSTER:1
123+
- --
124+
- -e
125+
- "SET CLUSTER SETTING storage.columnar_blocks.enabled = true"
126+
# Set the goroutine-dump goroutine-count threshold to 10,000,000.
- command: sql
127+
args:
128+
- $CLUSTER:1
129+
- --
130+
- -e
131+
- "SET CLUSTER SETTING server.goroutine_dump.num_goroutines_threshold = '10000000'"
132+
# Turn off the fatal behaviour of the max sync duration check.
- command: sql
133+
args:
134+
- $CLUSTER:1
135+
- --
136+
- -e
137+
- "SET CLUSTER SETTING storage.max_sync_duration.fatal.enabled = false"
138+
# Use 5 replicas in the default range zone config.
- command: sql
139+
args:
140+
- $CLUSTER:1
141+
- --
142+
- -e
143+
- "ALTER RANGE default CONFIGURE ZONE USING num_replicas = 5"
144+
# Use 5 replicas for the system database.
- command: sql
145+
args:
146+
- $CLUSTER:1
147+
- --
148+
- -e
149+
- "ALTER DATABASE system CONFIGURE ZONE USING num_replicas = 5"
150+
# Turn transaction write buffering on.
- command: sql
151+
args:
152+
- $CLUSTER:1
153+
- --
154+
- -e
155+
- "SET CLUSTER SETTING kv.transaction.write_buffering.enabled = true"
157+
# workload cluster specs
# Target that creates $WORKLOAD_CLUSTER, stages the $VERSION release on it,
# copies the roachtest and drtprod binaries over, and runs the Datadog
# workload setup script.
158+
- target_name: $WORKLOAD_CLUSTER
159+
steps:
160+
# Create $WORKLOAD_NODES VMs, all in us-central1-a.
- command: create
161+
args:
162+
- $WORKLOAD_CLUSTER
163+
flags:
164+
clouds: gce
165+
gce-zones: "us-central1-a"
166+
nodes: $WORKLOAD_NODES
167+
gce-machine-type: n2-standard-8
168+
os-volume-size: 100
169+
username: workload
170+
# 8760h = 365 days.
lifetime: 8760h
171+
gce-image: "ubuntu-2204-jammy-v20250112"
172+
# Rollback action declared for the create step: destroy the workload cluster.
on_rollback:
173+
- command: destroy
174+
args:
175+
- $WORKLOAD_CLUSTER
176+
- command: sync
177+
flags:
178+
clouds: gce
179+
# Stage the same $VERSION release that is staged on $CLUSTER.
- command: stage
180+
args:
181+
- $WORKLOAD_CLUSTER
182+
- release
183+
- $VERSION
184+
# Copy artifacts/roachtest to the workload nodes as `roachtest-operations`.
- command: put
185+
args:
186+
- $WORKLOAD_CLUSTER
187+
- artifacts/roachtest
188+
- roachtest-operations
189+
# Copy artifacts/drtprod to the workload nodes (no destination name is given).
- command: put
190+
args:
191+
- $WORKLOAD_CLUSTER
192+
- artifacts/drtprod
193+
- script: "pkg/cmd/drtprod/scripts/setup_datadog_workload"
194+
# Target that depends on both cluster targets: it copies the $CLUSTER certs to
# the workload cluster, then runs the TPCC init and workload-key scripts.
- target_name: post_tasks
195+
dependent_targets:
196+
- $CLUSTER
197+
- $WORKLOAD_CLUSTER
198+
steps:
199+
# Remove any existing local certs-$CLUSTER directory before fetching certs.
- script: rm
200+
args:
201+
- -rf
202+
- certs-$CLUSTER
203+
# Fetch certs from node 1 of $CLUSTER into certs-$CLUSTER.
- command: fetch-certs
204+
args:
205+
- $CLUSTER:1
206+
- certs-$CLUSTER
207+
# Copy the fetched certs to the workload nodes as `certs`.
- command: put
208+
args:
209+
- $WORKLOAD_CLUSTER
210+
- certs-$CLUSTER
211+
- certs
212+
- script: "pkg/cmd/drtprod/scripts/tpcc_init.sh"
213+
args:
214+
# First positional argument; the same value is passed as the `db` flag below.
- cct_tpcc
215+
# NOTE(review): the meaning of this second positional argument is defined by
# tpcc_init.sh and is not visible here -- confirm.
- false
216+
flags:
217+
partitions: $TOTAL_PARTITIONS
218+
replicate-static-columns: true
219+
# NOTE(review): hard-coded here; the PARTITION_TYPE environment variable is
# not used for this flag -- confirm which one is intended to apply.
partition-strategy: leases
220+
warehouses: $WAREHOUSES
221+
db: cct_tpcc
222+
- script: pkg/cmd/drtprod/scripts/populate_workload_keys.sh
223+
# Target that depends on both cluster targets and invokes generate_tpcc_run.sh
# with the run parameters for the cct_tpcc database.
- target_name: tpcc_run
224+
dependent_targets:
225+
- $CLUSTER
226+
- $WORKLOAD_CLUSTER
227+
steps:
228+
- script: "pkg/cmd/drtprod/scripts/generate_tpcc_run.sh"
229+
args:
230+
# Same positional arguments as the tpcc_init.sh step in post_tasks.
- cct_tpcc
231+
# NOTE(review): the meaning of this second positional argument is defined by
# generate_tpcc_run.sh and is not visible here -- confirm.
- false
232+
flags:
233+
db: cct_tpcc
234+
warehouses: $WAREHOUSES
235+
# 333333 is approximately WAREHOUSES / WORKLOAD_NODES (4000000 / 12);
# presumably a per-workload-node share -- TODO confirm against the script.
active-warehouses: 333333
236+
workers: 333333
237+
conns: 1000
238+
active-workers: 1000
239+
duration: 12h
240+
ramp: 5m
241+
wait: 0

0 commit comments

Comments
 (0)