Skip to content

Commit 444ffd0

Browse files
committed
drt: scripts and YAML for upgrade-test
This PR adds the scripts and YAML for running the upgrade testing. Epic: None Release: None
1 parent 342ea34 commit 444ffd0

File tree

4 files changed

+307
-0
lines changed

4 files changed

+307
-0
lines changed
Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
# YAML for creating and configuring the drt-upgrade-test and workload-upgrade-test clusters. This also configures Datadog on both clusters.
2+
environment:
3+
ROACHPROD_GCE_DEFAULT_SERVICE_ACCOUNT: [email protected]
4+
ROACHPROD_DNS: drt.crdb.io
5+
ROACHPROD_GCE_DNS_DOMAIN: drt.crdb.io
6+
ROACHPROD_GCE_DNS_ZONE: drt
7+
ROACHPROD_GCE_DEFAULT_PROJECT: cockroach-drt
8+
CLUSTER: drt-upgrade-test
9+
CLUSTER_NODES: 6
10+
WORKLOAD_CLUSTER: workload-upgrade-test
11+
WORKLOAD_NODES: 1
12+
COCKROACH_VERSION: v24.3.8
13+
14+
targets:
15+
- target_name: $CLUSTER
16+
steps:
17+
- command: create
18+
args:
19+
- $CLUSTER
20+
flags:
21+
clouds: gce
22+
gce-managed: true
23+
gce-enable-multiple-stores: true
24+
gce-zones: "us-east1-c"
25+
nodes: $CLUSTER_NODES
26+
gce-machine-type: n2-standard-8
27+
os-volume-size: 100
28+
local-ssd: true
29+
gce-local-ssd-count: 4
30+
username: drt
31+
lifetime: 8760h
32+
gce-image: "ubuntu-2204-jammy-v20240319"
33+
on_rollback:
34+
- command: destroy
35+
args:
36+
- $CLUSTER
37+
- command: sync
38+
flags:
39+
clouds: gce
40+
- command: stage
41+
args:
42+
- $CLUSTER
43+
- release
44+
- $COCKROACH_VERSION
45+
- script: "pkg/cmd/drtprod/scripts/setup_datadog_cluster"
46+
- command: start
47+
args:
48+
- $CLUSTER
49+
- "--binary"
50+
- "./cockroach"
51+
flags:
52+
enable-fluent-sink: true
53+
store-count: 4
54+
args: --wal-failover=among-stores
55+
restart: false
56+
sql-port: 26257
57+
on_rollback:
58+
- command: stop
59+
args:
60+
- $CLUSTER
61+
- command: run
62+
args:
63+
- $CLUSTER
64+
- --
65+
- "sudo systemctl unmask cron.service ; sudo systemctl enable cron.service ; echo \"crontab -l ; echo '@reboot sleep 100 && ~/cockroach.sh' | crontab -\" > t.sh ; sh t.sh ; rm t.sh"
66+
- target_name: $WORKLOAD_CLUSTER
67+
steps:
68+
- command: create
69+
args:
70+
- $WORKLOAD_CLUSTER
71+
flags:
72+
clouds: gce
73+
gce-zones: "us-east1-c"
74+
nodes: $WORKLOAD_NODES
75+
gce-machine-type: n2-standard-4
76+
os-volume-size: 100
77+
username: workload
78+
lifetime: 8760h
79+
on_rollback:
80+
- command: destroy
81+
args:
82+
- $WORKLOAD_CLUSTER
83+
- command: sync
84+
flags:
85+
clouds: gce
86+
- command: stage
87+
args:
88+
- $WORKLOAD_CLUSTER
89+
- cockroach
90+
- script: "pkg/cmd/drtprod/scripts/setup_datadog_workload"
91+
- command: put
92+
args:
93+
- $WORKLOAD_CLUSTER
94+
- artifacts/drtprod
95+
- command: put
96+
args:
97+
- $WORKLOAD_CLUSTER
98+
- artifacts/roachtest
99+
- roachtest-operations
100+
- command: put
101+
args:
102+
- $WORKLOAD_CLUSTER
103+
- pkg/cmd/drtprod/scripts/mixed_version.sh
104+
- command: run
105+
args:
106+
- $WORKLOAD_CLUSTER
107+
- --
108+
- "sudo systemctl unmask cron.service ; sudo systemctl enable cron.service ; sudo systemctl start cron.service ; echo \"crontab -l ; echo '0 4 * * * /home/ubuntu/mixed_version.sh $CLUSTER >> /home/ubuntu/mixed_version.log' | crontab -\" > t.sh ; sh t.sh ; rm t.sh"
109+
- target_name: post_tasks
110+
dependent_targets:
111+
- $CLUSTER
112+
- $WORKLOAD_CLUSTER
113+
steps:
114+
- script: pkg/cmd/drtprod/scripts/populate_workload_keys.sh
115+
- script: rm
116+
args:
117+
- -rf
118+
- certs-$CLUSTER
119+
- command: get
120+
args:
121+
- $CLUSTER:1
122+
- certs
123+
- certs-$CLUSTER
124+
- command: ssh
125+
args:
126+
- $WORKLOAD_CLUSTER
127+
- --
128+
- sudo
129+
- rm
130+
- -rf
131+
- certs
132+
- command: put
133+
args:
134+
- $WORKLOAD_CLUSTER
135+
- certs-$CLUSTER
136+
- certs
137+
- command: ssh
138+
args:
139+
- $WORKLOAD_CLUSTER
140+
- --
141+
- chmod
142+
- 600
143+
- './certs/*'
144+
- script: "pkg/cmd/drtprod/scripts/tpcc_init.sh"
145+
args:
146+
- cct_tpcc # suffix added to script name tpcc_init_cct_tpcc.sh
147+
- false # determines whether to execute the script immediately on workload node
148+
flags:
149+
warehouses: 12000
150+
db: cct_tpcc
151+
- script: "pkg/cmd/drtprod/scripts/generate_tpcc_run.sh"
152+
args:
153+
- cct_tpcc # suffix added to script name tpcc_run_cct_tpcc.sh
154+
- true # determines whether to execute the script immediately on workload node
155+
flags:
156+
db: cct_tpcc
157+
warehouses: 12000
158+
max-rate: 500
159+
workers: 50
160+
conns: 50
161+
duration: 12h
162+
ramp: 10m
163+
wait: 0

pkg/cmd/drtprod/scripts/generate_tpcc_run.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ for ((NODE=0; NODE<WORKLOAD_NODES; NODE++)); do
100100
#!/usr/bin/env bash
101101
102102
./drtprod sync
103+
$([ "$execute_script" = "true" ] && [ "$NODE" -eq 0 ] && echo "${pwd}/tpcc_init_${suffix}.sh")
103104
PGURLS=\$(./drtprod pgurl $CLUSTER | sed s/\'//g)
104105
read -r -a PGURLS_ARR <<< "\$PGURLS"
105106
j=0
Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
#!/bin/bash
2+
3+
# Copyright 2025 The Cockroach Authors.
4+
#
5+
# Use of this software is governed by the CockroachDB Software License
6+
# included in the /LICENSE file.
7+
8+
# This script schedules daily maintenance commands on a 2-week cycle.
9+
# Optional parameters OLD_RELEASE and NEW_RELEASE can be provided.
10+
# Usage: ./mixed_version.sh <CLUSTER> [OLD_RELEASE=v24.3.8] [NEW_RELEASE=v25.2.0-alpha.2]
11+
12+
# Schedule of commands:
13+
#
14+
# Week 1:
15+
# - Tuesday:
16+
# - Wipe the cluster
17+
# - Full cluster restart with OLD_RELEASE
18+
# - Set cluster.preserve_downgrade_option to OLD_RELEASE
19+
# - upgrade nodes 1-3 to NEW_RELEASE (50% nodes)
20+
# - Run tpcc_init_cct_tpcc.sh, then start tpcc_run_cct_tpcc.sh and roachtest_operations_run.sh
21+
# - Thursday:
22+
# - Deploy NEW_RELEASE to the entire cluster (not finalized)
23+
# - Friday:
24+
# - Revert all nodes to OLD_RELEASE
25+
# - upgrade nodes 1-2 to NEW_RELEASE (33% nodes)
26+
#
27+
# Week 2:
28+
# - Monday:
29+
# - Upgrade nodes 3-5 to NEW_RELEASE (83% nodes as 1,2 were already upgraded)
30+
# - Friday:
31+
# - Reset cluster.preserve_downgrade_option
32+
# - Upgrade node 6 to NEW_RELEASE (100% nodes as 1-5 were already upgraded)
33+
34+
# ---------------------------------------------------------------------------
# Main body of mixed_version.sh.
#
# Arguments:
#   $1 - CLUSTER      (required) name of the cluster to operate on
#   $2 - OLD_RELEASE  (optional) release to roll back to, default v24.3.8
#   $3 - NEW_RELEASE  (optional) release to upgrade to, default v25.2.0-alpha.2
#
# State:
#   /home/ubuntu/.cycle_info.txt holds "<cycle_week> <YYYY-MM-DD>", where
#   cycle_week is 0 (week 1) or 1 (week 2) and the date is the last run day.
#
# Exit status: 1 on missing CLUSTER or on the first failing command.
# ---------------------------------------------------------------------------
CLUSTER="$1"
if [ -z "$CLUSTER" ]; then
  echo "Usage: $0 <CLUSTER> [OLD_RELEASE] [NEW_RELEASE]" >&2
  exit 1
fi

# A sync failure was silently ignored before; keep going (best effort) but
# leave a trace so that later failures are easier to diagnose.
if ! /home/ubuntu/drtprod sync; then
  echo "Warning: '/home/ubuntu/drtprod sync' failed; continuing anyway" >&2
fi

# Set optional release versions
OLD_RELEASE="${2:-v24.3.8}"
NEW_RELEASE="${3:-v25.2.0-alpha.2}"

# Get today's day of week (1 for Monday, ... 7 for Sunday) and date
day_of_week=$(date +%u)
today=$(date +%F)
cycle_file="/home/ubuntu/.cycle_info.txt"

saved_cycle=0
saved_day=$today
if [ -f "$cycle_file" ]; then
  # -r keeps backslashes literal; a failed read (e.g. empty file) is caught
  # by the validation below.
  read -r saved_cycle saved_day < "$cycle_file" || true
fi

# Guard against an empty or corrupted state file: anything other than 0 or 1
# would break the integer comparisons below and be written back unchanged.
case "$saved_cycle" in
  0 | 1) ;;
  *)
    echo "Warning: invalid cycle info in $cycle_file; restarting at week 1" >&2
    saved_cycle=0
    saved_day=$today
    ;;
esac

# On Monday, if this is the first run of today, flip the cycle week
if [ "$day_of_week" -eq 1 ] && [ "$saved_day" != "$today" ]; then
  cycle_week=$((1 - saved_cycle))
else
  cycle_week=$saved_cycle
fi

# Save the cycle week and today's date for persistence
echo "$cycle_week $today" > "$cycle_file"

# Commands are collected as strings and run through eval further down, so
# each entry must be a complete, correctly quoted shell command line.
cmds=()

if [ "$day_of_week" -eq 1 ] && [ "$cycle_week" -eq 1 ]; then
  # Week 2 - Monday
  cmds+=("/home/ubuntu/drtprod deploy $CLUSTER:3-5 release $NEW_RELEASE")
elif [ "$day_of_week" -eq 2 ] && [ "$cycle_week" -eq 0 ]; then
  # Tuesday in Week 1 only
  cmds+=("sudo systemctl stop tpcc_run_cct_tpcc")
  cmds+=("sudo systemctl stop roachtest_ops")
  cmds+=("/home/ubuntu/drtprod stop $CLUSTER")
  cmds+=("/home/ubuntu/drtprod wipe $CLUSTER")
  cmds+=("/home/ubuntu/drtprod stage $CLUSTER release $OLD_RELEASE")
  cmds+=("/home/ubuntu/drtprod start $CLUSTER --binary ./cockroach --args=--wal-failover=among-stores --enable-fluent-sink=true --restart=false --sql-port=26257 --store-count=4")
  # Reduce the release to <major>.<minor> (v24.3.8 -> 24.3). The pattern also
  # accepts a release without a patch component or without the leading "v".
  version=$(echo "$OLD_RELEASE" | sed -E 's/^v?([0-9]+\.[0-9]+).*/\1/')
  cmds+=("/home/ubuntu/drtprod sql $CLUSTER:1 -- -e \"SET CLUSTER SETTING cluster.preserve_downgrade_option ='$version'\"")
  cmds+=("/home/ubuntu/drtprod deploy $CLUSTER:1-3 release $NEW_RELEASE")
  # The wipe above invalidates the old certs; fetch the fresh ones.
  cmds+=("rm -rf /home/ubuntu/certs")
  cmds+=("/home/ubuntu/drtprod get $CLUSTER:1 certs /home/ubuntu/certs")
  cmds+=("chmod 600 /home/ubuntu/certs/*")
  cmds+=("/home/ubuntu/tpcc_init_cct_tpcc.sh")
  cmds+=("sudo systemd-run --unit tpcc_run_cct_tpcc --same-dir --uid $(id -u) --gid $(id -g) bash /home/ubuntu/tpcc_run_cct_tpcc.sh")
  cmds+=("sleep 30")
  # Note that roachtest_operations_run.sh needs to be set up manually for the first time.
  cmds+=("sudo systemd-run --unit roachtest_ops --same-dir --uid $(id -u) --gid $(id -g) bash /home/ubuntu/roachtest_operations_run.sh")
elif [ "$day_of_week" -eq 4 ] && [ "$cycle_week" -eq 0 ]; then
  # Thursday in Week 1 only
  cmds+=("/home/ubuntu/drtprod deploy $CLUSTER:4-6 release $NEW_RELEASE")
elif [ "$day_of_week" -eq 5 ]; then
  # Friday for both Weeks
  if [ "$cycle_week" -eq 0 ]; then
    # Week 1 friday commands
    cmds+=("/home/ubuntu/drtprod deploy $CLUSTER release $OLD_RELEASE")
    cmds+=("/home/ubuntu/drtprod deploy $CLUSTER:1-2 release $NEW_RELEASE")
  else
    # Week 2 friday commands
    cmds+=("/home/ubuntu/drtprod sql $CLUSTER:1 -- -e 'RESET CLUSTER SETTING cluster.preserve_downgrade_option'")
    cmds+=("/home/ubuntu/drtprod deploy $CLUSTER:6 release $NEW_RELEASE")
  fi
fi

# Report an empty schedule before the status command is appended; checking
# afterwards can never see an empty list.
if [ ${#cmds[@]} -eq 0 ]; then
  echo "No scheduled command for today."
fi

# Always check the status of the cluster
cmds+=("/home/ubuntu/drtprod status $CLUSTER")

# Run the commands in order and stop at the first failure.
for cmd in "${cmds[@]}"; do
  echo "Executing: $cmd"
  if ! eval "$cmd"; then
    echo "Error executing: $cmd" >&2
    exit 1
  fi
done
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
#!/bin/bash
2+
3+
# Copyright 2025 The Cockroach Authors.
4+
#
5+
# Use of this software is governed by the CockroachDB Software License
6+
# included in the /LICENSE file.
7+
8+
# Appends the SSH public key(s) of the workload nodes to authorized_keys on
# the CRDB cluster nodes.
#
# Required environment:
#   CLUSTER          - cluster whose nodes receive the keys
#   WORKLOAD_CLUSTER - cluster whose ./.ssh/id_rsa.pub keys are collected
#
# Exit status: 1 if a required variable is unset or no key could be read.
if [ -z "${CLUSTER}" ]; then
  echo "environment CLUSTER is not set" >&2
  exit 1
fi

if [ -z "${WORKLOAD_CLUSTER}" ]; then
  # This message previously named CLUSTER, hiding which variable was missing.
  echo "environment WORKLOAD_CLUSTER is not set" >&2
  exit 1
fi

# the ssh keys of all workload nodes should be set up on the crdb nodes for the operations
# grep keeps only the key lines and drops any other output of "drtprod run".
workload_keys=$(drtprod run "${WORKLOAD_CLUSTER}" -- cat ./.ssh/id_rsa.pub | grep ssh-rsa)
if [ -z "${workload_keys}" ]; then
  # Without this check an empty line would be appended to authorized_keys
  # and the step would report success.
  echo "no ssh-rsa public key found on ${WORKLOAD_CLUSTER}" >&2
  exit 1
fi

drtprod ssh "${CLUSTER}" -- "echo \"${workload_keys}\" >> ./.ssh/authorized_keys"
20+

0 commit comments

Comments
 (0)