Skip to content

Commit 7d65e38

Browse files
craig[bot]shailendra-patelsrosenberg
committed
Merge #151514
151514: teamcity: add new CI job for PUA r=shailendra-patel a=shailendra-patel During each release, there is a need to run PUA. This new CI job allows triggering the PUA workflow on an as-needed basis for a specific release. The job generates a JSON file as output, which is consumed by the benchmarking portal to display the results. Epic: none Release note: None Co-authored-by: Shailendra Patel <[email protected]> Co-authored-by: Stan Rosenberg <[email protected]>
2 parents 20c3c8e + ff676fa commit 7d65e38

File tree

8 files changed

+188
-33
lines changed

8 files changed

+188
-33
lines changed
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#!/usr/bin/env bash
2+
3+
# Copyright 2025 The Cockroach Authors.
4+
#
5+
# Use of this software is governed by the CockroachDB Software License
6+
# included in the /LICENSE file.
7+
8+
# Entry point for the PUA TeamCity job: loads the shared TeamCity helpers and
# then hands off to pua_run_impl.sh through run_bazel.
set -exuo pipefail

# Walk five directory levels up from this script to reach the directory that
# holds the shared TeamCity helper scripts. Every command substitution is
# quoted so that a checkout path containing whitespace or glob characters is
# not word-split on the way up.
dir="$(dirname "$(dirname "$(dirname "$(dirname "$(dirname "${0}")")")")")"

source "$dir/teamcity-support.sh"       # For $root
source "$dir/teamcity-bazel-support.sh" # For run_bazel

# Variables named with a bare `-e NAME` are forwarded by name only; their
# values never appear on this command line (or in the xtrace output).
# LITERAL_ARTIFACTS_DIR is the only one given an explicit value here.
BAZEL_SUPPORT_EXTRA_DOCKER_ARGS="-e LITERAL_ARTIFACTS_DIR=$root/artifacts -e GOOGLE_APPLICATION_CREDENTIALS_CONTENT -e GOOGLE_SERVICE_ACCOUNT -e GOOGLE_PROJECT -e PUA_CONFIG -e CRDB_VERSION -e CRDB_UPGRADE_VERSION -e DD_API_KEY -e DD_APP_KEY" \
  run_bazel build/teamcity/internal/cockroach/pua/pua_run_impl.sh
Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
#!/usr/bin/env bash
2+
3+
# Copyright 2025 The Cockroach Authors.
4+
#
5+
# Use of this software is governed by the CockroachDB Software License
6+
# included in the /LICENSE file.
7+
8+
# Preamble of the PUA run: strict mode, roachprod environment defaults, the
# ssh key used to reach the clusters, and Google Cloud authentication.
set -exuo pipefail

# Only GCE is used by this job; the update check is disabled as well.
export ROACHPROD_DISABLED_PROVIDERS=aws,azure,ibm
export ROACHPROD_DISABLE_UPDATE_CHECK=true

# Service account and project can be overridden by the CI job through
# GOOGLE_SERVICE_ACCOUNT / GOOGLE_PROJECT; otherwise the defaults below apply.
export ROACHPROD_GCE_DEFAULT_SERVICE_ACCOUNT=${GOOGLE_SERVICE_ACCOUNT:-teamcity-pua@cockroach-ephemeral.iam.gserviceaccount.com}
export ROACHPROD_GCE_DEFAULT_PROJECT=${GOOGLE_PROJECT:-cockroach-ephemeral}

# DNS settings keep any value already present in the environment.
export ROACHPROD_DNS=${ROACHPROD_DNS:-roachprod.crdb.io}
export ROACHPROD_GCE_DNS_ZONE=${ROACHPROD_GCE_DNS_ZONE:-roachprod}
export ROACHPROD_GCE_DNS_DOMAIN=${ROACHPROD_GCE_DNS_DOMAIN:-roachprod.crdb.io}

# generate the ssh key if it doesn't exist.
if [[ ! -f ~/.ssh/id_rsa.pub ]]; then
  ssh-keygen -q -C "teamcity-pua-bazel $(date)" -N "" -f ~/.ssh/id_rsa
fi

# set up google credentials.
# xtrace is switched off while the key material is expanded; otherwise the
# whole service-account key would be printed into the build log. The braces
# plus redirect keep the `set +x` itself out of the trace.
{ set +x; } 2>/dev/null

# `:-` keeps `set -u` from aborting with "unbound variable" before the
# diagnostic below can be printed.
if [[ -z "${GOOGLE_APPLICATION_CREDENTIALS_CONTENT:-}" ]]; then
  echo 'warning: $GOOGLE_APPLICATION_CREDENTIALS_CONTENT not set' >&2
  exit 1
fi

printf '%s\n' "$GOOGLE_APPLICATION_CREDENTIALS_CONTENT" > creds.json

# The secret is on disk now; tracing is safe again from here on.
set -x
gcloud auth activate-service-account --key-file=creds.json

# Set GOOGLE_APPLICATION_CREDENTIALS so that gcp go libraries can find it.
export GOOGLE_APPLICATION_CREDENTIALS="${PWD}/creds.json"
36+
37+
#######################################
# Build roachtest, roachprod and drtprod with bazel and stage the binaries.
# Each binary is copied to both bin/ (put on PATH) and artifacts/.
# Globals:
#   BAZEL_BIN (written) - bazel-bin output directory; left global as before.
#   PATH (modified)     - "$(pwd)/bin" is appended and exported.
# Arguments:
#   None
# Returns:
#   0 on success; the status of the failing command otherwise (the caller
#   runs under `set -e`, so a failure aborts the script).
#######################################
build() {
  # All working variables are local so that the function does not clobber the
  # caller's globals, in particular the script-level `config`.
  local config="crosslinux"
  local -a components=(roachtest roachprod drtprod)
  local -a bazel_args=()
  local component src dst bin_dir

  # prepare the bin/ and artifacts/ directories.
  mkdir -p bin artifacts
  chmod o+rwx bin artifacts

  # one bazel target per component, built in a single invocation.
  for component in "${components[@]}"; do
    bazel_args+=("//pkg/cmd/${component}")
  done

  bazel build --config "$config" -c opt "${bazel_args[@]}"
  BAZEL_BIN=$(bazel info bazel-bin --config "$config" -c opt)

  # each binary lives at pkg/cmd/<name>/<name>_/<name> under bazel-bin and is
  # copied to two destinations relative to cwd.
  for component in "${components[@]}"; do
    src="${BAZEL_BIN}/pkg/cmd/${component}/${component}_/${component}"
    for dst in "bin/${component}" "artifacts/${component}"; do
      cp "$src" "$dst"
      # Make files writable to simplify cleanup and copying (e.g., scp retry).
      chmod a+w "$dst"
    done
  done

  # add bin to path. The assignment is split from the export so that a
  # failing `pwd` is not masked by export's own exit status.
  bin_dir="$(pwd)/bin"
  export PATH="${PATH}:${bin_dir}"
}
76+
77+
# Main flow of the PUA run: build the tools, execute the benchmark, convert
# the Datadog metrics into benchmark.json, publish it, and tear down the
# clusters.

# run the build function.
build

# Everything drtprod prints is appended to this file; the benchmark start
# time is parsed back out of it further down.
log_file="artifacts/pua.log"

# PUA_CONFIG selects the cluster layout; it defaults to single_region.
export config=${PUA_CONFIG:-"single_region"}
case "$config" in
  single_region)
    CLUSTER=drt-pua-9
    WORKLOAD=workload-pua-9
    ZONE_NODE=7-9
    config_file="pkg/cmd/drtprod/configs/drt_pua_9.yaml"
    ;;
  multi_region)
    CLUSTER=drt-pua-15
    WORKLOAD=workload-pua-15
    ZONE_NODE=3-4
    config_file="pkg/cmd/drtprod/configs/drt_pua_mr.yaml"
    ;;
  *)
    echo "Error: Invalid PUA_CONFIG value: '$config'. Must be 'single_region' or 'multi_region'." >&2
    exit 1
    ;;
esac

# execute the pua benchmark test.
drtprod execute "${config_file}" | tee -a "${log_file}"

# the pua dashboard uses a json file to show the benchmark results.
# we will generate the json file from the datadog metrics.
# download metric converter from gcs bucket pua-backup-us-east1.
mkdir -p datadog-metric-converter
# The URL is quoted so that `**` reaches gsutil verbatim instead of being
# offered to the local shell for pathname expansion first.
gsutil -m cp -r "gs://pua-backup-us-east1/datadog-metric-converter/**" datadog-metric-converter/

# install pip for python3.8.
# -f makes curl fail on an HTTP error instead of saving the error page as
# get-pip.py and letting python trip over it.
curl -fsS https://bootstrap.pypa.io/pip/3.8/get-pip.py -o get-pip.py
python3 get-pip.py

# install the requirements for the metric converter.
python3 -m pip install -r datadog-metric-converter/requirements.txt

# get the start and end time of the benchmark.
# The start is the epoch printed in the second bracketed field of the first
# "[Phase-1: Baseline Performance] ... Starting" line of the log. Only the
# first match is used, so repeated lines cannot break the arithmetic below.
epoch_start_time=$(awk -F'[][]' '
  index($0, "[Phase-1: Baseline Performance]") && /Starting/ { print $4; exit }
' "${log_file}")
if [[ ! "${epoch_start_time}" =~ ^[0-9]+$ ]]; then
  echo "Error: could not find the Phase-1 start timestamp in ${log_file}." >&2
  exit 1
fi
# The window starts 240s before the logged start and ends 120s before now.
# NOTE(review): the offsets look like padding for metric lag - confirm.
epoch_start_time=$((epoch_start_time - 240))
epoch_end_time=$(( $(date +%s) - 120 ))
host=$(hostname)

# generate the benchmark.json file
python3 datadog-metric-converter/convert-datadog-metric.py \
  --start-time="${epoch_start_time}" --end-time="${epoch_end_time}" \
  --cluster-name "${CLUSTER}" --workload-name "${WORKLOAD}" \
  --monitor-host "${host}" --zone-node "${ZONE_NODE}"

# delete the binaries - roachprod, roachtest and drtprod,
# as we don't need them to be uploaded to TeamCity artifacts
rm -f artifacts/roachprod artifacts/roachtest artifacts/drtprod
cp benchmark.json "artifacts/benchmark.json"

rm -rf datadog-metric-converter

# destroy the clusters.
drtprod destroy "${CLUSTER}"
drtprod destroy "${WORKLOAD}"

pkg/cmd/drtprod/cli/commands/yamlprocessor.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
"github.com/cockroachdb/cockroach/pkg/roachprod/config"
2525
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
2626
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
27+
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
2728
"github.com/cockroachdb/errors"
2829
"github.com/spf13/cobra"
2930
"golang.org/x/exp/maps"
@@ -558,7 +559,7 @@ func executeCommands(ctx context.Context, logPrefix string, cmds []*command) err
558559
fmt.Printf("[%s] Waiting for %d seconds\n", logPrefix, cmd.waitBefore)
559560
time.Sleep(time.Duration(cmd.waitBefore) * time.Second)
560561
}
561-
fmt.Printf("[%s] Starting <%v>\n", logPrefix, cmd)
562+
fmt.Printf("[%s] [%d] Starting <%v>\n", logPrefix, timeutil.Now().UTC().Unix(), cmd)
562563
err := commandExecutor(ctx, logPrefix, cmd.name, cmd.args...)
563564
if err != nil {
564565
if !cmd.continueOnFailure {
@@ -568,7 +569,7 @@ func executeCommands(ctx context.Context, logPrefix string, cmds []*command) err
568569
// Log the failure and continue if configured to do so
569570
fmt.Printf("[%s] Failed <%v>, Error Ignored: %v\n", logPrefix, cmd, err)
570571
} else {
571-
fmt.Printf("[%s] Completed <%v>\n", logPrefix, cmd)
572+
fmt.Printf("[%s] [%d] Completed <%v>\n", logPrefix, timeutil.Now().UTC().Unix(), cmd)
572573
if cmd.waitAfter > 0 {
573574
fmt.Printf("[%s] Waiting for %d seconds\n", logPrefix, cmd.waitAfter)
574575
time.Sleep(time.Duration(cmd.waitAfter) * time.Second)

pkg/cmd/drtprod/configs/drt_pua_9.yaml

Lines changed: 7 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -8,18 +8,12 @@
88
# Additionally, it configures Datadog and includes scripts for running workload and roachtest operations.
99

1010
environment:
11-
ROACHPROD_GCE_DEFAULT_SERVICE_ACCOUNT: [email protected]
12-
ROACHPROD_DNS: drt.crdb.io
13-
ROACHPROD_GCE_DNS_DOMAIN: drt.crdb.io
14-
ROACHPROD_GCE_DNS_ZONE: drt
15-
ROACHPROD_GCE_DEFAULT_PROJECT: cockroach-drt
1611
CLUSTER: drt-pua-9
1712
CLUSTER_NODES: 9
1813
WORKLOAD_CLUSTER: workload-pua-9
1914
WORKLOAD_NODES: 1
2015
STORE_COUNT: 2
21-
COCKROACH_VERSION: v25.2.0
22-
COCKROACH_UPGRADE_VERSION: v25.2.1
16+
COCKROACH_ROACHPROD_INSECURE: false
2317

2418
TPCC_WAREHOUSES: 5000
2519
TPCC_ACTIVE_WAREHOUSES: 5000
@@ -29,7 +23,7 @@ environment:
2923
CONNS: 1800
3024

3125
# GCP Cloud Storage bucket for storing backups
32-
BUCKET_US_EAST_1: cockroach-drt-backup-us-east1
26+
BUCKET_US_EAST_1: pua-backup-us-east1
3327

3428
dependent_file_locations:
3529
- artifacts/roachprod
@@ -60,7 +54,7 @@ targets:
6054
local-ssd: true
6155
gce-local-ssd-count: $STORE_COUNT
6256
username: drt
63-
lifetime: 8760h
57+
lifetime: 15h
6458
gce-image: "ubuntu-2204-jammy-v20240319"
6559
on_rollback:
6660
- command: destroy
@@ -73,7 +67,7 @@ targets:
7367
args:
7468
- $CLUSTER
7569
- release
76-
- $COCKROACH_VERSION
70+
- $CRDB_VERSION
7771
- script: "pkg/cmd/drtprod/scripts/setup_dmsetup_disk_staller"
7872
- script: "pkg/cmd/drtprod/scripts/setup_datadog_cluster"
7973
- command: start
@@ -139,7 +133,7 @@ targets:
139133
gce-machine-type: n2-standard-8
140134
os-volume-size: 100
141135
username: workload
142-
lifetime: 8760h
136+
lifetime: 15h
143137
on_rollback:
144138
- command: destroy
145139
args:
@@ -221,7 +215,7 @@ targets:
221215
active-warehouses: $TPCC_ACTIVE_WAREHOUSES
222216
duration: $RUN_DURATION
223217
ramp: 5m
224-
wait_after: true
218+
wait: true
225219
max-conn-lifetime: $MAX_CONN_LIFETIME
226220
conns: $CONNS
227221
- script: "pkg/cmd/drtprod/scripts/pua_operations.sh"
@@ -277,7 +271,7 @@ targets:
277271
args:
278272
- $CLUSTER
279273
- release
280-
- $COCKROACH_UPGRADE_VERSION
274+
- $CRDB_UPGRADE_VERSION
281275
flags:
282276
pause: 5m
283277
grace-period: 500

pkg/cmd/drtprod/configs/drt_pua_mr.yaml

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -7,18 +7,12 @@
77
# This yaml also creates a workload cluster with 3 nodes in 3 regions, 1 node in each region.
88
# This also configures datadog and scripts for running workload and roachtest operations.
99
environment:
10-
ROACHPROD_GCE_DEFAULT_SERVICE_ACCOUNT: [email protected]
11-
ROACHPROD_DNS: drt.crdb.io
12-
ROACHPROD_GCE_DNS_DOMAIN: drt.crdb.io
13-
ROACHPROD_GCE_DNS_ZONE: drt
14-
ROACHPROD_GCE_DEFAULT_PROJECT: cockroach-drt
1510
CLUSTER: drt-pua-15
1611
WORKLOAD_CLUSTER: workload-pua-15
1712
CLUSTER_NODES: 15
1813
WORKLOAD_NODES: 3
1914
STORE_COUNT: 2
20-
COCKROACH_VERSION: v25.2.0
21-
COCKROACH_UPGRADE_VERSION: v25.2.1
15+
COCKROACH_ROACHPROD_INSECURE: false
2216

2317
# variables used by tpcc_run_multiregion.sh
2418
NUM_REGIONS: 3
@@ -35,12 +29,13 @@ environment:
3529
MAX_CONN_LIFETIME: 3m
3630

3731
# GCP Cloud Storage bucket for storing locality-aware backups
38-
BUCKET_NORTH_AMERICA: cockroach-drt-backup
39-
BUCKET_US_EAST_5: cockroach-drt-backup-us-east5
40-
BUCKET_US_EAST_1: cockroach-drt-backup-us-east1
32+
BUCKET_US_CENTRAL1: pua-backup-us-central-1
33+
BUCKET_US_EAST_5: pua-backup-us-east5
34+
BUCKET_US_EAST_1: pua-backup-us-east1
4135

4236
dependent_file_locations:
4337
- artifacts/roachprod
38+
- artifacts/drtprod
4439
- artifacts/roachtest
4540
- pkg/cmd/drtprod/scripts/setup_dmsetup_disk_staller
4641
- pkg/cmd/drtprod/scripts/setup_datadog_cluster
@@ -67,7 +62,7 @@ targets:
6762
gce-local-ssd-count: $STORE_COUNT
6863
os-volume-size: 100
6964
username: drt
70-
lifetime: 8760h
65+
lifetime: 15h
7166
on_rollback:
7267
- command: destroy
7368
args:
@@ -79,7 +74,7 @@ targets:
7974
args:
8075
- $CLUSTER
8176
- release
82-
- $COCKROACH_VERSION
77+
- $CRDB_VERSION
8378
- script: "pkg/cmd/drtprod/scripts/setup_dmsetup_disk_staller"
8479
- script: "pkg/cmd/drtprod/scripts/setup_datadog_cluster"
8580
- command: start
@@ -157,7 +152,7 @@ targets:
157152
gce-machine-type: n2d-standard-4
158153
os-volume-size: 100
159154
username: workload
160-
lifetime: 8760h
155+
lifetime: 15h
161156
on_rollback:
162157
- command: destroy
163158
args:
@@ -178,6 +173,11 @@ targets:
178173
- $WORKLOAD_CLUSTER
179174
- artifacts/roachprod
180175
- roachprod
176+
- command: put
177+
args:
178+
- $WORKLOAD_CLUSTER
179+
- artifacts/drtprod
180+
- drtprod
181181
- command: put
182182
args:
183183
- $WORKLOAD_CLUSTER:1
@@ -260,7 +260,7 @@ targets:
260260
- --
261261
- -e
262262
- |
263-
BACKUP INTO ('gs://$BUCKET_NORTH_AMERICA/$CLUSTER?AUTH=implicit&COCKROACH_LOCALITY=default',
263+
BACKUP INTO ('gs://$BUCKET_US_CENTRAL1/$CLUSTER?AUTH=implicit&COCKROACH_LOCALITY=default',
264264
'gs://$BUCKET_US_EAST_5/$CLUSTER?AUTH=implicit&COCKROACH_LOCALITY=region%3Dus-east5',
265265
'gs://$BUCKET_US_EAST_1/$CLUSTER?AUTH=implicit&COCKROACH_LOCALITY=region%3Dus-east1')
266266
WITH OPTIONS (revision_history = true, detached)
@@ -283,7 +283,7 @@ targets:
283283
args:
284284
- $CLUSTER
285285
- release
286-
- $COCKROACH_UPGRADE_VERSION
286+
- $CRDB_UPGRADE_VERSION
287287
flags:
288288
pause: 5m
289289
grace-period: 500
@@ -375,3 +375,4 @@ targets:
375375
- $CLUSTER:11-15
376376
flags:
377377
restart: true
378+
wait_after: 3300

pkg/cmd/drtprod/scripts/generate_tpcc_run.sh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,11 @@ if [ -z "${PARTITION_TYPE}" ]; then
6565
fi
6666

6767
export ROACHPROD_DISABLED_PROVIDERS=IBM
68+
# Clusters created via drtprod default to secure mode. This script is intended to
69+
# run on secure clusters. In the past we have run into compatibility issues
70+
# between roachprod and drtprod. To make this script resilient to incompatibility
71+
# and allow use with insecure clusters, we can configure this env variable.
72+
export COCKROACH_ROACHPROD_INSECURE="${COCKROACH_ROACHPROD_INSECURE:-false}"
6873

6974
get_partitions_in_range() {
7075
local start=$(($1 - 1))
@@ -122,6 +127,7 @@ for ((NODE=0; NODE<WORKLOAD_NODES; NODE++)); do
122127
123128
export ROACHPROD_DISABLED_PROVIDERS=IBM
124129
export ROACHPROD_GCE_DEFAULT_PROJECT=$ROACHPROD_GCE_DEFAULT_PROJECT
130+
export COCKROACH_ROACHPROD_INSECURE="${COCKROACH_ROACHPROD_INSECURE:-false}"
125131
./drtprod sync
126132
127133
PGURLS=\$(./drtprod load-balancer pgurl $CLUSTER | sed s/\'//g)

pkg/cmd/drtprod/scripts/tpcc_init.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ if [ -z "${WORKLOAD_CLUSTER}" ]; then
3737
fi
3838

3939
export ROACHPROD_DISABLED_PROVIDERS=IBM
40+
export COCKROACH_ROACHPROD_INSECURE="${COCKROACH_ROACHPROD_INSECURE:-false}"
4041

4142
absolute_path=$(drtprod run "${WORKLOAD_CLUSTER}":1 -- "realpath ./cockroach")
4243
pwd=$(drtprod run "${WORKLOAD_CLUSTER}":1 -- "dirname ${absolute_path}")

0 commit comments

Comments
 (0)