Skip to content

Commit 27903f7

Browse files
authored
Add nightly throughput stress (#1173)
* add nightly throughput stress * temp push for manual testing * add fromJSON fix to parse string to int * fix, properly capture scenario exit code, correct sdk path * run failure steps even if cancelled * fix slack step - add webhook type * improve comment * update timeouts to reflect GitHub 6hour time limit on job * simplify workflow * pipe scenario failure correctly * correctly check job status * remove final reporting step, redundant
1 parent 7f794d0 commit 27903f7

File tree

1 file changed

+173
-0
lines changed

1 file changed

+173
-0
lines changed
Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
name: Nightly Throughput Stress

# Triggers: a daily schedule, pushes to one branch, and manual dispatch.
on:
  schedule:
    # Daily at 11:00 UTC (3 AM PST), offset from the existing nightly run.
    - cron: '00 11 * * *'

  # NOTE(review): this branch trigger looks like a leftover from manually
  # testing the workflow - confirm whether it is still needed.
  push:
    branches:
      - add-nightly-throughput-stress-workflow

  # Manual runs may override the duration and both timeouts.
  workflow_dispatch:
    inputs:
      duration:
        type: string
        required: false
        default: '5h'
        description: 'Test duration (e.g., 6h, 1h)'
      timeout:
        type: string
        required: false
        default: '5h30m'
        description: 'Scenario timeout (should always be greater than duration)'
      job_timeout_minutes:
        type: number
        required: false
        default: 360
        description: 'GitHub Actions job timeout in minutes'
27+
28+
env:
  # Scenario duration and timeout. Precedence: manual dispatch input, then the
  # NIGHTLY_* configuration variable, then the literal fallback.
  TEST_DURATION: "${{ inputs.duration || vars.NIGHTLY_TEST_DURATION || '5h' }}"
  TEST_TIMEOUT: "${{ inputs.timeout || vars.NIGHTLY_TEST_TIMEOUT || '5h30m' }}"

  # Directory that collects the server and scenario logs
  WORKER_LOG_DIR: /tmp/throughput-stress-logs

  # Omes repository and ref to check out, plus the run identifier
  OMES_REPO: temporalio/omes
  OMES_REF: main
  RUN_ID: "${{ github.run_id }}-throughput-stress"
40+
41+
jobs:
  # Builds the SDK from this checkout, starts a local Temporal dev server and
  # runs the Omes throughput_stress scenario against it. On failure or
  # cancellation the logs are uploaded and a Slack message is sent.
  throughput-stress:
    runs-on: ubuntu-latest-4-cores
    # fromJSON parses a string value (e.g. from a configuration variable)
    # into the number that timeout-minutes expects.
    timeout-minutes: ${{ fromJSON(inputs.job_timeout_minutes || vars.NIGHTLY_JOB_TIMEOUT_MINUTES || 360) }}

    steps:
      - name: Print test configuration
        run: |
          echo "=== Throughput Stress Test Configuration ==="
          echo "Duration: $TEST_DURATION"
          echo "Timeout: $TEST_TIMEOUT"
          echo "Run ID: $RUN_ID"
          echo "=========================================="

      - name: Checkout SDK
        uses: actions/checkout@v4
        with:
          submodules: recursive

      # Omes is checked out into ./omes, inside the SDK checkout.
      - name: Checkout OMES
        uses: actions/checkout@v4
        with:
          repository: ${{ env.OMES_REPO }}
          ref: ${{ env.OMES_REF }}
          path: omes

      # The Go version and dependency cache key both come from the Omes module.
      - name: Setup Go
        uses: actions/setup-go@v5
        with:
          go-version-file: omes/go.mod
          cache-dependency-path: omes/go.sum

      - name: Setup Rust
        uses: dtolnay/rust-toolchain@stable

      - name: Setup Rust cache
        uses: Swatinem/rust-cache@v2
        with:
          workspaces: temporalio/bridge -> target

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.13"

      - name: Install protoc
        uses: arduino/setup-protoc@v3
        with:
          version: '23.x'
          repo-token: ${{ secrets.GITHUB_TOKEN }}

      - name: Setup uv
        uses: astral-sh/setup-uv@v5

      - name: Install poethepoet
        run: uv tool install poethepoet

      - name: Install dependencies
        run: uv sync --all-extras

      - name: Build SDK
        run: poe build-develop

      - name: Install Temporal CLI
        uses: temporalio/setup-temporal@v0

      - name: Setup log directory
        run: mkdir -p "$WORKER_LOG_DIR"

      # The dev server is started in the background; all of its output is
      # redirected to the log directory so it can be uploaded on failure.
      - name: Start Temporal Server
        run: |
          temporal server start-dev \
            --db-filename temporal-throughput-stress.sqlite \
            --sqlite-pragma journal_mode=WAL \
            --sqlite-pragma synchronous=OFF \
            --headless &> "$WORKER_LOG_DIR/temporal-server.log" &

      - name: Run throughput stress scenario with local SDK
        working-directory: omes
        run: |
          # With pipefail, the pipeline's exit status is that of the last
          # command to exit non-zero. Without it, only the exit status of
          # `tee` would be used, which hides a scenario failure whenever
          # `tee` itself succeeds.
          set -o pipefail

          # Use run-scenario-with-worker to build and run in one step
          # Pass the SDK directory as --version for local testing
          # (this step runs inside ./omes, so the SDK is the parent directory)
          # Note: The hardcoded values below match OMES defaults, except:
          # - visibility-count-timeout: 5m (vs 3m default)
          #   to give CI a bit more time for visibility consistency
          # Expansions are quoted so values are passed as single arguments.
          go run ./cmd run-scenario-with-worker \
            --scenario throughput_stress \
            --language python \
            --version "$(pwd)/.." \
            --run-id "$RUN_ID" \
            --duration "$TEST_DURATION" \
            --timeout "$TEST_TIMEOUT" \
            --max-concurrent 10 \
            --option internal-iterations=10 \
            --option continue-as-new-after-iterations=3 \
            --option sleep-time=1s \
            --option visibility-count-timeout=5m \
            --option min-throughput-per-hour=1000 \
            2>&1 | tee "$WORKER_LOG_DIR/scenario.log"

      # Runs when an earlier step failed or the run was cancelled.
      - name: Upload logs on failure
        if: failure() || cancelled()
        uses: actions/upload-artifact@v4
        with:
          name: throughput-stress-logs
          path: ${{ env.WORKER_LOG_DIR }}
          retention-days: 30

      # Same condition as the log upload; posts to the incoming webhook
      # stored in the SLACK_SDK_ALERTS_WEBHOOK secret.
      - name: Notify Slack on failure
        if: failure() || cancelled()
        uses: slackapi/slack-github-action@v2
        with:
          webhook-type: incoming-webhook
          payload: |
            {
              "text": "Nightly Python throughput stress test failed",
              "blocks": [
                {
                  "type": "section",
                  "text": {
                    "type": "mrkdwn",
                    "text": "*Nightly Throughput Stress Failed* :x:\n\n*Duration:* ${{ env.TEST_DURATION }}\n*Run:* <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View Logs>\n*Triggered by:* ${{ github.event_name == 'schedule' && 'Scheduled' || github.actor }}"
                  }
                }
              ]
            }
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_SDK_ALERTS_WEBHOOK }}

0 commit comments

Comments
 (0)