# .github/workflows/nightly-throughput-stress.yml
#
# Runs the OMES `throughput_stress` scenario against the SDK checked out from
# this repository, using a local Temporal dev server. The scenario step is
# allowed to fail without stopping the job so that logs can be uploaded and a
# Slack alert sent; the final step then turns the recorded result into the
# job's pass/fail status.
name: Nightly Throughput Stress

on:
  schedule:
    # Run at 3 AM PST (11:00 UTC) - offset from existing nightly
    - cron: '00 11 * * *'
  # NOTE(review): this push trigger targets a single feature branch and looks
  # like a development aid - confirm whether it should remain.
  push:
    branches:
      - add-nightly-throughput-stress-workflow
  workflow_dispatch:
    inputs:
      duration:
        description: 'Test duration (e.g., 6h, 1h)'
        required: false
        default: '6h'
        type: string
      timeout:
        description: 'Scenario timeout (should always be 30m more than duration)'
        required: false
        default: '6h30m'
        type: string
      job_timeout_minutes:
        description: 'GitHub Actions job timeout in minutes'
        required: false
        default: 420
        type: number

env:
  # Workflow configuration.
  # Precedence: manual dispatch input, then repository variable, then default.
  TEST_DURATION: ${{ inputs.duration || vars.NIGHTLY_TEST_DURATION || '6h' }}
  TEST_TIMEOUT: ${{ inputs.timeout || vars.NIGHTLY_TEST_TIMEOUT || '6h30m' }}

  # Logging and artifacts
  WORKER_LOG_DIR: /tmp/throughput-stress-logs

  # Omes configuration
  OMES_REPO: temporalio/omes
  OMES_REF: main
  RUN_ID: ${{ github.run_id }}-throughput-stress

jobs:
  throughput-stress:
    runs-on: ubuntu-latest-4-cores
    # Same precedence as the env block above: input, repository variable, 420.
    timeout-minutes: ${{ fromJSON(inputs.job_timeout_minutes || vars.NIGHTLY_JOB_TIMEOUT_MINUTES || 420) }}

    steps:
      - name: Print test configuration
        run: |
          echo "=== Throughput Stress Test Configuration ==="
          echo "Duration: $TEST_DURATION"
          echo "Timeout: $TEST_TIMEOUT"
          echo "Run ID: $RUN_ID"
          echo "=========================================="

      - name: Checkout SDK
        uses: actions/checkout@v4
        with:
          submodules: recursive

      # OMES is checked out into ./omes, inside the SDK checkout.
      - name: Checkout OMES
        uses: actions/checkout@v4
        with:
          repository: ${{ env.OMES_REPO }}
          ref: ${{ env.OMES_REF }}
          path: omes

      - name: Setup Go
        uses: actions/setup-go@v5
        with:
          go-version-file: omes/go.mod
          cache-dependency-path: omes/go.sum

      - name: Setup Rust
        uses: dtolnay/rust-toolchain@stable

      - name: Setup Rust cache
        uses: Swatinem/rust-cache@v2
        with:
          workspaces: temporalio/bridge -> target

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.13"

      - name: Install protoc
        uses: arduino/setup-protoc@v3
        with:
          version: '23.x'
          repo-token: ${{ secrets.GITHUB_TOKEN }}

      - name: Setup uv
        uses: astral-sh/setup-uv@v5

      - name: Install poethepoet
        run: uv tool install poethepoet

      - name: Install dependencies
        run: uv sync --all-extras

      - name: Build SDK
        run: poe build-develop

      - name: Install Temporal CLI
        uses: temporalio/setup-temporal@v0

      - name: Setup log directory
        run: mkdir -p "$WORKER_LOG_DIR"

      # The server is started in the background; its output goes to the log
      # directory that is uploaded on failure.
      - name: Start Temporal Server
        run: |
          temporal server start-dev \
            --db-filename temporal-throughput-stress.sqlite \
            --sqlite-pragma journal_mode=WAL \
            --sqlite-pragma synchronous=OFF \
            --headless &> "$WORKER_LOG_DIR/temporal-server.log" &

      - name: Run throughput stress scenario with local SDK
        working-directory: omes
        # Do not stop the job here: the steps below upload logs, alert Slack
        # and then fail the job explicitly based on SCENARIO_EXIT_CODE.
        continue-on-error: true
        run: |
          # Use run-scenario-with-worker to build and run in one step
          # Pass the SDK directory as --version for local testing
          # Note: The hardcoded values below match OMES defaults, except:
          # - visibility-count-timeout: 5m (vs 3m default)
          #   to give CI a bit more time for visibility consistency
          go run ./cmd run-scenario-with-worker \
            --scenario throughput_stress \
            --language python \
            --version "$(pwd)/.." \
            --run-id "$RUN_ID" \
            --duration "$TEST_DURATION" \
            --timeout "$TEST_TIMEOUT" \
            --max-concurrent 10 \
            --option internal-iterations=10 \
            --option continue-as-new-after-iterations=3 \
            --option sleep-time=1s \
            --option visibility-count-timeout=5m \
            --option min-throughput-per-hour=1000 \
            2>&1 | tee "$WORKER_LOG_DIR/scenario.log"

          # PIPESTATUS[0] is the exit status of `go run`, not of `tee`.
          # This relies on the default `run` shell not enabling pipefail, so
          # that execution reaches this line even when the scenario fails.
          echo "SCENARIO_EXIT_CODE=${PIPESTATUS[0]}" >> "$GITHUB_ENV"

      # `!cancelled()` replaces the implicit `success()` check so this step also
      # runs when an earlier step failed and SCENARIO_EXIT_CODE was never set.
      - name: Upload logs on failure
        if: ${{ !cancelled() && env.SCENARIO_EXIT_CODE != '0' }}
        uses: actions/upload-artifact@v4
        with:
          name: throughput-stress-logs
          path: ${{ env.WORKER_LOG_DIR }}
          retention-days: 30

      # Same condition as the upload step: alert on scenario failures and on
      # failures of any earlier setup step, but not on cancelled runs.
      - name: Notify Slack on failure
        if: ${{ !cancelled() && env.SCENARIO_EXIT_CODE != '0' }}
        uses: slackapi/slack-github-action@v2
        with:
          webhook-type: incoming-webhook
          payload: |
            {
              "text": "Nightly Python throughput stress test failed",
              "blocks": [
                {
                  "type": "section",
                  "text": {
                    "type": "mrkdwn",
                    "text": "*Nightly Throughput Stress Failed* :x:\n\n*Duration:* ${{ env.TEST_DURATION }}\n*Run:* <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View Logs>\n*Triggered by:* ${{ github.event_name == 'schedule' && 'Scheduled' || github.actor }}"
                  }
                }
              ]
            }
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_SDK_ALERTS_WEBHOOK }}

      # Runs unconditionally so the job result always reflects the scenario
      # outcome; an unset exit code (scenario never ran) counts as a failure.
      - name: Fail if scenario failed
        if: always()
        run: |
          if [ "${SCENARIO_EXIT_CODE:-1}" != "0" ]; then
            echo "❌ Throughput stress test failed with exit code ${SCENARIO_EXIT_CODE:-unknown}"
            echo "Check the artifacts for detailed logs and state"
            exit 1
          else
            echo "✅ Throughput stress test completed successfully"
          fi