From bbb1215edfe4209db00e327e410171fa669fefbd Mon Sep 17 00:00:00 2001
From: Thomas Hardy
Date: Thu, 4 Dec 2025 16:44:28 -0800
Subject: [PATCH 1/2] upload nightly tps metrics to s3

---
 .../workflows/nightly-throughput-stress.yml | 38 +++++++++++++++++--
 1 file changed, 35 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/nightly-throughput-stress.yml b/.github/workflows/nightly-throughput-stress.yml
index 10883b792..8d9ffc6d8 100644
--- a/.github/workflows/nightly-throughput-stress.yml
+++ b/.github/workflows/nightly-throughput-stress.yml
@@ -4,9 +4,6 @@ on:
   schedule:
     # Run at 3 AM PST (11:00 UTC) - offset from existing nightly
     - cron: '00 11 * * *'
-  push:
-    branches:
-      - add-nightly-throughput-stress-workflow
   workflow_dispatch:
     inputs:
       duration:
@@ -33,6 +30,9 @@ env:
   TEST_DURATION: ${{ inputs.duration || vars.NIGHTLY_TEST_DURATION || '5h' }}
   TEST_TIMEOUT: ${{ inputs.timeout || vars.NIGHTLY_TEST_TIMEOUT || '5h30m' }}
 
+  # AWS S3 metrics upload ARN
+  AWS_S3_METRICS_UPLOAD_ROLE_ARN: ${{ vars.AWS_S3_METRICS_UPLOAD_ROLE_ARN }}
+
   # Logging and artifacts
   WORKER_LOG_DIR: /tmp/throughput-stress-logs
 
@@ -107,6 +107,14 @@ jobs:
       - name: Install Temporal CLI
         uses: temporalio/setup-temporal@v0
 
+      - name: Install Prometheus
+        run: |
+          PROM_VERSION="3.8.0"
+          wget -q https://github.com/prometheus/prometheus/releases/download/v${PROM_VERSION}/prometheus-${PROM_VERSION}.linux-amd64.tar.gz
+          tar xzf prometheus-${PROM_VERSION}.linux-amd64.tar.gz
+          sudo mv prometheus-${PROM_VERSION}.linux-amd64/prometheus /usr/local/bin/
+          prometheus --version
+
       - name: Setup log directory
         run: mkdir -p $WORKER_LOG_DIR
 
@@ -139,6 +147,11 @@ jobs:
             --duration $TEST_DURATION \
             --timeout $TEST_TIMEOUT \
             --max-concurrent 10 \
+            --prom-listen-address 127.0.0.1:9091 \
+            --worker-prom-listen-address 127.0.0.1:9092 \
+            --prom-instance-addr 127.0.0.1:9090 \
+            --prom-instance-config \
+            --prom-export-worker-metrics $RUN_ID.parquet \
             --option internal-iterations=10 \
             --option continue-as-new-after-iterations=3 \
             --option sleep-time=1s \
@@ -146,6 +159,25 @@ jobs:
             --option min-throughput-per-hour=1000 \
             2>&1 | tee $WORKER_LOG_DIR/scenario.log
 
+      - name: Configure AWS credentials
+        if: always()
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: ${{ env.AWS_S3_METRICS_UPLOAD_ROLE_ARN }}
+          aws-region: us-west-2
+
+      - name: Upload metrics to S3
+        if: always()
+        run: |
+          DATE=$(date +%Y-%m-%d)
+          # Use test/ prefix on non-main branches
+          PREFIX="language=python/date=$DATE"
+          if [[ "${{ github.ref }}" != "refs/heads/main" ]]; then
+            PREFIX="test/$PREFIX"
+          fi
+          aws s3 cp omes/$RUN_ID.parquet \
+            "s3://cloud-data-ingest-prod/github/sdk_load_test/$PREFIX/$RUN_ID.parquet"
+
       - name: Upload logs on failure
         if: failure() || cancelled()
         uses: actions/upload-artifact@v4

From d0407c93e7d5b22c35c5164c96a3132b9220039e Mon Sep 17 00:00:00 2001
From: Thomas Hardy
Date: Fri, 5 Dec 2025 15:13:29 -0800
Subject: [PATCH 2/2] use env vars instead of var interpolation, add
 is_experiment input

---
Review note: the "Upload metrics to S3" script reads $GH_REF and
$IS_EXPERIMENT_INPUT, so this revision defines both in a step-level env
block. Without them $GH_REF expands to an empty string, the main-branch
comparison never matches, and every run (including the scheduled nightly)
is uploaded under is_experiment=true while the is_experiment input is
ignored. The diffstat and the final hunk header reflect the added lines;
the post-image blob id on the index line was not regenerated.

 .../workflows/nightly-throughput-stress.yml | 30 ++++++++++++++-----
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/nightly-throughput-stress.yml b/.github/workflows/nightly-throughput-stress.yml
index 8d9ffc6d8..b9a2314e0 100644
--- a/.github/workflows/nightly-throughput-stress.yml
+++ b/.github/workflows/nightly-throughput-stress.yml
@@ -21,9 +21,15 @@ on:
         required: false
         default: 360
         type: number
+      is_experiment:
+        description: 'Mark this run as an experiment (excluded from nightly dashboards)'
+        required: false
+        default: false
+        type: boolean
 
 permissions:
   contents: read
+  id-token: write
 
 env:
   # Workflow configuration
@@ -41,6 +47,12 @@ env:
   OMES_REF: main
   RUN_ID: ${{ github.run_id }}-throughput-stress
 
+  # Prometheus version
+  PROM_VERSION: "3.8.0"
+
+  # Language
+  SDK_LANG: "python"
+
 jobs:
   throughput-stress:
     runs-on: ubuntu-latest-4-cores
@@ -109,7 +121,6 @@ jobs:
 
       - name: Install Prometheus
         run: |
-          PROM_VERSION="3.8.0"
           wget -q https://github.com/prometheus/prometheus/releases/download/v${PROM_VERSION}/prometheus-${PROM_VERSION}.linux-amd64.tar.gz
           tar xzf prometheus-${PROM_VERSION}.linux-amd64.tar.gz
           sudo mv prometheus-${PROM_VERSION}.linux-amd64/prometheus /usr/local/bin/
@@ -141,7 +152,7 @@ jobs:
           # to give CI a bit more time for visibility consistency
           go run ./cmd run-scenario-with-worker \
             --scenario throughput_stress \
-            --language python \
+            --language $SDK_LANG \
             --version $(pwd)/.. \
             --run-id $RUN_ID \
             --duration $TEST_DURATION \
@@ -168,15 +179,20 @@ jobs:
 
       - name: Upload metrics to S3
         if: always()
+        env:
+          # Passed as env vars so the script below never interpolates expressions directly
+          GH_REF: ${{ github.ref }}
+          IS_EXPERIMENT_INPUT: ${{ inputs.is_experiment }}
         run: |
           DATE=$(date +%Y-%m-%d)
-          # Use test/ prefix on non-main branches
-          PREFIX="language=python/date=$DATE"
-          if [[ "${{ github.ref }}" != "refs/heads/main" ]]; then
-            PREFIX="test/$PREFIX"
+          IS_EXPERIMENT="false"
+          # Set as an experiment if we are not on the main branch or input as an experiment
+          if [[ "$GH_REF" != "refs/heads/main" || "$IS_EXPERIMENT_INPUT" == "true" ]]; then
+            IS_EXPERIMENT="true"
           fi
+          echo "Uploading metrics: is_experiment=$IS_EXPERIMENT, language=$SDK_LANG, date=$DATE"
           aws s3 cp omes/$RUN_ID.parquet \
-            "s3://cloud-data-ingest-prod/github/sdk_load_test/$PREFIX/$RUN_ID.parquet"
+            "s3://cloud-data-ingest-prod/github/sdk_load_test/is_experiment=$IS_EXPERIMENT/language=$SDK_LANG/date=$DATE/$RUN_ID.parquet"
 
       - name: Upload logs on failure
         if: failure() || cancelled()