# Source: NVIDIA/Megatron-LM CI workflow, as of PR #10922
# ("Use pg_collection for dp_cp/expt AG groups; drop parallel_state AG globals").
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
| name: CICD Megatron-LM | |
| on: | |
| schedule: | |
| - cron: 0 0 * * * | |
| push: | |
| branches: | |
| - "pull-request/[0-9]+" | |
| - "deploy-release/*" | |
| merge_group: | |
| types: [checks_requested] | |
| workflow_dispatch: | |
| concurrency: | |
| group: ${{ github.workflow }}-${{ github.head_ref || github.event.merge_group.head_ref || github.ref }} | |
| cancel-in-progress: true | |
| permissions: | |
| id-token: write | |
| contents: read | |
| env: | |
| container-registry: 766267172432.dkr.ecr.us-east-1.amazonaws.com | |
| container-registry-gb200: us-east4-docker.pkg.dev/nv-projdgxchipp-20260113193621/megatron-lm | |
| jobs: | |
| is-not-external-contributor: | |
| runs-on: ubuntu-latest | |
| if: github.repository == 'NVIDIA/Megatron-LM' | |
| outputs: | |
| is_external_contributor: ${{ github.event.pull_request.user.type == 'User' }} | |
| is_maintainer: ${{ steps.check-membership.outputs.is_maintainer }} | |
| selected_runner: ${{ steps.check-membership.outputs.is_maintainer == 'true' && 'nvidia-ci-aws-gpu-x8' || 'nvidia-ci-aws-gpu-x8-ephemeral' }} | |
| selected_runner_gb200: ${{ steps.check-membership.outputs.is_maintainer == 'true' && 'nvidia-ci-gcp-gpu-x4' || 'ubuntu-latest' }} | |
| permissions: | |
| issues: write | |
| pull-requests: write | |
| env: | |
| GITHUB_TOKEN: ${{ secrets.PAT }} | |
| REPO: ${{ github.repository }} | |
| DISABLE_EXTERNAL_CONTRIBUTOR: ${{ vars.DISABLE_EXTERNAL_CONTRIBUTOR }} | |
| steps: | |
| - name: Checkout repository | |
| uses: actions/checkout@v6 | |
| with: | |
| token: ${{ env.GITHUB_TOKEN }} | |
| - name: Get PR info | |
| id: get-pr-info | |
| if: startsWith(github.ref, 'refs/heads/pull-request/') && github.event_name == 'push' | |
| uses: nv-gha-runners/get-pr-info@main | |
| - name: Check NVIDIA SSO membership | |
| id: check-sso | |
| uses: ./.github/actions/check-nvidia-sso-membership | |
| with: | |
| username: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').user.login }} | |
| github_token: ${{ secrets.NVIDIA_MANAGEMENT_ORG_PAT }} | |
| sso_users_filename: ${{ vars.SSO_USERS_FILENAME }} | |
| - name: Set maintainer status | |
| id: check-membership | |
| env: | |
| IS_MAIN_BRANCH: ${{ github.ref == 'refs/heads/main' }} | |
| IS_MERGE_GROUP: ${{ github.event_name == 'merge_group' }} | |
| SCHEDULED_JOB: ${{ github.event_name == 'schedule' }} | |
| run: | | |
| # Skip SSO check for scheduled jobs, main branch, or merge groups | |
| if [ "${{ env.SCHEDULED_JOB }}" == "true" ] || [ "${IS_MAIN_BRANCH}" == "true" ] || [ "${IS_MERGE_GROUP}" == "true" ]; then | |
| echo "is_maintainer=true" | tee -a $GITHUB_OUTPUT | |
| exit 0 | |
| fi | |
| # Use SSO membership check result | |
| IS_MEMBER="${{ steps.check-sso.outputs.is_member }}" | |
| # If external contributor is disabled, check if user is a repo collaborator or an org collaborator to NVIDIA or NVIDIA-NeMo | |
| if [ "${{ env.DISABLE_EXTERNAL_CONTRIBUTOR }}" == "true" ] && [ "${{ steps.check-sso.outputs.is_member }}" != "true" ]; then | |
| PR_AUTHOR=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').user.login }} | |
| echo "Checking if $PR_AUTHOR is a repo collaborator..." | |
| API_URL="https://api.github.com/repos/$REPO/collaborators/$PR_AUTHOR" | |
| REPO_MEMBERSHIP_RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -L \ | |
| -H "Accept: application/vnd.github+json" \ | |
| -H "Authorization: Bearer $GITHUB_TOKEN" \ | |
| -H "X-GitHub-Api-Version: 2022-11-28" \ | |
| $API_URL) | |
| echo "Checking if $PR_AUTHOR is an org collaborator to NVIDIA-NeMo..." | |
| API_URL="https://api.github.com/orgs/NVIDIA-NeMo/members/$PR_AUTHOR" | |
| ORG_NVIDIA_NEMO_MEMBERSHIP_RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -L \ | |
| -H "Accept: application/vnd.github+json" \ | |
| -H "Authorization: Bearer $GITHUB_TOKEN" \ | |
| -H "X-GitHub-Api-Version: 2022-11-28" \ | |
| $API_URL) | |
| echo "Checking if $PR_AUTHOR is an org collaborator to NVIDIA..." | |
| API_URL="https://api.github.com/orgs/NVIDIA/members/$PR_AUTHOR" | |
| ORG_NVIDIA_MEMBERSHIP_RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -L \ | |
| -H "Accept: application/vnd.github+json" \ | |
| -H "Authorization: Bearer $GITHUB_TOKEN" \ | |
| -H "X-GitHub-Api-Version: 2022-11-28" \ | |
| $API_URL) | |
| if [ "$REPO_MEMBERSHIP_RESPONSE" -eq 204 ] || [ "$ORG_NVIDIA_NEMO_MEMBERSHIP_RESPONSE" -eq 204 ] || [ "$ORG_NVIDIA_MEMBERSHIP_RESPONSE" -eq 204 ]; then | |
| IS_MEMBER="true" | |
| else | |
| exit 1 | |
| fi | |
| fi | |
| # Use SSO membership check result | |
| if [ "$IS_MEMBER" == "true" ]; then | |
| echo "is_maintainer=true" | tee -a $GITHUB_OUTPUT | |
| else | |
| echo "is_maintainer=false" | tee -a $GITHUB_OUTPUT | |
| fi | |
| pre-flight: | |
| needs: [is-not-external-contributor] | |
| if: github.repository == 'NVIDIA/Megatron-LM' | |
| uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.73.2 | |
| configure: | |
| runs-on: ubuntu-latest | |
| needs: [pre-flight] | |
| if: github.repository == 'NVIDIA/Megatron-LM' | |
| outputs: | |
| scope: ${{ steps.configure.outputs.scope }} | |
| n_repeat: ${{ steps.configure.outputs.n_repeat }} | |
| lightweight: ${{ steps.configure.outputs.lightweight }} | |
| lts: ${{ steps.configure.outputs.lts }} | |
| mbridge_suite: ${{ steps.configure.outputs.mbridge_suite }} | |
| dev: ${{ steps.configure.outputs.dev }} | |
| steps: | |
| - name: Get PR info | |
| id: get-pr-info | |
| if: startsWith(github.ref, 'refs/heads/pull-request/') && github.event_name == 'push' | |
| uses: nv-gha-runners/get-pr-info@main | |
| - name: Configure | |
| id: configure | |
| shell: bash -x -e -u -o pipefail {0} | |
| env: | |
| GH_TOKEN: ${{ secrets.PAT }} | |
| IS_CI_WORKLOAD: ${{ needs.pre-flight.outputs.is_ci_workload }} | |
| IS_MERGE_GROUP: ${{ needs.pre-flight.outputs.is_merge_group }} | |
| run: | | |
| PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }} | |
| # Fetch all labels in a single API call; fall back to empty list if no PR | |
| LABELS=$(gh pr view $PR_NUMBER --repo ${{ github.repository }} --json labels --jq '[.labels[].name]') || LABELS='[]' | |
| HAS_RUN_TESTS=$(echo "$LABELS" | jq 'any(. == "Run tests")') | |
| HAS_RUN_FUNCTIONAL=$(echo "$LABELS" | jq 'any(. == "Run functional tests")') | |
| HAS_LTS=$(echo "$LABELS" | jq 'any(. == "container::lts")') | |
| HAS_MBRIDGE=$(echo "$LABELS" | jq 'any(. == "Run MBridge tests")') | |
| # Scheduled/CI workloads have no PR — treat as "Run functional tests" | |
| [ "$IS_CI_WORKLOAD" == "true" ] && HAS_RUN_FUNCTIONAL=true | |
| if [ "$IS_MERGE_GROUP" == "true" ]; then | |
| SCOPE=mr-github; N_REPEAT=1; LIGHTWEIGHT=false | |
| elif [ "$HAS_RUN_TESTS" == "true" ]; then | |
| SCOPE=mr-github; N_REPEAT=1; LIGHTWEIGHT=true | |
| elif [ "$HAS_RUN_FUNCTIONAL" == "true" ]; then | |
| SCOPE=mr-github; N_REPEAT=5; LIGHTWEIGHT=false | |
| else | |
| SCOPE=mr-github-slim; N_REPEAT=5; LIGHTWEIGHT=false | |
| fi | |
| if [ "$HAS_MBRIDGE" == "true" || $IS_MERGE_GROUP == "true" ]; then | |
| MBRIDGE_SUITE="L1" | |
| else | |
| MBRIDGE_SUITE="unit-only" | |
| fi | |
| DEV=true | |
| echo "scope=$SCOPE" | tee -a $GITHUB_OUTPUT | |
| echo "n_repeat=$N_REPEAT" | tee -a $GITHUB_OUTPUT | |
| echo "lightweight=$LIGHTWEIGHT" | tee -a $GITHUB_OUTPUT | |
| echo "lts=$HAS_LTS" | tee -a $GITHUB_OUTPUT | |
| echo "mbridge_suite=$MBRIDGE_SUITE" | tee -a $GITHUB_OUTPUT | |
| echo "dev=$DEV" | tee -a $GITHUB_OUTPUT | |
| # Pre-compute active row markers for the decision tree | |
| _MG=$( [ "$IS_MERGE_GROUP" == "true" ] && echo "**→**" || echo "" ) | |
| _RT=$( [ "$IS_MERGE_GROUP" != "true" ] && [ "$HAS_RUN_TESTS" == "true" ] && echo "**→**" || echo "" ) | |
| _RF=$( [ "$IS_MERGE_GROUP" != "true" ] && [ "$HAS_RUN_TESTS" != "true" ] && [ "$HAS_RUN_FUNCTIONAL" == "true" ] && echo "**→**" || echo "" ) | |
| _DF=$( [ "$SCOPE" == "mr-github-slim" ] && echo "**→**" || echo "" ) | |
| _LTS=$( [ "$HAS_LTS" == "true" ] && echo "**→**" || echo "" ) | |
| _DEV=$( [ "$HAS_LTS" != "true" ] && echo "**→**" || echo "" ) | |
| cat <<SUMMARY >> $GITHUB_STEP_SUMMARY | |
| Beep boop 🤖 I have consulted the labels and decided to run **$SCOPE** $( [ "$LIGHTWEIGHT" == "true" ] && echo "in lightweight mode " || echo "" )against the **$( [ "$HAS_LTS" == "true" ] && echo "lts" || echo "dev" )** container with **$N_REPEAT** repetition(s). You are welcome. | |
| | Setting | Value | | |
| |---|---| | |
| | \`scope\` | \`$SCOPE\` | | |
| | \`n_repeat\` | \`$N_REPEAT\` | | |
| | \`lightweight\` | \`$LIGHTWEIGHT\` | | |
| | \`lts\` | \`$HAS_LTS\` | | |
| | \`dev\` | \`$DEV\` | | |
| | \`mbridge_suite\` | \`$MBRIDGE_SUITE\` | | |
| ### Decision tree | |
| **Test scope** | |
| | | Trigger | \`scope\` | \`n_repeat\` | \`lightweight\` | | |
| |---|---|---|---|---| | |
| | $_MG | Merge group | \`mr-github\` | \`1\` | \`false\` | | |
| | $_RT | Label: _Run tests_ | \`mr-github\` | \`1\` | \`true\` | | |
| | $_RF | Label: _Run functional tests_ / CI workload | \`mr-github\` | \`5\` | \`false\` | | |
| | $_DF | _(default)_ | \`mr-github-slim\` | \`5\` | \`false\` | | |
| **Container image** | |
| | | Trigger | \`image\` | | |
| |---|---|---| | |
| | $_LTS | Label: _container::lts_ | \`lts\` | | |
| | $_DEV | _(default)_ | \`dev\` | | |
| ### Glossary | |
| - **\`lightweight\`**: trains for 4 steps instead of 100 and skips comparison against golden values — faster feedback, no correctness guarantees | |
| - **\`lts\`**: uses the Long Term Support container base image instead of the latest dev image | |
| - **\`dev\`**: uses the latest development container base image (default) | |
| SUMMARY | |
| linting: | |
| runs-on: ubuntu-latest | |
| needs: [pre-flight] | |
| if: | | |
| ( | |
| needs.pre-flight.outputs.is_deployment_workflow == 'false' | |
| && needs.pre-flight.outputs.is_ci_workload == 'true' | |
| ) || ( | |
| needs.pre-flight.outputs.is_deployment_workflow == 'false' | |
| && needs.pre-flight.outputs.is_ci_workload == 'false' | |
| && needs.pre-flight.outputs.docs_only == 'false' | |
| ) | |
| steps: | |
| - name: Checkout | |
| uses: actions/checkout@v6 | |
| with: | |
| fetch-depth: 0 | |
| - name: Install uv | |
| uses: astral-sh/setup-uv@v1 | |
| with: | |
| version: 0.7.2 | |
| - name: Install linting tools | |
| run: | | |
| uv sync --locked --only-group linting | |
| - name: Get PR info | |
| id: get-pr-info | |
| if: startsWith(github.ref, 'refs/heads/pull-request/') && github.event_name == 'push' | |
| uses: nv-gha-runners/get-pr-info@main | |
| - name: Run linting | |
| if: startsWith(github.ref, 'refs/heads/pull-request/') && github.event_name == 'push' | |
| run: | | |
| export PATH=".venv/bin:$PATH" | |
| export GITLAB_ENDPOINT=github.com | |
| export CI_PROJECT_NAMESPACE=NVIDIA | |
| export BASE_REF="${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.ref }}" | |
| export CHECK_ONLY=true | |
| export SKIP_DOCS=false | |
| bash tools/autoformat.sh | |
| cicd-wait-in-queue: | |
| runs-on: ubuntu-latest | |
| needs: [pre-flight, linting] | |
| environment: "test" | |
| if: | | |
| !(needs.pre-flight.outputs.is_ci_workload == 'true' | |
| || needs.pre-flight.outputs.is_deployment_workflow == 'true' | |
| || needs.pre-flight.outputs.is_merge_group == 'true' | |
| || needs.pre-flight.outputs.docs_only == 'true') | |
| steps: | |
| - name: Running CI tests | |
| run: | | |
| echo "Running CI tests" | |
| echo "is_merge_group: ${{ needs.pre-flight.outputs.is_merge_group }}" | |
| cicd-parse-downstream-testing: | |
| runs-on: ubuntu-latest | |
| needs: | |
| - pre-flight | |
| - configure | |
| - cicd-wait-in-queue | |
| if: | | |
| needs.pre-flight.result != 'cancelled' | |
| && needs.configure.result != 'cancelled' | |
| && needs.cicd-wait-in-queue.result != 'cancelled' | |
| && ( | |
| success() | |
| || needs.pre-flight.outputs.is_ci_workload == 'true' | |
| || needs.pre-flight.outputs.force_run_all == 'true' | |
| || needs.pre-flight.outputs.is_merge_group == 'true' | |
| ) | |
| && !cancelled() | |
| outputs: | |
| mbridge-test-suite: ${{ needs.configure.outputs.mbridge_suite }} | |
| steps: | |
| - name: Checkout | |
| uses: actions/checkout@v6 | |
| - name: How-To | |
| run: bash .github/scripts/readme.sh | |
| cicd-mbridge-testing: | |
| runs-on: ubuntu-latest | |
| needs: | |
| - pre-flight | |
| - cicd-wait-in-queue | |
| - cicd-parse-downstream-testing | |
| if: | | |
| needs.pre-flight.result != 'cancelled' | |
| && needs.cicd-wait-in-queue.result != 'cancelled' | |
| && needs.cicd-parse-downstream-testing.result != 'cancelled' | |
| && ( | |
| success() | |
| || needs.pre-flight.outputs.is_ci_workload == 'true' | |
| || needs.pre-flight.outputs.force_run_all == 'true' | |
| || needs.pre-flight.outputs.is_merge_group == 'true' | |
| ) | |
| && !cancelled() | |
| steps: | |
| - name: Get PR info | |
| id: get-pr-info | |
| if: startsWith(github.ref, 'refs/heads/pull-request/') && github.event_name == 'push' | |
| uses: nv-gha-runners/get-pr-info@main | |
| - name: Checkout MBridge and create testing branch | |
| uses: actions/checkout@v6 | |
| with: | |
| ref: main | |
| repository: NVIDIA-NeMo/Megatron-Bridge | |
| path: megatron-bridge | |
| token: ${{ secrets.PAT }} | |
| - name: Create testing branch | |
| env: | |
| MBRIDGE_BRANCH_NAME: mcore-testing-${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || github.run_id }} | |
| run: | | |
| cd megatron-bridge | |
| git fetch origin main | |
| git checkout -b ${{ env.MBRIDGE_BRANCH_NAME }} origin/main | |
| git push origin ${{ env.MBRIDGE_BRANCH_NAME }} --force | |
| - name: Get merge commit sha | |
| shell: bash -x -e -u -o pipefail {0} | |
| id: sha | |
| env: | |
| IS_PR: ${{ startsWith(github.ref, 'refs/heads/pull-request/') }} | |
| IS_MERGE_GROUP: ${{ github.event_name == 'merge_group' }} | |
| run: | | |
| if [[ "$IS_PR" == "true" ]]; then | |
| SHA=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').merge_commit_sha }} | |
| elif [[ "$IS_MERGE_GROUP" == "true" ]]; then | |
| SHA=${{ github.event.merge_group.head_sha }} | |
| else | |
| SHA=${GITHUB_SHA} | |
| fi | |
| echo "main=${SHA}" | tee -a "$GITHUB_OUTPUT" | |
| - name: Trigger MBridge tests | |
| uses: convictional/trigger-workflow-and-wait@v1.6.5 | |
| env: | |
| MBRIDGE_BRANCH_NAME: mcore-testing-${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || github.run_id }} | |
| with: | |
| owner: NVIDIA-NeMo | |
| repo: Megatron-Bridge | |
| workflow_file_name: cicd-main.yml | |
| github_token: ${{ secrets.PAT }} | |
| ref: ${{ env.MBRIDGE_BRANCH_NAME }} | |
| wait_interval: 60 | |
| propagate_failure: true | |
| client_payload: | | |
| { | |
| "mcore_ref": "${{ steps.sha.outputs.main }}", | |
| "test_suite": "${{ needs.cicd-parse-downstream-testing.outputs.mbridge-test-suite }}", | |
| "triggered_by": "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" | |
| } | |
| - name: Delete testing branch | |
| if: always() | |
| env: | |
| MBRIDGE_BRANCH_NAME: mcore-testing-${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || github.run_id }} | |
| run: | | |
| cd megatron-bridge | |
| git push origin --delete ${{ env.MBRIDGE_BRANCH_NAME }} | |
| cicd-compute-build-matrix: | |
| runs-on: ubuntu-latest | |
| needs: [is-not-external-contributor] | |
| outputs: | |
| matrix: ${{ steps.compute.outputs.matrix }} | |
| steps: | |
| - name: Compute build matrix | |
| id: compute | |
| env: | |
| IS_MAINTAINER: ${{ needs.is-not-external-contributor.outputs.is_maintainer }} | |
| SELECTED_RUNNER: ${{ needs.is-not-external-contributor.outputs.selected_runner }} | |
| SELECTED_RUNNER_GB200: ${{ needs.is-not-external-contributor.outputs.selected_runner_gb200 }} | |
| REGISTRY_AWS: ${{ env.container-registry }} | |
| REGISTRY_GCP: ${{ env.container-registry-gb200 }} | |
| run: | | |
| AWS_ENTRY=$(jq -nc --arg registry "$REGISTRY_AWS" --arg runner "$SELECTED_RUNNER" \ | |
| '{"cloud": "aws", "registry": $registry, "runner": $runner}') | |
| if [ "$IS_MAINTAINER" == "true" ]; then | |
| GCP_ENTRY=$(jq -nc --arg registry "$REGISTRY_GCP" --arg runner "$SELECTED_RUNNER_GB200" \ | |
| '{"cloud": "gcp", "registry": $registry, "runner": $runner}') | |
| MATRIX=$(jq -nc --argjson aws "$AWS_ENTRY" --argjson gcp "$GCP_ENTRY" \ | |
| '{"include": [$aws, $gcp]}') | |
| else | |
| MATRIX=$(jq -nc --argjson aws "$AWS_ENTRY" '{"include": [$aws]}') | |
| fi | |
| echo "matrix=$MATRIX" | tee -a "$GITHUB_OUTPUT" | |
| cicd-container-build: | |
| needs: [is-not-external-contributor, pre-flight, configure, cicd-wait-in-queue, cicd-compute-build-matrix] | |
| strategy: | |
| fail-fast: false | |
| matrix: ${{ fromJson(needs.cicd-compute-build-matrix.outputs.matrix) }} | |
| runs-on: ${{ matrix.runner }} | |
| if: | | |
| needs.is-not-external-contributor.result != 'cancelled' | |
| && needs.pre-flight.result != 'cancelled' | |
| && needs.cicd-wait-in-queue.result != 'cancelled' | |
| && needs.cicd-compute-build-matrix.result != 'cancelled' | |
| && ( | |
| success() | |
| || needs.pre-flight.outputs.is_ci_workload == 'true' | |
| || needs.pre-flight.outputs.is_merge_group == 'true' | |
| || needs.pre-flight.outputs.force_run_all == 'true' | |
| ) | |
| && !cancelled() | |
| steps: | |
| - name: Get PR info | |
| id: get-pr-info | |
| if: startsWith(github.ref, 'refs/heads/pull-request/') && github.event_name == 'push' | |
| uses: nv-gha-runners/get-pr-info@main | |
| - name: Get merge commit sha | |
| shell: bash -x -e -u -o pipefail {0} | |
| id: sha | |
| env: | |
| IS_PR: ${{ startsWith(github.ref, 'refs/heads/pull-request/') }} | |
| IS_MERGE_GROUP: ${{ github.event_name == 'merge_group' }} | |
| run: | | |
| if [[ "$IS_PR" == "true" ]]; then | |
| SHA=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').merge_commit_sha }} | |
| elif [[ "$IS_MERGE_GROUP" == "true" ]]; then | |
| SHA=${{ github.event.merge_group.head_sha }} | |
| else | |
| SHA=${GITHUB_SHA} | |
| fi | |
| echo "main=${SHA}" | tee -a "$GITHUB_OUTPUT" | |
| - name: Checkout | |
| uses: actions/checkout@v6 | |
| with: | |
| ref: ${{ steps.sha.outputs.main }} | |
| - name: Setup python | |
| uses: actions/setup-python@v6 | |
| with: | |
| python-version: 3.12 | |
| - name: Install GH CLI | |
| shell: bash -x -e -u -o pipefail {0} | |
| run: | | |
| apt-get update | |
| apt-get install -y gh | |
| - name: Download test data | |
| shell: bash | |
| run: | | |
| echo "::group::Download test data" | |
| pip install --no-cache-dir click requests | |
| python tests/test_utils/python_scripts/download_unit_tests_dataset.py --assets-dir ./assets | |
| echo "::endgroup::" | |
| - name: Install GH CLI | |
| shell: bash | |
| run: | | |
| apt-get update | |
| apt-get install -y gh | |
| - name: Get last merged PR | |
| id: cache_from | |
| env: | |
| GH_TOKEN: ${{ github.token }} | |
| run: | | |
| LAST_PRS=$(gh api graphql -f query=' | |
| query { | |
| repository(owner: "NVIDIA", name: "Megatron-LM") { | |
| pullRequests(states: MERGED, first: 100, orderBy: {field: UPDATED_AT, direction: DESC}) { | |
| nodes { | |
| number | |
| } | |
| } | |
| } | |
| }' | jq -r '.data.repository.pullRequests.nodes[].number' | while read -r number; do | |
| echo "type=registry,ref=${{ matrix.registry }}/megatron-lm:$number-buildcache,mode=max" | |
| done) | |
| echo "LAST_PRS<<EOF" | tee -a $GITHUB_OUTPUT | |
| echo "$LAST_PRS" | tee -a $GITHUB_OUTPUT | |
| echo "EOF" | tee -a $GITHUB_OUTPUT | |
| - name: Parse baseimage | |
| shell: bash | |
| id: base-image | |
| env: | |
| HAS_LTS_LABEL: ${{ needs.configure.outputs.lts }} | |
| run: | | |
| if [ "$HAS_LTS_LABEL" == "true" ]; then | |
| NGC_VERSION=$(cat docker/.ngc_version.lts) | |
| echo "version=$NGC_VERSION" | tee -a $GITHUB_OUTPUT | |
| echo "image_type=lts" | tee -a $GITHUB_OUTPUT | |
| else | |
| NGC_VERSION=$(cat docker/.ngc_version.dev) | |
| echo "version=$NGC_VERSION" | tee -a $GITHUB_OUTPUT | |
| echo "image_type=dev" | tee -a $GITHUB_OUTPUT | |
| fi | |
| - name: Set up Docker Buildx | |
| uses: docker/setup-buildx-action@v3 | |
| - name: Build and push | |
| uses: docker/build-push-action@v6 | |
| with: | |
| file: ./docker/Dockerfile.ci.dev | |
| push: true | |
| context: . | |
| target: main | |
| build-args: | | |
| FROM_IMAGE_NAME=${{ steps.base-image.outputs.version }} | |
| IMAGE_TYPE=${{ steps.base-image.outputs.image_type }} | |
| cache-from: | | |
| type=registry,ref=${{ matrix.registry }}/megatron-lm:${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || 0 }}-buildcache,mode=max | |
| type=registry,ref=${{ matrix.registry }}/megatron-lm:main-buildcache,mode=max | |
| ${{ steps.cache_from.outputs.LAST_PRS }} | |
| cache-to: | | |
| type=registry,ref=${{ matrix.registry }}/megatron-lm:${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || 0 }}-buildcache,mode=max | |
| no-cache: false | |
| tags: | | |
| ${{ matrix.registry }}/megatron-lm:${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || 0 }} | |
| ${{ matrix.registry }}/megatron-lm:${{ github.sha }} | |
| secrets: | | |
| GH_TOKEN=${{ secrets.PAT }} | |
| cicd-parse-unit-tests: | |
| runs-on: ubuntu-latest | |
| outputs: | |
| unit-tests: ${{ steps.parse-unit-tests.outputs.unit-tests }} | |
| needs: | |
| - pre-flight | |
| - cicd-wait-in-queue | |
| - cicd-container-build | |
| if: | | |
| needs.pre-flight.result != 'cancelled' | |
| && needs.cicd-wait-in-queue.result != 'cancelled' | |
| && needs.cicd-container-build.result != 'cancelled' | |
| && ( | |
| success() | |
| || needs.pre-flight.outputs.is_ci_workload == 'true' | |
| || needs.pre-flight.outputs.force_run_all == 'true' | |
| || needs.pre-flight.outputs.is_merge_group == 'true' | |
| ) | |
| && !cancelled() | |
| steps: | |
| - name: Checkout | |
| uses: actions/checkout@v6 | |
| - name: Parse unit tests | |
| id: parse-unit-tests | |
| run: | | |
| cat tests/test_utils/recipes/h100/unit-tests.yaml | yq -o json '[.products[].test_case[] | { "bucket": .}] | sort_by(.model, .test_case)' | jq -c > unit-tests.json | |
| echo "unit-tests=$(cat unit-tests.json)" | tee -a $GITHUB_OUTPUT | |
| cicd-unit-tests-latest: | |
| strategy: | |
| fail-fast: false | |
| matrix: | |
| include: ${{ fromJson(needs.cicd-parse-unit-tests.outputs.unit-tests) }} | |
| needs: | |
| - is-not-external-contributor | |
| - pre-flight | |
| - cicd-wait-in-queue | |
| - cicd-container-build | |
| - cicd-parse-unit-tests | |
| runs-on: ${{ needs.is-not-external-contributor.outputs.selected_runner }} | |
| timeout-minutes: 60 | |
| name: "${{ matrix.bucket }} - latest" | |
| if: | | |
| needs.is-not-external-contributor.result != 'cancelled' | |
| && needs.pre-flight.result != 'cancelled' | |
| && needs.cicd-wait-in-queue.result != 'cancelled' | |
| && needs.cicd-container-build.result != 'cancelled' | |
| && needs.cicd-parse-unit-tests.result != 'cancelled' | |
| && ( | |
| success() | |
| || needs.pre-flight.outputs.is_ci_workload == 'true' | |
| || needs.pre-flight.outputs.force_run_all == 'true' | |
| || needs.pre-flight.outputs.is_merge_group == 'true' | |
| ) | |
| && !cancelled() | |
| env: | |
| PIP_DISABLE_PIP_VERSION_CHECK: 1 | |
| PIP_NO_PYTHON_VERSION_WARNING: 1 | |
| PIP_ROOT_USER_ACTION: ignore | |
| steps: | |
| - name: Checkout | |
| uses: actions/checkout@v6 | |
| - name: main | |
| uses: ./.github/actions | |
| with: | |
| test_case: ${{ matrix.bucket }} | |
| tag: latest | |
| timeout: ${{ matrix.timeout || 30 }} | |
| is_unit_test: "true" | |
| PAT: ${{ secrets.PAT }} | |
| container-image: ${{ env.container-registry }}/megatron-lm:${{ github.sha }} | |
| cicd-parse-integration-tests-h100: | |
| runs-on: ubuntu-latest | |
| needs: | |
| - pre-flight | |
| - configure | |
| - cicd-wait-in-queue | |
| - cicd-container-build | |
| - cicd-unit-tests-latest | |
| if: | | |
| needs.pre-flight.result != 'cancelled' | |
| && needs.configure.result != 'cancelled' | |
| && needs.cicd-wait-in-queue.result != 'cancelled' | |
| && needs.cicd-container-build.result != 'cancelled' | |
| && needs.cicd-unit-tests-latest.result != 'cancelled' | |
| && ( | |
| success() | |
| || needs.pre-flight.outputs.is_ci_workload == 'true' | |
| || needs.pre-flight.outputs.force_run_all == 'true' | |
| || needs.pre-flight.outputs.is_merge_group == 'true' | |
| ) | |
| && !cancelled() | |
| outputs: | |
| integration-tests-h100: ${{ steps.main.outputs.integration-tests-h100 }} | |
| steps: | |
| - name: Checkout | |
| uses: actions/checkout@v6 | |
| - name: Parse functional tests | |
| id: main | |
| env: | |
| SCOPE: ${{ needs.configure.outputs.scope }} | |
| LIGHTWEIGHT: ${{ needs.configure.outputs.lightweight }} | |
| run: | | |
| export PYTHONPATH=$(pwd) | |
| ARGS=(--scope $SCOPE) | |
| [ "$LIGHTWEIGHT" == "true" ] && ARGS+=(--enable-lightweight-mode) | |
| python tests/test_utils/python_scripts/generate_jet_trigger_job.py \ | |
| --n-repeat 5 \ | |
| --time-limit 2700 \ | |
| --test-cases all \ | |
| --container-image mcore_ci_dev \ | |
| --container-tag latest \ | |
| --dependent-job functional:configure \ | |
| --record-checkpoints false \ | |
| --slurm-account gh \ | |
| --no-enable-warmup \ | |
| --environment dev \ | |
| --platform dgx_h100 \ | |
| --cluster ghci \ | |
| ${ARGS[@]} \ | |
| --output-path integration-tests-h100.yaml | |
| cat integration-tests-h100.yaml | \ | |
| yq -o json 'del(.default, .stages, .workflow) | to_entries | map({"model": .value.stage, "test_case": .key}) | sort_by(.model, .test_case)' | jq -c > integration-tests-h100.json | |
| echo "integration-tests-h100=$(cat integration-tests-h100.json)" | tee -a "$GITHUB_OUTPUT" | |
| cicd-integration-tests-latest-h100: | |
| timeout-minutes: 60 | |
| strategy: | |
| fail-fast: false | |
| matrix: | |
| include: ${{ fromJson(needs.cicd-parse-integration-tests-h100.outputs.integration-tests-h100) }} | |
| needs: | |
| - is-not-external-contributor | |
| - pre-flight | |
| - configure | |
| - cicd-wait-in-queue | |
| - cicd-parse-integration-tests-h100 | |
| - cicd-unit-tests-latest | |
| runs-on: ${{ needs.is-not-external-contributor.outputs.selected_runner }} | |
| name: "${{ matrix.model }}/${{ matrix.test_case }} - latest" | |
| env: | |
| PIP_DISABLE_PIP_VERSION_CHECK: 1 | |
| PIP_NO_PYTHON_VERSION_WARNING: 1 | |
| PIP_ROOT_USER_ACTION: ignore | |
| if: | | |
| needs.is-not-external-contributor.result != 'cancelled' | |
| && needs.pre-flight.result != 'cancelled' | |
| && needs.configure.result != 'cancelled' | |
| && needs.cicd-wait-in-queue.result != 'cancelled' | |
| && needs.cicd-parse-integration-tests-h100.result != 'cancelled' | |
| && needs.cicd-unit-tests-latest.result != 'cancelled' | |
| && ( | |
| success() | |
| || needs.pre-flight.outputs.is_ci_workload == 'true' | |
| || needs.pre-flight.outputs.force_run_all == 'true' | |
| || needs.pre-flight.outputs.is_merge_group == 'true' | |
| ) | |
| && !cancelled() | |
| steps: | |
| - name: Checkout | |
| uses: actions/checkout@v6 | |
| - name: main | |
| uses: ./.github/actions | |
| with: | |
| test_case: ${{ matrix.test_case }} | |
| model: ${{ matrix.model }} | |
| tag: latest | |
| timeout: ${{ matrix.timeout || 30 }} | |
| is_unit_test: "false" | |
| PAT: ${{ secrets.PAT }} | |
| container-image: ${{ env.container-registry }}/megatron-lm:${{ github.sha }} | |
| scope: ${{ needs.configure.outputs.scope }} | |
| n_repeat: ${{ needs.configure.outputs.n_repeat }} | |
| lightweight: ${{ needs.configure.outputs.lightweight }} | |
| cicd-parse-integration-tests-gb200: | |
| runs-on: ubuntu-latest | |
| needs: | |
| - is-not-external-contributor | |
| - pre-flight | |
| - configure | |
| - cicd-wait-in-queue | |
| - cicd-container-build | |
| - cicd-unit-tests-latest | |
| if: | | |
| needs.is-not-external-contributor.outputs.is_maintainer == 'true' | |
| && needs.pre-flight.result != 'cancelled' | |
| && needs.configure.result != 'cancelled' | |
| && needs.cicd-wait-in-queue.result != 'cancelled' | |
| && needs.cicd-container-build.result != 'cancelled' | |
| && needs.cicd-unit-tests-latest.result != 'cancelled' | |
| && ( | |
| success() | |
| || needs.pre-flight.outputs.is_ci_workload == 'true' | |
| || needs.pre-flight.outputs.force_run_all == 'true' | |
| || needs.pre-flight.outputs.is_merge_group == 'true' | |
| ) | |
| && !cancelled() | |
| outputs: | |
| integration-tests-gb200: ${{ steps.main.outputs.integration-tests-gb200 }} | |
| steps: | |
| - name: Checkout | |
| uses: actions/checkout@v6 | |
| - name: Parse functional tests | |
| id: main | |
| env: | |
| SCOPE: ${{ needs.configure.outputs.scope }} | |
| LIGHTWEIGHT: ${{ needs.configure.outputs.lightweight }} | |
| run: | | |
| export PYTHONPATH=$(pwd) | |
| ARGS=(--scope $SCOPE) | |
| [ "$LIGHTWEIGHT" == "true" ] && ARGS+=(--enable-lightweight-mode) | |
| python tests/test_utils/python_scripts/generate_jet_trigger_job.py \ | |
| --n-repeat 5 \ | |
| --time-limit 2700 \ | |
| --test-cases all \ | |
| --container-image mcore_ci_dev \ | |
| --container-tag latest \ | |
| --dependent-job functional:configure \ | |
| --record-checkpoints false \ | |
| --slurm-account gh \ | |
| --no-enable-warmup \ | |
| --environment dev \ | |
| --platform dgx_gb200 \ | |
| --cluster dgxgb200_oci-hsg \ | |
| ${ARGS[@]} \ | |
| --output-path integration-tests-gb200.yaml | |
| cat integration-tests-gb200.yaml | \ | |
| yq -o json 'del(.default, .stages, .workflow) | to_entries | map({"model": .value.stage, "test_case": .key}) | sort_by(.model, .test_case)' | jq -c > integration-tests-gb200.json | |
| echo "integration-tests-gb200=$(cat integration-tests-gb200.json)" | tee -a "$GITHUB_OUTPUT" | |
  # One matrix instance per GB200 integration test case emitted by
  # cicd-parse-integration-tests-gb200 (model/test_case pairs).
  cicd-integration-tests-latest-gb200:
    timeout-minutes: 60
    strategy:
      # Let the remaining matrix instances finish even if one case fails.
      fail-fast: false
      matrix:
        include: ${{ fromJson(needs.cicd-parse-integration-tests-gb200.outputs.integration-tests-gb200) }}
    needs:
      - is-not-external-contributor
      - pre-flight
      - configure
      - cicd-wait-in-queue
      - cicd-parse-integration-tests-gb200
      - cicd-unit-tests-latest
    # NOTE(review): only `selected_runner` (not `selected_runner_gb200`) is
    # declared in the is-not-external-contributor outputs visible at the top
    # of this file — confirm that output exists, otherwise `runs-on` resolves
    # to an empty string and the job cannot be scheduled.
    runs-on: ${{ needs.is-not-external-contributor.outputs.selected_runner_gb200 }}
    name: "${{ matrix.model }}/${{ matrix.test_case }} - latest"
    env:
      PIP_DISABLE_PIP_VERSION_CHECK: 1
      PIP_NO_PYTHON_VERSION_WARNING: 1
      PIP_ROOT_USER_ACTION: ignore
    # Maintainer-only job (GB200 runners are not available to external PRs).
    # The `!= 'cancelled'` guards allow the job to run even when an upstream
    # job was skipped, provided one of the force-run conditions in the
    # parenthesised group holds.
    if: |
      needs.is-not-external-contributor.outputs.is_maintainer == 'true'
      && needs.is-not-external-contributor.result != 'cancelled'
      && needs.pre-flight.result != 'cancelled'
      && needs.configure.result != 'cancelled'
      && needs.cicd-wait-in-queue.result != 'cancelled'
      && needs.cicd-parse-integration-tests-gb200.result != 'cancelled'
      && needs.cicd-unit-tests-latest.result != 'cancelled'
      && (
        success()
        || needs.pre-flight.outputs.is_ci_workload == 'true'
        || needs.pre-flight.outputs.force_run_all == 'true'
        || needs.pre-flight.outputs.is_merge_group == 'true'
      )
      && !cancelled()
    steps:
      - name: Checkout
        uses: actions/checkout@v6
      - name: main
        uses: ./.github/actions
        with:
          test_case: ${{ matrix.test_case }}
          model: ${{ matrix.model }}
          tag: latest
          # Per-case timeout in minutes, defaulting to 30 when the matrix
          # entry does not specify one.
          timeout: ${{ matrix.timeout || 30 }}
          is_unit_test: "false"
          PAT: ${{ secrets.PAT }}
          container-image: ${{ env.container-registry-gb200 }}/megatron-lm:${{ github.sha }}
          scope: ${{ needs.configure.outputs.scope }}
          n_repeat: ${{ needs.configure.outputs.n_repeat }}
          lightweight: ${{ needs.configure.outputs.lightweight }}
          platform: dgx_gb200
| Nemo_CICD_Test: | |
| needs: | |
| - pre-flight | |
| - is-not-external-contributor | |
| - cicd-unit-tests-latest | |
| - cicd-integration-tests-latest-h100 | |
| - cicd-integration-tests-latest-gb200 | |
| if: | | |
| ( | |
| needs.pre-flight.outputs.docs_only == 'true' | |
| || needs.pre-flight.outputs.is_deployment_workflow == 'true' | |
| || needs.pre-flight.outputs.is_ci_workload == 'true' | |
| || needs.pre-flight.outputs.is_merge_group == 'true' | |
| || always() | |
| ) | |
| && !cancelled() | |
| && github.repository == 'NVIDIA/Megatron-LM' | |
| runs-on: ubuntu-latest | |
| permissions: write-all | |
| steps: | |
| - name: Checkout | |
| uses: actions/checkout@v6 | |
| - name: Get workflow result | |
| id: result | |
| shell: bash -x -e -u -o pipefail {0} | |
| env: | |
| GH_TOKEN: ${{ github.token }} | |
| GITHUB_RUN_ID: ${{ github.run_id }} | |
| DOCS_ONLY: ${{ needs.pre-flight.outputs.docs_only }} | |
| IS_DEPLOYMENT: ${{ needs.pre-flight.outputs.is_deployment_workflow }} | |
| IS_MAINTAINER: ${{ needs.is-not-external-contributor.outputs.is_maintainer }} | |
| UNIT_RESULT: ${{ needs.cicd-unit-tests-latest.result }} | |
| H100_RESULT: ${{ needs.cicd-integration-tests-latest-h100.result }} | |
| GB200_RESULT: ${{ needs.cicd-integration-tests-latest-gb200.result }} | |
| run: | | |
| # Docs-only and deployment workflows intentionally skip all tests | |
| if [ "$DOCS_ONLY" == "true" ] || [ "$IS_DEPLOYMENT" == "true" ]; then | |
| echo "✅ Docs-only or deployment workflow — test checks skipped" | |
| exit 0 | |
| fi | |
| FAILED=false | |
| # Unit tests must always succeed (never skipped or cancelled) | |
| if [ "$UNIT_RESULT" != "success" ]; then | |
| echo "❌ cicd-unit-tests-latest: $UNIT_RESULT" | |
| FAILED=true | |
| fi | |
| # H100 integration tests must always succeed | |
| if [ "$H100_RESULT" != "success" ]; then | |
| echo "❌ cicd-integration-tests-latest-h100: $H100_RESULT" | |
| FAILED=true | |
| fi | |
| # GB200 integration tests may be skipped only for non-maintainer PRs | |
| # (no GB200 runners available); maintainer runs must always succeed | |
| if [ "$GB200_RESULT" == "skipped" ] && [ "$IS_MAINTAINER" == "true" ]; then | |
| echo "❌ cicd-integration-tests-latest-gb200: skipped unexpectedly for a maintainer run" | |
| FAILED=true | |
| elif [ "$GB200_RESULT" != "success" ] && [ "$GB200_RESULT" != "skipped" ]; then | |
| echo "❌ cicd-integration-tests-latest-gb200: $GB200_RESULT" | |
| FAILED=true | |
| fi | |
| # Broad scan: catch any individual job failures or cancellations | |
| # (e.g. a single matrix instance cancelled mid-run) | |
| BAD_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq ' | |
| [.jobs[] | select( | |
| .status == "completed" | |
| and (.conclusion == "failure" or .conclusion == "cancelled") | |
| and .name != "merge-queue-notification" | |
| and .name != "cicd-mbridge-testing" | |
| )] | length | |
| ') || BAD_JOBS=0 | |
| if [ "${BAD_JOBS:-0}" -gt 0 ]; then | |
| echo "❌ Found ${BAD_JOBS} failed or cancelled job(s):" | |
| gh run view $GITHUB_RUN_ID --json jobs --jq ' | |
| .jobs[] | select( | |
| .status == "completed" | |
| and (.conclusion == "failure" or .conclusion == "cancelled") | |
| and .name != "merge-queue-notification" | |
| and .name != "cicd-mbridge-testing" | |
| ) | .name + " → " + .conclusion | |
| ' | |
| FAILED=true | |
| fi | |
| if [ "$FAILED" != "true" ]; then | |
| echo "✅ All previous jobs completed successfully" | |
| else | |
| exit 1 | |
| fi | |
| Coverage_Fake: | |
| runs-on: ubuntu-latest | |
| needs: [Nemo_CICD_Test, pre-flight] | |
| if: | | |
| ( | |
| needs.pre-flight.outputs.docs_only == 'true' | |
| || needs.pre-flight.outputs.is_deployment_workflow == 'true' | |
| || github.event == 'merge_group' | |
| ) | |
| && needs.pre-flight.outputs.is_ci_workload == 'false' | |
| && !cancelled() | |
| && github.repository == 'NVIDIA/Megatron-LM' | |
| steps: | |
| - name: Generate fake coverage report | |
| uses: actions/github-script@v8 | |
| with: | |
| github-token: ${{ secrets.PAT }} | |
| script: | | |
| await github.rest.repos.createCommitStatus({ | |
| owner: context.repo.owner, | |
| repo: context.repo.repo, | |
| sha: context.sha, | |
| state: 'success', | |
| description: 'No code changes - coverage check skipped', | |
| context: 'codecov/patch' | |
| }); | |
| Coverage: | |
| runs-on: ubuntu-latest | |
| needs: [Nemo_CICD_Test] | |
| if: | | |
| ( | |
| (needs.pre-flight.outputs.is_ci_workload == 'true' && !failure()) | |
| || success() | |
| ) | |
| && !cancelled() | |
| && github.repository == 'NVIDIA/Megatron-LM' | |
| strategy: | |
| matrix: | |
| flag: [unit-test] | |
| steps: | |
| - name: Checkout | |
| uses: actions/checkout@v6 | |
| - name: Download coverage reports of current branch | |
| uses: actions/download-artifact@v7 | |
| with: | |
| pattern: coverage-${{ matrix.flag }}-* | |
| - name: List coverage files | |
| run: find . -type f -name "*.xml" -o -name "*.lcov" | |
| - name: Get total coverage of current branch | |
| shell: bash -x -e -u -o pipefail {0} | |
| if: always() | |
| run: | | |
| pip install coverage | |
| ls -al . | |
| ls -al coverage-*/ | |
| coverage combine --keep $(ls coverage-*/.coverage) | |
| coverage report -i | |
| rm -rf coverage-* | |
| ls -al | |
| - name: Upload coverage reports to Codecov | |
| uses: codecov/codecov-action@v5 | |
| with: | |
| token: ${{ secrets.CODECOV_TOKEN }} | |
| verbose: true | |
| flags: ${{ matrix.flag }} | |
| - name: Upload artifacts | |
| uses: actions/upload-artifact@v6 | |
| with: | |
| name: coverage-${{ matrix.flag }}-aggregated | |
| path: | | |
| .coverage | |
| include-hidden-files: true | |
| merge-queue-notification: | |
| runs-on: ubuntu-latest | |
| if: github.event_name == 'merge_group' | |
| permissions: | |
| pull-requests: write | |
| steps: | |
| - name: Extract PR number from merge group | |
| id: get-pr-number | |
| run: | | |
| # Extract PR number from merge group head_ref (format: refs/heads/gh-readonly-queue/main/pr-<number>-<sha>) | |
| PR_NUMBER=$(echo "${{ github.event.merge_group.head_ref }}" | sed -n 's/.*\/pr-\([0-9]*\)-.*/\1/p') | |
| echo "pr_number=$PR_NUMBER" >> $GITHUB_OUTPUT | |
| - name: Comment on PR with action run URL | |
| uses: actions/github-script@v8 | |
| with: | |
| github-token: ${{ secrets.PAT }} | |
| script: | | |
| const prNumber = ${{ steps.get-pr-number.outputs.pr_number }}; | |
| const runUrl = `https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}`; | |
| await github.rest.issues.createComment({ | |
| owner: context.repo.owner, | |
| repo: context.repo.repo, | |
| issue_number: prNumber, | |
| body: `🔄 Merge queue validation started!\n\nYou can track the progress here: ${runUrl}` | |
| }); | |
| cleanup-taint-node: | |
| runs-on: ${{ needs.is-not-external-contributor.outputs.selected_runner }} | |
| needs: | |
| - is-not-external-contributor | |
| - cicd-container-build | |
| - cicd-unit-tests-latest | |
| - cicd-integration-tests-latest-h100 | |
| - cicd-integration-tests-latest-gb200 | |
| - Coverage | |
| - Coverage_Fake | |
| if: | | |
| always() | |
| && !cancelled() | |
| && contains(needs.is-not-external-contributor.outputs.selected_runner, 'ephemeral') | |
| && !needs.pre-flight.outputs.is_deployment_workflow == 'true' | |
| steps: | |
| - name: Taint node for cleanup | |
| shell: bash | |
| run: taint-node.sh |