refactor: refactor loss function #10273
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| name: 'CICD NeMo RL' | |
| # Triggers: PR activity against main / r** branches, merge-queue checks, a daily cron, and manual dispatch. | |
| on: | |
| pull_request: | |
| types: | |
| - labeled | |
| - opened | |
| - synchronize | |
| - reopened | |
| branches: | |
| - 'main' | |
| - 'r**' | |
| merge_group: | |
| types: | |
| - checks_requested | |
| schedule: | |
| - cron: '0 9 * * *' | |
| workflow_dispatch: | |
| inputs: | |
| test_to_run: | |
| description: 'Test level to run. docs = doc tests only, Lfast = fast subset (reuses main container), L0 = unit/docs/lint, L1 = L0 + functional, L2 = L1 + convergence' | |
| type: choice | |
| options: | |
| - docs | |
| - Lfast | |
| - L0 | |
| - L1 | |
| - L2 | |
| default: L2 | |
| required: false | |
| image_tag: | |
| description: "Override container image tag (e.g. 'main'). Skips container build." | |
| default: '' | |
| required: false | |
| # TODO: Due to limited compute, disabling pushes to main. This is okay to do since we force PRs to be up to date and the CI tests on pull/$PR_NUM/merge | |
| #push: | |
| # branches: | |
| # - 'main' | |
| # One in-flight run per workflow + PR number (or ref) + triggering label name; a newer run cancels the older one. | |
| concurrency: | |
| cancel-in-progress: true | |
| group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event.label.name || 'main' }} | |
| jobs: | |
| pre-flight: | |
| # Decides which test level runs (docs / Lfast / L0 / L1 / L2 / none) and which container image tag, if any, downstream jobs reuse. | |
| runs-on: ubuntu-latest | |
| outputs: | |
| test_level: ${{ steps.evaluate.outputs.test_level }} | |
| image_tag: ${{ steps.evaluate.outputs.image_tag }} | |
| steps: | |
| - name: Get changed files | |
| id: changed-files | |
| if: github.event_name == 'pull_request' | |
| uses: step-security/changed-files@v45.0.1 | |
| with: | |
| files_yaml: | | |
| doc: | |
| - '**.md' | |
| - docs/** | |
| src: | |
| - '!**.md' | |
| - '!docs/**' | |
| - name: Evaluate conditions | |
| id: evaluate | |
| env: | |
| DOCS_ONLY: ${{ steps.changed-files.outputs.doc_any_changed == 'true' && steps.changed-files.outputs.src_any_changed == 'false' }} | |
| CHANGED_DOCS: ${{ steps.changed-files.outputs.doc_all_changed_files }} | |
| CHANGED_SRC: ${{ steps.changed-files.outputs.src_all_changed_files }} | |
| IS_PULLREQUEST: ${{ github.event_name == 'pull_request' }} | |
| LABEL: ${{ github.event.label.name }} | |
| MERGE_GROUP: ${{ github.event_name == 'merge_group' }} | |
| # Context values are handed to the script through the environment rather than being | |
| # expanded into the script text, so a free-form dispatch input cannot inject shell code. | |
| EVENT_NAME: ${{ github.event_name }} | |
| DISPATCH_TEST_LEVEL: ${{ inputs.test_to_run }} | |
| DISPATCH_IMAGE_TAG: ${{ inputs.image_tag }} | |
| run: | | |
| # Some output that's helpful for debugging | |
| echo "Docs changed: $CHANGED_DOCS" | |
| echo "Src changed: $CHANGED_SRC" | |
| echo "LABEL: $LABEL" | |
| echo "IS_PULLREQUEST: $IS_PULLREQUEST" | |
| echo "DOCS_ONLY: $DOCS_ONLY" | |
| # Determine the test level. The first matching rule wins: | |
| # docs-only change or CI:docs label -> docs | |
| # CI:Lfast / CI:L0 label -> Lfast / L0 | |
| # CI:L1 label, any non-PR event, merge group -> L1 | |
| # CI:L2 label -> L2 | |
| # anything else (unlabeled PR) -> none | |
| if [[ "$DOCS_ONLY" == "true" || "$LABEL" == "CI:docs" ]]; then | |
| # For doc-only changes, run only doc tests | |
| TEST_LEVEL="docs" | |
| elif [[ "$LABEL" == "CI:Lfast" ]]; then | |
| TEST_LEVEL="Lfast" | |
| elif [[ "$LABEL" == "CI:L0" ]]; then | |
| TEST_LEVEL="L0" | |
| elif [[ "$LABEL" == "CI:L1" || "$IS_PULLREQUEST" == "false" || "$MERGE_GROUP" == "true" ]]; then | |
| # For labeled PRs, non-PR events (IS_PULLREQUEST=false), or merge group events, run L1 by default | |
| TEST_LEVEL="L1" | |
| elif [[ "$LABEL" == "CI:L2" ]]; then | |
| TEST_LEVEL="L2" | |
| else | |
| # Skip tests by default for non-labeled PRs | |
| TEST_LEVEL="none" | |
| fi | |
| if [[ "$EVENT_NAME" == "schedule" ]]; then | |
| echo "Setting test level to L1 for nightly scheduled run" | |
| TEST_LEVEL="L1" | |
| fi | |
| # Override test level if specified in workflow_dispatch | |
| if [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then | |
| echo "Overriding test level from $TEST_LEVEL to $DISPATCH_TEST_LEVEL" | |
| TEST_LEVEL="$DISPATCH_TEST_LEVEL" | |
| fi | |
| echo "test_level=$TEST_LEVEL" | tee -a "$GITHUB_OUTPUT" | |
| # Determine image tag: Lfast uses main, workflow_dispatch can override | |
| IMAGE_TAG="" | |
| if [[ "$TEST_LEVEL" == "Lfast" ]]; then | |
| IMAGE_TAG="main" | |
| fi | |
| if [[ "$EVENT_NAME" == "workflow_dispatch" && -n "$DISPATCH_IMAGE_TAG" ]]; then | |
| IMAGE_TAG="$DISPATCH_IMAGE_TAG" | |
| fi | |
| echo "image_tag=$IMAGE_TAG" | tee -a "$GITHUB_OUTPUT" | |
| pr-branch-up-to-date-check: | |
| # Fails a pull request whose head is more than MAX_COMMITS_BEHIND commits behind its target branch. | |
| name: Check if PR branch is up to date | |
| runs-on: ubuntu-latest | |
| needs: | |
| - pre-flight | |
| if: github.event_name == 'pull_request' | |
| env: | |
| MAX_COMMITS_BEHIND: 10 | |
| steps: | |
| - name: Check how many commits behind target branch | |
| env: | |
| REPO: ${{ github.repository }} | |
| GH_TOKEN: ${{ github.token }} | |
| BASE_REF: ${{ github.base_ref }} | |
| BASE_SHA: ${{ github.event.pull_request.base.sha }} | |
| HEAD_LABEL: ${{ github.event.pull_request.head.label }} | |
| HEAD_SHA: ${{ github.event.pull_request.head.sha }} | |
| run: | | |
| echo "Repository: $REPO" | |
| echo "Base branch: $BASE_REF (SHA: $BASE_SHA)" | |
| echo "PR head: $HEAD_LABEL (SHA: $HEAD_SHA)" | |
| echo "Maximum commits behind allowed: $MAX_COMMITS_BEHIND" | |
| # The comparison is requested as <PR head>...<target branch>, so the API reports | |
| # ahead_by/behind_by from the target branch's side; the assignments below are | |
| # deliberately crossed to express them from the PR's side. | |
| COMPARE_JSON=$(gh api "repos/$REPO/compare/$HEAD_SHA...$BASE_REF" --jq '{behind_by: .behind_by, ahead_by: .ahead_by, status: .status}') | |
| STATUS=$(jq -r '.status' <<< "$COMPARE_JSON") | |
| COMMITS_BEHIND=$(jq -r '.ahead_by' <<< "$COMPARE_JSON") | |
| COMMITS_AHEAD=$(jq -r '.behind_by' <<< "$COMPARE_JSON") | |
| echo "Comparison status: $STATUS" | |
| echo "PR is $COMMITS_BEHIND commits behind and $COMMITS_AHEAD commits ahead of $BASE_REF" | |
| # Reject the PR once it has drifted past the allowed distance | |
| if [ "$COMMITS_BEHIND" -gt "$MAX_COMMITS_BEHIND" ]; then | |
| echo "❌ ERROR: This PR is $COMMITS_BEHIND commits behind $BASE_REF, which exceeds the maximum allowed ($MAX_COMMITS_BEHIND commits)." | |
| echo "Please rebase or merge the latest changes from $BASE_REF into your PR branch." | |
| exit 1 | |
| else | |
| echo "✅ PR is acceptably fresh ($COMMITS_BEHIND commits behind, limit is $MAX_COMMITS_BEHIND)" | |
| fi | |
| lint-check: | |
| # Runs the pre-commit hooks over the whole repository, then checks that every Python file | |
| # pyrefly reports zero errors for is listed in pyrefly.toml. | |
| name: Lint check | |
| needs: [pre-flight] | |
| runs-on: ubuntu-latest | |
| steps: | |
| - name: Free up disk space | |
| run: | | |
| # Remove unnecessary packages and files on Ubuntu | |
| sudo apt-get clean | |
| sudo rm -rf /usr/local/lib/android || true | |
| sudo rm -rf /opt/ghc || true | |
| sudo rm -rf /usr/local/.ghcup || true | |
| sudo rm -rf /usr/share/dotnet || true | |
| sudo rm -rf /opt/az || true | |
| # Clear pip and npm caches | |
| pip cache purge || true | |
| sudo npm cache clean --force || true | |
| - name: Checkout repository | |
| uses: actions/checkout@v4 | |
| with: | |
| submodules: 'recursive' | |
| - name: Install uv | |
| uses: astral-sh/setup-uv@v5 | |
| with: | |
| version: "0.9.1" | |
| enable-cache: true | |
| prune-cache: false | |
| # Faster than uv python install since it caches python alongside runner | |
| - name: "Set up Python" | |
| uses: actions/setup-python@v5 | |
| with: | |
| python-version-file: ".python-version" | |
| - name: Check lint | |
| run: | | |
| uv venv | |
| uv run --group dev pre-commit install | |
| uv run --group dev pre-commit run --all-files --show-diff-on-failure --color=always | |
| # TODO: this is a temporary check and should be removed once we have 100% correctness | |
| - name: Check if any files with zero errors not in whitelist | |
| run: | | |
| missing_count=0 | |
| # The jq program emits every tracked Python file for which pyrefly reported no errors. | |
| for file in $(uv run --group dev pyrefly check $(git ls-files 'nemo_rl/**/*.py' 'examples/**/*.py' 'docs/*.py' 'tools/**/*.py') --output-format json | jq -r --slurpfile all_files <(git ls-files 'nemo_rl/**/*.py' 'examples/**/*.py' 'docs/*.py' 'tools/**/*.py' | jq -R -s 'split("\n")[:-1]') --arg pwd "$(pwd)/" '(.errors | group_by(.path) | map({(.[0].path | sub($pwd; "")): length}) | add // {}) as $error_counts | $all_files[0][] | . as $file | if ($error_counts[$file] // 0) == 0 then $file else empty end'); do | |
| if ! grep -qF -- "$file" pyrefly.toml; then | |
| echo "File $file has zero errors but is not in pyrefly.toml in the 'project-includes' list. Please add it to this whitelist." | |
| # Plain arithmetic assignment: ((missing_count++)) returns status 1 when the counter is 0, | |
| # which aborts the step under the default `bash -e` shell after the first reported file. | |
| missing_count=$((missing_count + 1)) | |
| fi | |
| done | |
| # Exit with a fixed status; `exit $missing_count` wraps modulo 256 and could report success. | |
| if [ "$missing_count" -gt 0 ]; then | |
| echo "$missing_count file(s) with zero errors are missing from pyrefly.toml." | |
| exit 1 | |
| fi | |
| - name: Minimize uv cache | |
| run: uv cache prune --ci | |
| sphinx-build: | |
| # Delegates the docs build to the shared FW-CI-templates workflow whenever pre-flight selected a test level. | |
| uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_docs.yml@v0.57.0 | |
| needs: | |
| - pre-flight | |
| if: needs.pre-flight.outputs.test_level != 'none' | |
| build-container: | |
| # Builds the test container through the shared FW-CI-templates workflow. | |
| # Skipped when no test level was selected or when pre-flight supplied an image tag to reuse. | |
| needs: | |
| - pre-flight | |
| if: needs.pre-flight.outputs.test_level != 'none' && needs.pre-flight.outputs.image_tag == '' | |
| uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_container.yml@v0.52.0 | |
| with: | |
| image-name: nemo_rl_container | |
| image-label: nemo-rl | |
| dockerfile: docker/Dockerfile | |
| target: release | |
| build-ref: ${{ github.sha }} | |
| build-args: | | |
| MAX_JOBS=4 | |
| NEMO_RL_COMMIT=${{ github.sha }} | |
| build-contexts: | | |
| nemo-rl=${{ github.run_id }}/ | |
| cicd-doc-tests: | |
| # Runs the documentation tests for the docs, L0, L1 and L2 test levels. | |
| name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }} | |
| needs: | |
| - pre-flight | |
| - build-container | |
| if: contains('docs L0 L1 L2', needs.pre-flight.outputs.test_level) | |
| environment: nemo-ci | |
| runs-on: ${{ matrix.runner }} | |
| strategy: | |
| fail-fast: false | |
| matrix: | |
| include: | |
| - runner: self-hosted-azure | |
| script: Docs_Tests | |
| steps: | |
| - name: Checkout | |
| uses: actions/checkout@v4 | |
| - name: main | |
| uses: ./.github/actions/test-template | |
| with: | |
| script: ${{ matrix.script }} | |
| runner: ${{ runner.name }} | |
| is_doc_test: 'true' | |
| # True when the PR head lives in a different repository than the base, i.e. a fork. | |
| is_fork_pr: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name }} | |
| cicd-unit-tests: | |
| # Runs the L0 unit-test suites for the L0, L1, L2 and Lfast test levels. | |
| # always() keeps the job eligible when build-container or cicd-doc-tests were skipped; | |
| # the remaining conditions still require that neither of them failed. | |
| name: ${{ matrix.script }} | |
| needs: | |
| - pre-flight | |
| - build-container | |
| - cicd-doc-tests | |
| if: >- | |
| always() && | |
| needs.pre-flight.result == 'success' && | |
| contains('L0 L1 L2 Lfast', needs.pre-flight.outputs.test_level) && | |
| (needs.build-container.result == 'success' || needs.build-container.result == 'skipped') && | |
| (needs.cicd-doc-tests.result == 'success' || needs.cicd-doc-tests.result == 'skipped') | |
| runs-on: ${{ matrix.runner }} | |
| strategy: | |
| fail-fast: false | |
| matrix: | |
| include: | |
| - runner: self-hosted-azure | |
| script: L0_Unit_Tests_Generation | |
| - runner: self-hosted-azure | |
| script: L0_Unit_Tests_Policy | |
| - runner: self-hosted-azure | |
| script: L0_Unit_Tests_Other | |
| steps: | |
| - name: Checkout | |
| uses: actions/checkout@v4 | |
| - name: main | |
| uses: ./.github/actions/test-template | |
| with: | |
| script: ${{ matrix.script }} | |
| runner: ${{ runner.name }} | |
| image-tag: ${{ needs.pre-flight.outputs.image_tag }} | |
| is_unit_test: 'true' | |
| cpu-only: ${{ matrix.cpu-only || false }} | |
| # True when the PR head lives in a different repository than the base, i.e. a fork. | |
| is_fork_pr: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name }} | |
| env: | |
| HF_TOKEN: ${{ secrets.HF_TOKEN }} | |
| cicd-functional-tests: | |
| # Runs the GPU functional tests for the L1 and L2 test levels, after the unit tests. | |
| name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }} | |
| needs: | |
| - pre-flight | |
| - build-container | |
| - cicd-unit-tests | |
| if: contains('L1 L2', needs.pre-flight.outputs.test_level) | |
| environment: nemo-ci | |
| runs-on: ${{ matrix.runner }} | |
| strategy: | |
| fail-fast: false | |
| matrix: | |
| include: | |
| - runner: self-hosted-azure | |
| script: L1_Functional_Tests_GPU | |
| steps: | |
| - name: Checkout | |
| uses: actions/checkout@v4 | |
| - name: main | |
| uses: ./.github/actions/test-template | |
| with: | |
| script: ${{ matrix.script }} | |
| runner: ${{ runner.name }} | |
| # True when the PR head lives in a different repository than the base, i.e. a fork. | |
| is_fork_pr: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name }} | |
| env: | |
| HF_TOKEN: ${{ secrets.HF_TOKEN }} | |
| cicd-fast-functional-tests: | |
| # Lfast only: runs the GPU functional tests against the image tag chosen by pre-flight, | |
| # waiting on nothing but pre-flight. | |
| name: fast_${{ matrix.script }} | |
| needs: | |
| - pre-flight | |
| if: needs.pre-flight.outputs.test_level == 'Lfast' | |
| environment: nemo-ci | |
| runs-on: ${{ matrix.runner }} | |
| strategy: | |
| fail-fast: false | |
| matrix: | |
| include: | |
| - runner: self-hosted-azure | |
| script: L1_Functional_Tests_GPU | |
| steps: | |
| - name: Checkout | |
| uses: actions/checkout@v4 | |
| - name: main | |
| uses: ./.github/actions/test-template | |
| with: | |
| script: ${{ matrix.script }} | |
| runner: ${{ runner.name }} | |
| image-tag: ${{ needs.pre-flight.outputs.image_tag }} | |
| # True when the PR head lives in a different repository than the base, i.e. a fork. | |
| is_fork_pr: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name }} | |
| env: | |
| HF_TOKEN: ${{ secrets.HF_TOKEN }} | |
| CI_QA_Gate: | |
| # Single aggregate status for the whole pipeline: writes a per-job result summary and | |
| # succeeds only when ALL_SUCCESS or CI_SKIP evaluates to true. | |
| name: "CI quality check${{ needs.pre-flight.outputs.test_level == 'none' && ' (No tests run: Label CI:L*)' || '' }}" | |
| if: always() | |
| runs-on: ubuntu-latest | |
| needs: | |
| - pre-flight | |
| - pr-branch-up-to-date-check | |
| - lint-check | |
| - sphinx-build | |
| - build-container | |
| - cicd-doc-tests | |
| - cicd-unit-tests | |
| - cicd-functional-tests | |
| - cicd-fast-functional-tests | |
| steps: | |
| - name: main | |
| env: | |
| JOB_RESULTS: ${{ toJSON(needs) }} | |
| # The gate passes only when a test level was selected (test_level != 'none'), lint and the | |
| # docs build succeeded, and every other job either succeeded or was skipped. A run with | |
| # test_level 'none' (e.g. an unlabeled PR) therefore fails the gate unless CI_SKIP is true. | |
| ALL_SUCCESS: >- | |
| ${{ | |
| needs.lint-check.result == 'success' && | |
| (needs.pr-branch-up-to-date-check.result == 'success' || needs.pr-branch-up-to-date-check.result == 'skipped') && | |
| needs.pre-flight.outputs.test_level != 'none' && | |
| needs.sphinx-build.result == 'success' && | |
| (needs.build-container.result == 'success' || needs.build-container.result == 'skipped') && | |
| (needs.cicd-doc-tests.result == 'success' || needs.cicd-doc-tests.result == 'skipped') && | |
| (needs.cicd-unit-tests.result == 'success' || needs.cicd-unit-tests.result == 'skipped') && | |
| (needs.cicd-functional-tests.result == 'success' || needs.cicd-functional-tests.result == 'skipped') && | |
| (needs.cicd-fast-functional-tests.result == 'success' || needs.cicd-fast-functional-tests.result == 'skipped') | |
| }} | |
| # True only for the event in which the 'Skip CICD' label itself was applied. | |
| CI_SKIP: ${{ github.event.label.name == 'Skip CICD' }} | |
| TEST_LEVEL: ${{ needs.pre-flight.outputs.test_level }} | |
| run: | | |
| # One "<job>: <result>" line per upstream job; jq -r emits the strings without quotes. | |
| SUMMARY=$(echo "$JOB_RESULTS" | jq -r 'to_entries[] | .key + ": " + .value.result') | |
| echo "🤖: CICD Result for test level: $TEST_LEVEL" >> "$GITHUB_STEP_SUMMARY" | |
| echo "$SUMMARY" >> "$GITHUB_STEP_SUMMARY" | |
| test "$ALL_SUCCESS" = "true" || test "$CI_SKIP" = "true" | |
| notify-nightly-failure: | |
| # Posts a Slack message when the scheduled run ends with a failed CI_QA_Gate. | |
| name: Notify nightly test failure | |
| needs: | |
| - CI_QA_Gate | |
| if: always() && github.event_name == 'schedule' && needs.CI_QA_Gate.result == 'failure' | |
| environment: main | |
| runs-on: ubuntu-latest | |
| steps: | |
| - name: Send Slack notification | |
| env: | |
| SLACK_WEBHOOK: ${{ secrets.SLACK_TEAM_CHANNEL_WEBHOOK }} | |
| run: | | |
| # Payload: one mrkdwn section naming the repository and commit and linking to this run. | |
| MESSAGE='{ | |
| "blocks": [ | |
| { | |
| "type": "section", | |
| "text": { | |
| "type": "mrkdwn", | |
| "text": "🚨 Nightly GitHub CI test failed on main branch\n\n• Repository: ${{ github.repository }}\n• Commit: `${{ github.sha }}`\n• Workflow: <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View Run>" | |
| } | |
| } | |
| ] | |
| }' | |
| curl -X POST -H "Content-type: application/json" --data "$MESSAGE" "$SLACK_WEBHOOK" | |
| Coverage: | |
| # For each coverage flag: downloads the matching coverage artifacts, combines them, | |
| # uploads the result to Codecov and republishes the combined .coverage file. | |
| # Every processing step is skipped when no artifact for the flag was downloaded. | |
| if: always() | |
| needs: | |
| - CI_QA_Gate | |
| - cicd-doc-tests | |
| - cicd-unit-tests | |
| - cicd-functional-tests | |
| runs-on: ubuntu-latest | |
| strategy: | |
| matrix: | |
| flag: | |
| - doc-test | |
| - unit-test | |
| - e2e | |
| steps: | |
| - name: Checkout | |
| uses: actions/checkout@v4 | |
| - name: Download coverage reports of current branch | |
| uses: actions/download-artifact@v4 | |
| with: | |
| pattern: coverage-${{ matrix.flag }}-* | |
| - name: Check if artifacts were downloaded | |
| id: check-artifacts | |
| run: | | |
| # Assume nothing was downloaded, then flip the flag if any coverage-* entry exists | |
| FOUND=false | |
| MESSAGE="No coverage artifacts found for ${{ matrix.flag }}" | |
| if ls coverage-* > /dev/null 2>&1; then | |
| FOUND=true | |
| MESSAGE="Found coverage artifacts for ${{ matrix.flag }}" | |
| fi | |
| echo "artifacts-found=$FOUND" >> "$GITHUB_OUTPUT" | |
| echo "$MESSAGE" | |
| - name: Get total coverage of current branch | |
| if: steps.check-artifacts.outputs.artifacts-found == 'true' | |
| shell: bash -x -e -u -o pipefail {0} | |
| run: | | |
| pip install coverage | |
| ls -al . | |
| ls -al coverage-*/ | |
| # Merge the per-job data files into a single .coverage, keeping the inputs until the report is printed | |
| coverage combine --keep $(ls coverage-*/.coverage) | |
| coverage report -i --show-missing | |
| rm -rf coverage-* | |
| ls -al | |
| - name: Skip coverage processing | |
| if: steps.check-artifacts.outputs.artifacts-found == 'false' | |
| run: | | |
| echo "No coverage artifacts found for ${{ matrix.flag }}, skipping coverage processing" | |
| - name: Upload coverage reports to Codecov | |
| if: steps.check-artifacts.outputs.artifacts-found == 'true' | |
| uses: codecov/codecov-action@v5 | |
| with: | |
| flags: ${{ matrix.flag }} | |
| token: ${{ secrets.CODECOV_TOKEN }} | |
| verbose: true | |
| - name: Upload artifacts | |
| if: steps.check-artifacts.outputs.artifacts-found == 'true' | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: coverage-${{ matrix.flag }}-aggregated | |
| path: .coverage | |
| include-hidden-files: true | |
| DCO_merge_group: | |
| # Placeholder job that reports a status named "DCO" on merge-queue runs. | |
| name: DCO | |
| runs-on: ubuntu-latest | |
| if: ${{ github.event_name == 'merge_group' }} | |
| steps: | |
| - run: echo "The real DCO check happens on PRs only. This is a placeholder for the merge queue to keep the DCO check as a required status check." | |