Skip to content

refactor: refactor loss function #10273

refactor: refactor loss function

refactor: refactor loss function #10273

Workflow file for this run

# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Workflow identity, triggers and run de-duplication for the NeMo RL CI/CD pipeline.
name: "CICD NeMo RL"

on:
  # PR events against main and release ("r**") branches; "labeled" is included
  # because the test level is selected from the label that triggered the run.
  pull_request:
    branches: ["main", "r**"]
    types:
      - labeled
      - opened
      - synchronize
      - reopened
  merge_group:
    types:
      - checks_requested
  # Nightly run.
  schedule:
    - cron: "0 9 * * *"
  # Manual run with an explicit test level and an optional prebuilt image.
  workflow_dispatch:
    inputs:
      test_to_run:
        required: false
        default: L2
        type: choice
        options: [docs, Lfast, L0, L1, L2]
        description: >-
          Test level to run. docs = doc tests only,
          Lfast = fast subset (reuses main container),
          L0 = unit/docs/lint, L1 = L0 + functional,
          L2 = L1 + convergence
      image_tag:
        description: "Override container image tag (e.g. 'main'). Skips container build."
        required: false
        default: ""
  # TODO: Due to limited compute, disabling pushes to main. This is okay to do since we force PRs to be up to date and the CI tests on pull/$PR_NUM/merge
  #push:
  #  branches:
  #    - 'main'

# One active run per workflow + PR number (or ref) + triggering label; a newer
# run in the same group cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event.label.name || 'main' }}
  cancel-in-progress: true

jobs:
# Decides which test level this run executes and which container image tag
# (if any) downstream jobs should reuse instead of building a fresh image.
#
# Outputs:
#   test_level: one of docs | Lfast | L0 | L1 | L2 | none
#   image_tag:  empty string (build a container) or the tag to reuse
pre-flight:
  runs-on: ubuntu-latest
  outputs:
    test_level: ${{ steps.evaluate.outputs.test_level }}
    image_tag: ${{ steps.evaluate.outputs.image_tag }}
  steps:
    # Classifies the PR's changed files into "doc" (markdown / docs/) and
    # "src" (everything else). Only runs for pull_request events.
    - name: Get changed files
      id: changed-files
      if: github.event_name == 'pull_request'
      uses: step-security/changed-files@v45.0.1
      with:
        files_yaml: |
          doc:
            - '**.md'
            - docs/**
          src:
            - '!**.md'
            - '!docs/**'
    - name: Evaluate conditions
      id: evaluate
      env:
        DOCS_ONLY: ${{ steps.changed-files.outputs.doc_any_changed == 'true' && steps.changed-files.outputs.src_any_changed == 'false' }}
        CHANGED_DOCS: ${{ steps.changed-files.outputs.doc_all_changed_files }}
        CHANGED_SRC: ${{ steps.changed-files.outputs.src_all_changed_files }}
        IS_PULLREQUEST: ${{ github.event_name == 'pull_request' }}
        LABEL: ${{ github.event.label.name }}
        MERGE_GROUP: ${{ github.event_name == 'merge_group' }}
        # Context values are handed to the script through the environment
        # rather than expanded inline, so a crafted workflow_dispatch input
        # (image_tag is free-form text) cannot inject shell commands.
        EVENT_NAME: ${{ github.event_name }}
        INPUT_TEST_LEVEL: ${{ inputs.test_to_run }}
        INPUT_IMAGE_TAG: ${{ inputs.image_tag }}
      run: |
        # Some output that's helpful for debugging
        echo "Docs changed: $CHANGED_DOCS"
        echo "Src changed: $CHANGED_SRC"
        echo "LABEL: $LABEL"
        echo "IS_PULLREQUEST: $IS_PULLREQUEST"
        echo "DOCS_ONLY: $DOCS_ONLY"

        # Determine the test level. Branch order matters: doc-only changes and
        # the explicit CI:docs label win over every other label.
        if [[ "$DOCS_ONLY" == "true" || "$LABEL" == "CI:docs" ]]; then
          # For doc-only changes, run only doc tests
          TEST_LEVEL="docs"
        elif [[ "$LABEL" == "CI:Lfast" ]]; then
          TEST_LEVEL="Lfast"
        elif [[ "$LABEL" == "CI:L0" ]]; then
          TEST_LEVEL="L0"
        elif [[ "$LABEL" == "CI:L1" || "$IS_PULLREQUEST" == "false" || "$MERGE_GROUP" == "true" ]]; then
          # CI:L1-labeled PRs, any non-PR event (IS_PULLREQUEST=false) and
          # merge group events run L1 by default
          TEST_LEVEL="L1"
        elif [[ "$LABEL" == "CI:L2" ]]; then
          TEST_LEVEL="L2"
        else
          # Skip tests by default for non-labeled PRs
          TEST_LEVEL="none"
        fi

        if [[ "$EVENT_NAME" == "schedule" ]]; then
          echo "Setting test level to L1 for nightly scheduled run"
          TEST_LEVEL="L1"
        fi

        # Override test level if specified in workflow_dispatch
        if [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then
          echo "Overriding test level from $TEST_LEVEL to $INPUT_TEST_LEVEL"
          TEST_LEVEL="$INPUT_TEST_LEVEL"
        fi
        echo "test_level=$TEST_LEVEL" | tee -a "$GITHUB_OUTPUT"

        # Determine image tag: Lfast uses main, workflow_dispatch can override
        IMAGE_TAG=""
        if [[ "$TEST_LEVEL" == "Lfast" ]]; then
          IMAGE_TAG="main"
        fi
        if [[ "$EVENT_NAME" == "workflow_dispatch" && -n "$INPUT_IMAGE_TAG" ]]; then
          IMAGE_TAG="$INPUT_IMAGE_TAG"
        fi
        echo "image_tag=$IMAGE_TAG" | tee -a "$GITHUB_OUTPUT"
# Fails a pull request whose branch has fallen more than MAX_COMMITS_BEHIND
# commits behind its target branch. Skipped for every non-PR event.
pr-branch-up-to-date-check:
  name: Check if PR branch is up to date
  needs: [pre-flight]
  if: ${{ github.event_name == 'pull_request' }}
  runs-on: ubuntu-latest
  env:
    MAX_COMMITS_BEHIND: 10
  steps:
    - name: Check how many commits behind target branch
      env:
        GH_TOKEN: ${{ github.token }}
        REPO: ${{ github.repository }}
        BASE_SHA: ${{ github.event.pull_request.base.sha }}
        HEAD_SHA: ${{ github.event.pull_request.head.sha }}
        BASE_REF: ${{ github.base_ref }}
        HEAD_LABEL: ${{ github.event.pull_request.head.label }}
      run: |
        echo "Repository: $REPO"
        echo "Base branch: $BASE_REF (SHA: $BASE_SHA)"
        echo "PR head: $HEAD_LABEL (SHA: $HEAD_SHA)"
        echo "Maximum commits behind allowed: $MAX_COMMITS_BEHIND"

        # The comparison runs FROM the PR head TO the base branch, so the
        # API's "ahead_by" counts base-branch commits missing from the PR
        # (how far the PR is behind) and "behind_by" counts commits that
        # exist only on the PR (how far it is ahead).
        API_RESPONSE=$(gh api "repos/$REPO/compare/$HEAD_SHA...$BASE_REF" --jq '{behind_by: .behind_by, ahead_by: .ahead_by, status: .status}')
        COMMITS_BEHIND=$(echo "$API_RESPONSE" | jq -r '.ahead_by')
        COMMITS_AHEAD=$(echo "$API_RESPONSE" | jq -r '.behind_by')
        STATUS=$(echo "$API_RESPONSE" | jq -r '.status')

        echo "Comparison status: $STATUS"
        echo "PR is $COMMITS_BEHIND commits behind and $COMMITS_AHEAD commits ahead of $BASE_REF"

        # A non-numeric value (e.g. "null") would make the -gt test below
        # error out inside the `if`, fall through to the else branch and
        # wrongly report the PR as fresh; fail loudly instead.
        if ! [[ "$COMMITS_BEHIND" =~ ^[0-9]+$ ]]; then
          echo "❌ ERROR: Could not determine how many commits this PR is behind $BASE_REF (got '$COMMITS_BEHIND')."
          exit 1
        fi

        # Check if we're behind by more than the allowed number
        if [ "$COMMITS_BEHIND" -gt "$MAX_COMMITS_BEHIND" ]; then
          echo "❌ ERROR: This PR is $COMMITS_BEHIND commits behind $BASE_REF, which exceeds the maximum allowed ($MAX_COMMITS_BEHIND commits)."
          echo "Please rebase or merge the latest changes from $BASE_REF into your PR branch."
          exit 1
        else
          echo "✅ PR is acceptably fresh ($COMMITS_BEHIND commits behind, limit is $MAX_COMMITS_BEHIND)"
        fi
# Runs the pre-commit hooks over the whole repository, then verifies that every
# Python file pyrefly reports zero errors for is listed in pyrefly.toml.
lint-check:
  name: Lint check
  needs: [pre-flight]
  runs-on: ubuntu-latest
  steps:
    - name: Free up disk space
      run: |
        # Remove unnecessary packages and files on Ubuntu
        sudo apt-get clean
        sudo rm -rf /usr/local/lib/android || true
        sudo rm -rf /opt/ghc || true
        sudo rm -rf /usr/local/.ghcup || true
        sudo rm -rf /usr/share/dotnet || true
        sudo rm -rf /opt/az || true

        # Clear pip and npm caches
        pip cache purge || true
        sudo npm cache clean --force || true
    - name: Checkout repository
      uses: actions/checkout@v4
      with:
        submodules: 'recursive'
    - name: Install uv
      uses: astral-sh/setup-uv@v5
      with:
        version: "0.9.1"
        enable-cache: true
        prune-cache: false
    # Faster than uv python install since it caches python alongside runner
    - name: "Set up Python"
      uses: actions/setup-python@v5
      with:
        python-version-file: ".python-version"
    - name: Check lint
      run: |
        uv venv
        uv run --group dev pre-commit install
        uv run --group dev pre-commit run --all-files --show-diff-on-failure --color=always
    # TODO: this is a temporary check and should be removed once we have 100% correctness
    - name: Check if any files with zero errors not in whitelist
      run: |
        missing_count=0
        # The substitution prints every tracked Python file for which pyrefly
        # reports no errors: jq counts errors per path and keeps the files
        # whose count is zero (or absent).
        for file in $(
          uv run --group dev pyrefly check $(git ls-files 'nemo_rl/**/*.py' 'examples/**/*.py' 'docs/*.py' 'tools/**/*.py') --output-format json \
            | jq -r --slurpfile all_files <(git ls-files 'nemo_rl/**/*.py' 'examples/**/*.py' 'docs/*.py' 'tools/**/*.py' | jq -R -s 'split("\n")[:-1]') --arg pwd "$(pwd)/" '(.errors | group_by(.path) | map({(.[0].path | sub($pwd; "")): length}) | add // {}) as $error_counts | $all_files[0][] | . as $file | if ($error_counts[$file] // 0) == 0 then $file else empty end'
        ); do
          if ! grep -F -q "$file" pyrefly.toml; then
            echo "File $file has zero errors but is not in pyrefly.toml in the 'project-includes' list. Please add it to this whitelist."
            # Plain arithmetic assignment: `((missing_count++))` returns a
            # non-zero status when the old value is 0, which aborts the script
            # under the default `bash -e` after the first reported file.
            missing_count=$((missing_count + 1))
          fi
        done
        # Exit statuses wrap modulo 256, so report failure as a fixed code
        # instead of the raw count.
        if [ "$missing_count" -gt 0 ]; then
          echo "$missing_count file(s) are missing from the pyrefly.toml whitelist."
          exit 1
        fi
    - name: Minimize uv cache
      run: uv cache prune --ci
# Builds the documentation through the shared FW-CI-templates workflow whenever
# pre-flight selected any test level other than "none".
sphinx-build:
  needs:
    - pre-flight
  if: needs.pre-flight.outputs.test_level != 'none'
  uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_docs.yml@v0.57.0
# Builds the test container through the shared FW-CI-templates workflow.
# Skipped when no tests are selected or when pre-flight supplied an image tag
# to reuse.
build-container:
  needs:
    - pre-flight
  if: needs.pre-flight.outputs.test_level != 'none' && needs.pre-flight.outputs.image_tag == ''
  uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_container.yml@v0.52.0
  with:
    build-ref: ${{ github.sha }}
    image-name: nemo_rl_container
    dockerfile: docker/Dockerfile
    image-label: nemo-rl
    target: release
    build-contexts: |
      nemo-rl=${{ github.run_id }}/
    build-args: |
      MAX_JOBS=4
      NEMO_RL_COMMIT=${{ github.sha }}
# Documentation tests; selected for the docs, L0, L1 and L2 test levels.
cicd-doc-tests:
  needs:
    - pre-flight
    - build-container
  if: contains('docs L0 L1 L2', needs.pre-flight.outputs.test_level)
  strategy:
    fail-fast: false
    matrix:
      include:
        - {script: Docs_Tests, runner: self-hosted-azure}
  runs-on: ${{ matrix.runner }}
  # matrix.is_optional is not set by any entry above, so no prefix is added.
  name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }}
  environment: nemo-ci
  steps:
    - name: Checkout
      uses: actions/checkout@v4
    - name: main
      uses: ./.github/actions/test-template
      with:
        runner: ${{ runner.name }}
        script: ${{ matrix.script }}
        is_doc_test: "true"
        # True when the PR head lives in a different repository than its base.
        is_fork_pr: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name }}
# L0 unit tests, split into three matrix entries. Selected for the L0, L1, L2
# and Lfast test levels.
cicd-unit-tests:
  needs:
    - pre-flight
    - build-container
    - cicd-doc-tests
  # always() keeps this job eligible when a dependency was skipped; the
  # remaining clauses require pre-flight to have succeeded and allow the
  # container build and doc tests to be either successful or skipped.
  if: >-
    always() &&
    contains('L0 L1 L2 Lfast', needs.pre-flight.outputs.test_level) &&
    needs.pre-flight.result == 'success' &&
    (needs.build-container.result == 'success' || needs.build-container.result == 'skipped') &&
    (needs.cicd-doc-tests.result == 'success' || needs.cicd-doc-tests.result == 'skipped')
  strategy:
    fail-fast: false
    matrix:
      include:
        - {script: L0_Unit_Tests_Generation, runner: self-hosted-azure}
        - {script: L0_Unit_Tests_Policy, runner: self-hosted-azure}
        - {script: L0_Unit_Tests_Other, runner: self-hosted-azure}
  runs-on: ${{ matrix.runner }}
  name: ${{ matrix.script }}
  steps:
    - name: Checkout
      uses: actions/checkout@v4
    - name: main
      uses: ./.github/actions/test-template
      env:
        HF_TOKEN: ${{ secrets.HF_TOKEN }}
      with:
        runner: ${{ runner.name }}
        script: ${{ matrix.script }}
        # Empty unless pre-flight chose a prebuilt image to reuse.
        image-tag: ${{ needs.pre-flight.outputs.image_tag }}
        is_unit_test: "true"
        # No matrix entry sets cpu-only, so this falls back to false.
        cpu-only: ${{ matrix.cpu-only || false }}
        # True when the PR head lives in a different repository than its base.
        is_fork_pr: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name }}
# L1 functional GPU tests; selected for the L1 and L2 test levels.
cicd-functional-tests:
  needs:
    - pre-flight
    - build-container
    - cicd-unit-tests
  if: contains('L1 L2', needs.pre-flight.outputs.test_level)
  strategy:
    fail-fast: false
    matrix:
      include:
        - {script: L1_Functional_Tests_GPU, runner: self-hosted-azure}
  runs-on: ${{ matrix.runner }}
  # matrix.is_optional is not set by any entry above, so no prefix is added.
  name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }}
  environment: nemo-ci
  steps:
    - name: Checkout
      uses: actions/checkout@v4
    - name: main
      uses: ./.github/actions/test-template
      env:
        HF_TOKEN: ${{ secrets.HF_TOKEN }}
      with:
        runner: ${{ runner.name }}
        script: ${{ matrix.script }}
        # True when the PR head lives in a different repository than its base.
        is_fork_pr: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name }}
# Functional GPU tests for the Lfast level. Depends only on pre-flight and
# passes along the image tag that pre-flight selected.
cicd-fast-functional-tests:
  needs:
    - pre-flight
  if: needs.pre-flight.outputs.test_level == 'Lfast'
  strategy:
    fail-fast: false
    matrix:
      include:
        - {script: L1_Functional_Tests_GPU, runner: self-hosted-azure}
  runs-on: ${{ matrix.runner }}
  name: fast_${{ matrix.script }}
  environment: nemo-ci
  steps:
    - name: Checkout
      uses: actions/checkout@v4
    - name: main
      uses: ./.github/actions/test-template
      env:
        HF_TOKEN: ${{ secrets.HF_TOKEN }}
      with:
        runner: ${{ runner.name }}
        script: ${{ matrix.script }}
        image-tag: ${{ needs.pre-flight.outputs.image_tag }}
        # True when the PR head lives in a different repository than its base.
        is_fork_pr: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name }}
# Single aggregate status for the workflow: always runs, writes every upstream
# job's result to the step summary, and fails unless the run is acceptable.
CI_QA_Gate:
  name: "CI quality check${{ needs.pre-flight.outputs.test_level == 'none' && ' (No tests run: Label CI:L*)' || '' }}"
  if: always()
  runs-on: ubuntu-latest
  needs:
    - pre-flight
    - pr-branch-up-to-date-check
    - lint-check
    - sphinx-build
    - build-container
    - cicd-doc-tests
    - cicd-unit-tests
    - cicd-functional-tests
    - cicd-fast-functional-tests
  steps:
    - name: main
      env:
        JOB_RESULTS: ${{ toJSON(needs) }}
        # The gate passes only when a test level was selected
        # (test_level != 'none'), lint and the docs build succeeded, and every
        # other job either succeeded or was skipped. A run with
        # test_level == 'none' (e.g. an unlabeled PR) therefore fails the gate
        # unless CI_SKIP below is true.
        ALL_SUCCESS: >-
          ${{
            needs.lint-check.result == 'success' &&
            (needs.pr-branch-up-to-date-check.result == 'success' || needs.pr-branch-up-to-date-check.result == 'skipped') &&
            needs.pre-flight.outputs.test_level != 'none' &&
            needs.sphinx-build.result == 'success' &&
            (needs.build-container.result == 'success' || needs.build-container.result == 'skipped') &&
            (needs.cicd-doc-tests.result == 'success' || needs.cicd-doc-tests.result == 'skipped') &&
            (needs.cicd-unit-tests.result == 'skipped' || needs.cicd-unit-tests.result == 'success') &&
            (needs.cicd-functional-tests.result == 'skipped' || needs.cicd-functional-tests.result == 'success') &&
            (needs.cicd-fast-functional-tests.result == 'skipped' || needs.cicd-fast-functional-tests.result == 'success')
          }}
        # True only when this run was triggered by adding the 'Skip CICD' label.
        CI_SKIP: ${{ github.event.label.name == 'Skip CICD' }}
        TEST_LEVEL: ${{ needs.pre-flight.outputs.test_level }}
      run: |
        # One "job: result" line per upstream job.
        SUMMARY=$(printf '%s\n' "$JOB_RESULTS" | jq -r 'to_entries[] | .key + ": " + .value.result')
        echo "🤖: CICD Result for test level: $TEST_LEVEL" >> "$GITHUB_STEP_SUMMARY"
        echo "$SUMMARY" >> "$GITHUB_STEP_SUMMARY"
        test "$ALL_SUCCESS" = "true" || test "$CI_SKIP" = "true"
# Posts a Slack message when the scheduled (nightly) run's quality gate failed.
notify-nightly-failure:
  name: Notify nightly test failure
  runs-on: ubuntu-latest
  needs: [CI_QA_Gate]
  environment: main
  if: ${{ always() && github.event_name == 'schedule' && needs.CI_QA_Gate.result == 'failure' }}
  steps:
    - name: Send Slack notification
      env:
        SLACK_WEBHOOK: ${{ secrets.SLACK_TEAM_CHANNEL_WEBHOOK }}
      run: |
        # Slack Block Kit payload with the repository, commit and a link to
        # this workflow run.
        MESSAGE='{
          "blocks": [
            {
              "type": "section",
              "text": {
                "type": "mrkdwn",
                "text": "🚨 Nightly GitHub CI test failed on main branch\n\n• Repository: ${{ github.repository }}\n• Commit: `${{ github.sha }}`\n• Workflow: <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View Run>"
              }
            }
          ]
        }'

        # --fail makes curl exit non-zero on an HTTP error response, so a
        # rejected or misconfigured webhook fails this step instead of being
        # reported as a delivered notification.
        curl --fail --silent --show-error -X POST -H "Content-type: application/json" --data "$MESSAGE" "$SLACK_WEBHOOK"
# Aggregates the coverage artifacts produced by the test jobs, one matrix entry
# per coverage flag. Always runs; each entry does nothing further when no
# matching artifacts exist.
Coverage:
  runs-on: ubuntu-latest
  needs: [CI_QA_Gate, cicd-doc-tests, cicd-unit-tests, cicd-functional-tests]
  if: ${{ always() }}
  strategy:
    matrix:
      flag:
        - doc-test
        - unit-test
        - e2e
  steps:
    - name: Checkout
      uses: actions/checkout@v4
    - name: Download coverage reports of current branch
      uses: actions/download-artifact@v4
      with:
        pattern: coverage-${{ matrix.flag }}-*
    # Publishes artifacts-found=true|false for the conditional steps below.
    - name: Check if artifacts were downloaded
      id: check-artifacts
      run: |
        # Check if any coverage directories were downloaded
        if ls coverage-* > /dev/null 2>&1; then
          echo "artifacts-found=true" >> "$GITHUB_OUTPUT"
          echo "Found coverage artifacts for ${{ matrix.flag }}"
        else
          echo "artifacts-found=false" >> "$GITHUB_OUTPUT"
          echo "No coverage artifacts found for ${{ matrix.flag }}"
        fi
    # Combines the per-job .coverage files, prints the report, then removes
    # the downloaded directories.
    - name: Get total coverage of current branch
      if: steps.check-artifacts.outputs.artifacts-found == 'true'
      shell: bash -x -e -u -o pipefail {0}
      run: |
        pip install coverage

        ls -al .
        ls -al coverage-*/
        coverage combine --keep $(ls coverage-*/.coverage)
        coverage report -i --show-missing
        rm -rf coverage-*
        ls -al
    - name: Skip coverage processing
      if: steps.check-artifacts.outputs.artifacts-found == 'false'
      run: |
        echo "No coverage artifacts found for ${{ matrix.flag }}, skipping coverage processing"
    - name: Upload coverage reports to Codecov
      if: steps.check-artifacts.outputs.artifacts-found == 'true'
      uses: codecov/codecov-action@v5
      with:
        token: ${{ secrets.CODECOV_TOKEN }}
        verbose: true
        flags: ${{ matrix.flag }}
    - name: Upload artifacts
      if: steps.check-artifacts.outputs.artifacts-found == 'true'
      uses: actions/upload-artifact@v4
      with:
        name: coverage-${{ matrix.flag }}-aggregated
        path: |
          .coverage
        include-hidden-files: true
# Placeholder job named "DCO" that runs only for merge_group events; its single
# step just prints an explanatory message.
DCO_merge_group:
  name: DCO
  if: ${{ github.event_name == 'merge_group' }}
  runs-on: ubuntu-latest
  steps:
    - run: |
        echo "The real DCO check happens on PRs only. This is a placeholder for the merge queue to keep the DCO check as a required status check."