refactor: refactor loss function #10273

Workflow file for this run

.github/workflows/cicd-main.yml at 443d7ad

	# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	# Display name of this workflow.
	name: "CICD NeMo RL"

	# Events that start the workflow.
	on:
	  # Pull requests against main or any branch matching "r**"; re-evaluated when a
	  # label is added or the PR is opened, updated, or reopened.
	  pull_request:
	    types:
	      - labeled
	      - opened
	      - synchronize
	      - reopened
	    branches:
	      - "main"
	      - "r**"
	  # Merge-queue check requests.
	  merge_group:
	    types:
	      - checks_requested
	  # Nightly scheduled run.
	  schedule:
	    - cron: "0 9 * * *"
	  # Manual run with an explicit test level and optional prebuilt image.
	  workflow_dispatch:
	    inputs:
	      test_to_run:
	        description: "Test level to run. docs = doc tests only, Lfast = fast subset (reuses main container), L0 = unit/docs/lint, L1 = L0 + functional, L2 = L1 + convergence"
	        type: choice
	        options:
	          - docs
	          - Lfast
	          - L0
	          - L1
	          - L2
	        default: L2
	        required: false
	      image_tag:
	        description: "Override container image tag (e.g. 'main'). Skips container build."
	        default: ""
	        required: false
	  # TODO: Due to limited compute, disabling pushes to main. This is okay to do since we force PRs to be up to date and the CI tests on pull/$PR_NUM/merge
	  #push:
	  #  branches:
	  #    - 'main'

	# One concurrency group per workflow + PR number (falling back to the ref) +
	# triggering label (falling back to 'main'); a newer run in the same group
	# cancels the one still in progress.
	concurrency:
	  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event.label.name || 'main' }}
	  cancel-in-progress: true

	jobs:
	# Decides which test level this run executes (docs / Lfast / L0 / L1 / L2 /
	# none) and which container image tag downstream jobs should use. Both values
	# are exposed as job outputs.
	pre-flight:
	  runs-on: ubuntu-latest
	  outputs:
	    test_level: ${{ steps.evaluate.outputs.test_level }}
	    image_tag: ${{ steps.evaluate.outputs.image_tag }}
	  steps:
	    # Classify the PR's changed files into "doc" and "src" groups.
	    - name: Get changed files
	      id: changed-files
	      if: github.event_name == 'pull_request'
	      uses: step-security/changed-files@v45.0.1
	      with:
	        files_yaml: |
	          doc:
	            - '**.md'
	            - docs/**
	          src:
	            - '!**.md'
	            - '!docs/**'

	    - name: Evaluate conditions
	      id: evaluate
	      env:
	        DOCS_ONLY: ${{ steps.changed-files.outputs.doc_any_changed == 'true' && steps.changed-files.outputs.src_any_changed == 'false' }}
	        CHANGED_DOCS: ${{ steps.changed-files.outputs.doc_all_changed_files }}
	        CHANGED_SRC: ${{ steps.changed-files.outputs.src_all_changed_files }}
	        IS_PULLREQUEST: ${{ github.event_name == 'pull_request' }}
	        LABEL: ${{ github.event.label.name }}
	        MERGE_GROUP: ${{ github.event_name == 'merge_group' }}
	        # Context values are handed to the script through the environment
	        # rather than interpolated into the script text, so a free-form
	        # workflow_dispatch input cannot be executed as shell code.
	        EVENT_NAME: ${{ github.event_name }}
	        DISPATCH_TEST_LEVEL: ${{ inputs.test_to_run }}
	        DISPATCH_IMAGE_TAG: ${{ inputs.image_tag }}
	      run: |
	        # Some output that's helpful for debugging
	        echo "Docs changed: $CHANGED_DOCS"
	        echo "Src changed: $CHANGED_SRC"
	        echo "LABEL: $LABEL"
	        echo "IS_PULLREQUEST: $IS_PULLREQUEST"
	        echo "DOCS_ONLY: $DOCS_ONLY"

	        # Run CI only (on main or if label is attached) and if it's not only docs
	        # Determine test level based on conditions
	        if [[ "$DOCS_ONLY" == "true" || "$LABEL" == "CI:docs" ]]; then
	          # For doc-only changes, run only doc tests
	          TEST_LEVEL="docs"
	        elif [[ "$LABEL" == "CI:Lfast" ]]; then
	          TEST_LEVEL="Lfast"
	        elif [[ "$LABEL" == "CI:L0" ]]; then
	          TEST_LEVEL="L0"
	        elif [[ "$LABEL" == "CI:L1" || "$IS_PULLREQUEST" == "false" || "$MERGE_GROUP" == "true" ]]; then
	          # For labeled PRs, pushes to main (IS_PULLREQUEST=false), or merge group events, run L1 by default
	          TEST_LEVEL="L1"
	        elif [[ "$LABEL" == "CI:L2" ]]; then
	          TEST_LEVEL="L2"
	        else
	          # Skip tests by default for non-labeled PRs
	          TEST_LEVEL="none"
	        fi

	        if [[ "$EVENT_NAME" == "schedule" ]]; then
	          echo "Setting test level to L1 for nightly scheduled run"
	          TEST_LEVEL="L1"
	        fi

	        # Override test level if specified in workflow_dispatch
	        if [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then
	          echo "Overriding test level from $TEST_LEVEL to $DISPATCH_TEST_LEVEL"
	          TEST_LEVEL="$DISPATCH_TEST_LEVEL"
	        fi

	        echo "test_level=$TEST_LEVEL" | tee -a "$GITHUB_OUTPUT"

	        # Determine image tag: Lfast uses main, workflow_dispatch can override
	        IMAGE_TAG=""
	        if [[ "$TEST_LEVEL" == "Lfast" ]]; then
	          IMAGE_TAG="main"
	        fi
	        if [[ "$EVENT_NAME" == "workflow_dispatch" && -n "$DISPATCH_IMAGE_TAG" ]]; then
	          IMAGE_TAG="$DISPATCH_IMAGE_TAG"
	        fi
	        echo "image_tag=$IMAGE_TAG" | tee -a "$GITHUB_OUTPUT"

	# Fails a pull request whose head is more than MAX_COMMITS_BEHIND commits
	# behind its base branch.
	pr-branch-up-to-date-check:
	  name: Check if PR branch is up to date
	  needs: [pre-flight]
	  if: ${{ github.event_name == 'pull_request' }}
	  runs-on: ubuntu-latest
	  env:
	    MAX_COMMITS_BEHIND: 10
	  steps:
	    - name: Check how many commits behind target branch
	      env:
	        GH_TOKEN: ${{ github.token }}
	        REPO: ${{ github.repository }}
	        BASE_SHA: ${{ github.event.pull_request.base.sha }}
	        HEAD_SHA: ${{ github.event.pull_request.head.sha }}
	        BASE_REF: ${{ github.base_ref }}
	        HEAD_LABEL: ${{ github.event.pull_request.head.label }}
	      run: |
	        echo "Repository: $REPO"
	        echo "Base branch: $BASE_REF (SHA: $BASE_SHA)"
	        echo "PR head: $HEAD_LABEL (SHA: $HEAD_SHA)"
	        echo "Maximum commits behind allowed: $MAX_COMMITS_BEHIND"

	        # The comparison is requested as HEAD_SHA...BASE_REF, i.e. with the PR
	        # head on the left-hand side. The API's "ahead_by" is therefore the
	        # number of commits the base branch has that the PR lacks (how far the
	        # PR is behind), and "behind_by" is how far the PR is ahead.
	        API_RESPONSE=$(gh api "repos/$REPO/compare/$HEAD_SHA...$BASE_REF" --jq '{behind_by: .behind_by, ahead_by: .ahead_by, status: .status}')

	        COMMITS_BEHIND=$(echo "$API_RESPONSE" | jq -r '.ahead_by')
	        COMMITS_AHEAD=$(echo "$API_RESPONSE" | jq -r '.behind_by')
	        STATUS=$(echo "$API_RESPONSE" | jq -r '.status')

	        echo "Comparison status: $STATUS"
	        echo "PR is $COMMITS_BEHIND commits behind and $COMMITS_AHEAD commits ahead of $BASE_REF"

	        # A non-numeric value (e.g. "null") would make the numeric test below
	        # error out and fall through to the success branch, so reject it here.
	        if ! [[ "$COMMITS_BEHIND" =~ ^[0-9]+$ ]]; then
	          echo "❌ ERROR: Could not determine how many commits this PR is behind $BASE_REF (API response: $API_RESPONSE)."
	          exit 1
	        fi

	        # Check if we're behind by more than the allowed number
	        if [ "$COMMITS_BEHIND" -gt "$MAX_COMMITS_BEHIND" ]; then
	          echo "❌ ERROR: This PR is $COMMITS_BEHIND commits behind $BASE_REF, which exceeds the maximum allowed ($MAX_COMMITS_BEHIND commits)."
	          echo "Please rebase or merge the latest changes from $BASE_REF into your PR branch."
	          exit 1
	        else
	          echo "✅ PR is acceptably fresh ($COMMITS_BEHIND commits behind, limit is $MAX_COMMITS_BEHIND)"
	        fi

	# Runs the pre-commit hooks over the whole repository and checks that every
	# Python file pyrefly reports zero errors for is listed in pyrefly.toml.
	lint-check:
	  name: Lint check
	  needs: [pre-flight]
	  runs-on: ubuntu-latest
	  steps:
	    - name: Free up disk space
	      run: |
	        # Remove unnecessary packages and files on Ubuntu
	        sudo apt-get clean
	        sudo rm -rf /usr/local/lib/android || true
	        sudo rm -rf /opt/ghc || true
	        sudo rm -rf /usr/local/.ghcup || true
	        sudo rm -rf /usr/share/dotnet || true
	        sudo rm -rf /opt/az || true
	        # Clear pip and npm caches
	        pip cache purge || true
	        sudo npm cache clean --force || true
	    - name: Checkout repository
	      uses: actions/checkout@v4
	      with:
	        submodules: 'recursive'
	    - name: Install uv
	      uses: astral-sh/setup-uv@v5
	      with:
	        version: "0.9.1"
	        enable-cache: true
	        prune-cache: false
	    # Faster than uv python install since it caches python alongside runner
	    - name: "Set up Python"
	      uses: actions/setup-python@v5
	      with:
	        python-version-file: ".python-version"
	    - name: Check lint
	      run: |
	        uv venv
	        uv run --group dev pre-commit install
	        uv run --group dev pre-commit run --all-files --show-diff-on-failure --color=always
	    # TODO: this is a temporary check and should be removed once we have 100% correctness
	    - name: Check if any files with zero errors not in whitelist
	      run: |
	        # Pathspecs of the Python sources that are type-checked; shared by the
	        # pyrefly invocation and by the list the jq filter walks.
	        py_globs=('nemo_rl/**/*.py' 'examples/**/*.py' 'docs/*.py' 'tools/**/*.py')

	        missing_count=0
	        # jq builds a {path: error_count} map from pyrefly's JSON output and
	        # emits every tracked file whose count is zero.
	        for file in $(uv run --group dev pyrefly check $(git ls-files "${py_globs[@]}") --output-format json | jq -r --slurpfile all_files <(git ls-files "${py_globs[@]}" | jq -R -s 'split("\n")[:-1]') --arg pwd "$(pwd)/" '(.errors | group_by(.path) | map({(.[0].path | sub($pwd; "")): length}) | add // {}) as $error_counts | $all_files[0][] | . as $file | if ($error_counts[$file] // 0) == 0 then $file else empty end'); do
	          if ! grep -qF "$file" pyrefly.toml; then
	            echo "File $file has zero errors but is not in pyrefly.toml in the 'project-includes' list. Please add it to this whitelist."
	            # Plain assignment: ((missing_count++)) returns status 1 when the
	            # old value is 0, which would abort the script under `bash -e`
	            # after reporting only the first file.
	            missing_count=$((missing_count + 1))
	          fi
	        done

	        # Exit codes wrap modulo 256, so report failure with a fixed status
	        # instead of the raw count.
	        if [ "$missing_count" -gt 0 ]; then
	          echo "$missing_count file(s) are missing from pyrefly.toml."
	          exit 1
	        fi
	    - name: Minimize uv cache
	      run: uv cache prune --ci

	# Calls the shared FW-CI-templates docs-build workflow whenever pre-flight
	# selected a test level other than 'none'.
	sphinx-build:
	  uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_docs.yml@v0.57.0
	  needs:
	    - pre-flight
	  if: needs.pre-flight.outputs.test_level != 'none'

	# Builds the test container through the shared FW-CI-templates workflow.
	# Skipped when no tests are requested or when pre-flight supplied an image tag
	# to reuse.
	build-container:
	  if: ${{ needs.pre-flight.outputs.test_level != 'none' && needs.pre-flight.outputs.image_tag == '' }}
	  needs: [pre-flight]
	  uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_container.yml@v0.52.0
	  with:
	    build-ref: ${{ github.sha }}
	    image-name: nemo_rl_container
	    dockerfile: docker/Dockerfile
	    image-label: nemo-rl
	    target: release
	    build-contexts: |
	      nemo-rl=${{ github.run_id }}/
	    build-args: |
	      MAX_JOBS=4
	      NEMO_RL_COMMIT=${{ github.sha }}

	# Runs the documentation tests for the docs, L0, L1 and L2 test levels.
	cicd-doc-tests:
	  strategy:
	    fail-fast: false
	    matrix:
	      include:
	        - script: Docs_Tests
	          runner: self-hosted-azure
	  needs: [pre-flight, build-container]
	  # contains() is a substring match against the space-separated level list.
	  if: ${{ contains('docs L0 L1 L2', needs.pre-flight.outputs.test_level) }}
	  runs-on: ${{ matrix.runner }}
	  # Matrix entries flagged is_optional get a PLEASEFIXME_ prefix in their name.
	  name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }}
	  environment: nemo-ci
	  steps:
	    - name: Checkout
	      uses: actions/checkout@v4
	    - name: main
	      uses: ./.github/actions/test-template
	      with:
	        runner: ${{ runner.name }}
	        script: ${{ matrix.script }}
	        is_doc_test: "true"
	        # True when the PR head lives in a different repository than its base.
	        is_fork_pr: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name }}

	# Runs the L0 unit-test suites for the L0, L1, L2 and Lfast test levels.
	cicd-unit-tests:
	  strategy:
	    fail-fast: false
	    matrix:
	      include:
	        - script: L0_Unit_Tests_Generation
	          runner: self-hosted-azure
	        - script: L0_Unit_Tests_Policy
	          runner: self-hosted-azure
	        - script: L0_Unit_Tests_Other
	          runner: self-hosted-azure
	  needs: [pre-flight, build-container, cicd-doc-tests]
	  # always() lets the job run when build-container or cicd-doc-tests were
	  # skipped; the explicit result checks still block it when either failed.
	  if: >-
	    ${{
	      always() &&
	      contains('L0 L1 L2 Lfast', needs.pre-flight.outputs.test_level) &&
	      needs.pre-flight.result == 'success' &&
	      (needs.build-container.result == 'success' || needs.build-container.result == 'skipped') &&
	      (needs.cicd-doc-tests.result == 'success' || needs.cicd-doc-tests.result == 'skipped')
	    }}
	  runs-on: ${{ matrix.runner }}
	  name: ${{ matrix.script }}
	  steps:
	    - name: Checkout
	      uses: actions/checkout@v4
	    - name: main
	      uses: ./.github/actions/test-template
	      env:
	        HF_TOKEN: ${{ secrets.HF_TOKEN }}
	      with:
	        runner: ${{ runner.name }}
	        script: ${{ matrix.script }}
	        image-tag: ${{ needs.pre-flight.outputs.image_tag }}
	        is_unit_test: "true"
	        # No matrix entry above sets cpu-only, so this resolves to false.
	        cpu-only: ${{ matrix.cpu-only || false }}
	        # True when the PR head lives in a different repository than its base.
	        is_fork_pr: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name }}

	# Runs the L1 functional GPU tests for the L1 and L2 test levels, after the
	# unit tests.
	cicd-functional-tests:
	  strategy:
	    fail-fast: false
	    matrix:
	      include:
	        - script: L1_Functional_Tests_GPU
	          runner: self-hosted-azure
	  needs: [pre-flight, build-container, cicd-unit-tests]
	  runs-on: ${{ matrix.runner }}
	  # NOTE(review): unlike cicd-unit-tests this condition has no always(), so the
	  # job is presumably skipped whenever build-container is skipped (image_tag
	  # override) — confirm that is intended.
	  if: ${{ contains('L1 L2', needs.pre-flight.outputs.test_level) }}
	  # Matrix entries flagged is_optional get a PLEASEFIXME_ prefix in their name.
	  name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }}
	  environment: nemo-ci
	  steps:
	    - name: Checkout
	      uses: actions/checkout@v4
	    - name: main
	      uses: ./.github/actions/test-template
	      env:
	        HF_TOKEN: ${{ secrets.HF_TOKEN }}
	      with:
	        runner: ${{ runner.name }}
	        script: ${{ matrix.script }}
	        # True when the PR head lives in a different repository than its base.
	        is_fork_pr: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name }}

	# Lfast-only variant of the functional GPU tests. It depends on pre-flight
	# alone and runs with the image tag pre-flight selected.
	cicd-fast-functional-tests:
	  name: fast_${{ matrix.script }}
	  needs:
	    - pre-flight
	  if: needs.pre-flight.outputs.test_level == 'Lfast'
	  runs-on: ${{ matrix.runner }}
	  environment: nemo-ci
	  strategy:
	    fail-fast: false
	    matrix:
	      include:
	        - runner: self-hosted-azure
	          script: L1_Functional_Tests_GPU
	  steps:
	    - name: Checkout
	      uses: actions/checkout@v4
	    - name: main
	      uses: ./.github/actions/test-template
	      env:
	        HF_TOKEN: ${{ secrets.HF_TOKEN }}
	      with:
	        script: ${{ matrix.script }}
	        runner: ${{ runner.name }}
	        image-tag: ${{ needs.pre-flight.outputs.image_tag }}
	        # True when the PR head lives in a different repository than its base.
	        is_fork_pr: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name }}

	# Single gate job that summarises the results of all CI jobs and fails unless
	# they were acceptable or the 'Skip CICD' label triggered the run.
	CI_QA_Gate:
	  name: "CI quality check${{ needs.pre-flight.outputs.test_level == 'none' && ' (No tests run: Label CI:L*)' || '' }}"
	  if: always()
	  runs-on: ubuntu-latest
	  needs:
	    - pre-flight
	    - pr-branch-up-to-date-check
	    - lint-check
	    - sphinx-build
	    - build-container
	    - cicd-doc-tests
	    - cicd-unit-tests
	    - cicd-functional-tests
	    - cicd-fast-functional-tests
	  steps:
	    - name: main
	      env:
	        JOB_RESULTS: ${{ toJSON(needs) }}
	        # True only when a test level was selected (test_level != 'none'), lint
	        # and the docs build succeeded, and every other job either succeeded or
	        # was skipped. A run with test_level 'none' therefore does not pass the
	        # gate unless CI_SKIP is set.
	        ALL_SUCCESS: >-
	          ${{
	            needs.lint-check.result == 'success' &&
	            (needs.pr-branch-up-to-date-check.result == 'success' || needs.pr-branch-up-to-date-check.result == 'skipped') &&
	            (
	              needs.pre-flight.outputs.test_level != 'none' &&
	              needs.sphinx-build.result == 'success' &&
	              (needs.build-container.result == 'success' || needs.build-container.result == 'skipped') &&
	              (
	                (
	                  (needs.cicd-doc-tests.result == 'success' || needs.cicd-doc-tests.result == 'skipped') &&
	                  (needs.cicd-unit-tests.result == 'skipped' || needs.cicd-unit-tests.result == 'success') &&
	                  (needs.cicd-functional-tests.result == 'skipped' || needs.cicd-functional-tests.result == 'success') &&
	                  (needs.cicd-fast-functional-tests.result == 'skipped' || needs.cicd-fast-functional-tests.result == 'success')
	                )
	              )
	            )
	          }}
	        CI_SKIP: ${{ github.event.label.name == 'Skip CICD' }}
	        TEST_LEVEL: ${{ needs.pre-flight.outputs.test_level }}
	      run: |
	        # One "<job>: <result>" line per needed job.
	        SUMMARY=$(echo "$JOB_RESULTS" | jq -r 'to_entries[] | .key + ": " + .value.result')
	        echo "🤖: CICD Result for test level: $TEST_LEVEL" >> "$GITHUB_STEP_SUMMARY"
	        echo "$SUMMARY" >> "$GITHUB_STEP_SUMMARY"
	        test "$ALL_SUCCESS" = "true" || test "$CI_SKIP" = "true"

	# Posts a Slack message when the gate job fails on a scheduled (nightly) run.
	notify-nightly-failure:
	  name: Notify nightly test failure
	  runs-on: ubuntu-latest
	  needs: [CI_QA_Gate]
	  environment: main
	  if: ${{ always() && github.event_name == 'schedule' && needs.CI_QA_Gate.result == 'failure' }}
	  steps:
	    - name: Send Slack notification
	      env:
	        SLACK_WEBHOOK: ${{ secrets.SLACK_TEAM_CHANNEL_WEBHOOK }}
	      run: |
	        # The payload is single-quoted so the shell leaves "\n" and the
	        # backticks untouched; "<url|label>" is the link markup in the text.
	        MESSAGE='{
	          "blocks": [
	            {
	              "type": "section",
	              "text": {
	                "type": "mrkdwn",
	                "text": "🚨 Nightly GitHub CI test failed on main branch\n\n• Repository: ${{ github.repository }}\n• Commit: `${{ github.sha }}`\n• Workflow: <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View Run>"
	              }
	            }
	          ]
	        }'

	        curl -X POST -H "Content-type: application/json" --data "$MESSAGE" "$SLACK_WEBHOOK"

	# Combines the per-job coverage artifacts for each flag (doc-test, unit-test,
	# e2e), uploads the result to Codecov and re-publishes it as an aggregated
	# artifact. Every processing step is skipped when no artifacts exist.
	Coverage:
	  runs-on: ubuntu-latest
	  needs:
	    - CI_QA_Gate
	    - cicd-doc-tests
	    - cicd-unit-tests
	    - cicd-functional-tests
	  if: always()
	  strategy:
	    matrix:
	      flag: [doc-test, unit-test, e2e]
	  steps:
	    - name: Checkout
	      uses: actions/checkout@v4

	    - name: Download coverage reports of current branch
	      uses: actions/download-artifact@v4
	      with:
	        pattern: coverage-${{ matrix.flag }}-*

	    - name: Check if artifacts were downloaded
	      id: check-artifacts
	      run: |
	        # Check if any coverage directories were downloaded
	        if ls coverage-* 1> /dev/null 2>&1; then
	          echo "artifacts-found=true" >> "$GITHUB_OUTPUT"
	          echo "Found coverage artifacts for ${{ matrix.flag }}"
	        else
	          echo "artifacts-found=false" >> "$GITHUB_OUTPUT"
	          echo "No coverage artifacts found for ${{ matrix.flag }}"
	        fi

	    - name: Get total coverage of current branch
	      shell: bash -x -e -u -o pipefail {0}
	      if: ${{ steps.check-artifacts.outputs.artifacts-found == 'true' }}
	      run: |
	        pip install coverage

	        ls -al .
	        ls -al coverage-*/
	        # Merge every downloaded .coverage file into ./.coverage, then drop the
	        # per-job directories.
	        coverage combine --keep $(ls coverage-*/.coverage)
	        coverage report -i --show-missing
	        rm -rf coverage-*
	        ls -al

	    - name: Skip coverage processing
	      if: ${{ steps.check-artifacts.outputs.artifacts-found == 'false' }}
	      run: |
	        echo "No coverage artifacts found for ${{ matrix.flag }}, skipping coverage processing"

	    - name: Upload coverage reports to Codecov
	      if: ${{ steps.check-artifacts.outputs.artifacts-found == 'true' }}
	      uses: codecov/codecov-action@v5
	      with:
	        token: ${{ secrets.CODECOV_TOKEN }}
	        verbose: true
	        flags: ${{ matrix.flag }}

	    - name: Upload artifacts
	      if: ${{ steps.check-artifacts.outputs.artifacts-found == 'true' }}
	      uses: actions/upload-artifact@v4
	      with:
	        name: coverage-${{ matrix.flag }}-aggregated
	        path: |
	          .coverage
	        include-hidden-files: true

	# Placeholder job named "DCO" that runs only for merge_group events, so the
	# required DCO status check still reports in the merge queue.
	DCO_merge_group:
	  name: DCO
	  runs-on: ubuntu-latest
	  if: ${{ github.event_name == 'merge_group' }}
	  steps:
	    - run: >-
	        echo "The real DCO check happens on PRs only. This is a placeholder for the merge queue to keep the DCO check as a required status check."

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

refactor: refactor loss function #10273

Workflow file

refactor: refactor loss function #10273

Uh oh!

Workflow file for this run