Skip to content

CICD NeMo

CICD NeMo #3875

Workflow file for this run

# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Workflow-level configuration: triggers, concurrency, default token
# permissions and shared environment values for every job below.
name: CICD NeMo

on:
  # Scheduled run: minute 0, hour 0, every day.
  schedule:
    - cron: "0 0 * * *"
  # Pushes to main, to mirrored pull-request branches and to release branches.
  push:
    branches:
      - main
      - "pull-request/[0-9]+"
      - "deploy-release/*"

# One concurrency group per workflow / PR-number-or-ref / label-or-'main' /
# event name; a newer run in the same group cancels the one in progress.
# NOTE(review): with only schedule and push triggers declared above, the
# pull_request and label fields are presumably empty, so the fallbacks
# (github.ref and 'main') are what actually apply - confirm.
concurrency:
  group: "${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event.label.name || 'main' }}-${{ github.event_name }}"
  cancel-in-progress: true

# Default GITHUB_TOKEN permissions for all jobs unless a job overrides them.
permissions:
  id-token: write
  contents: read

env:
  # Registry host used to build the container-image reference in test jobs.
  container-registry: nemoci.azurecr.io

jobs:
# Calls the reusable pre-flight workflow from FW-CI-templates. Later jobs read
# its outputs (runner_prefix, test_data_path, is_ci_workload, docs_only,
# is_deployment_workflow, force_run_all) to pick runners and decide whether to
# run at all.
pre-flight:
  uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.69.1
  secrets:
    NVIDIA_MANAGEMENT_ORG_PAT: ${{ secrets.NVIDIA_MANAGEMENT_ORG_PAT }}
  with:
    # Runner label prefixes, taken from repository/organization variables.
    default_runner_prefix: ${{ vars.DEFAULT_RUNNER_PREFIX }}
    non_nvidia_runner_prefix: ${{ vars.NON_NVIDIA_RUNNER_PREFIX }}
    # Test-data locations matching the two runner prefixes above.
    default_test_data_path: ${{ vars.DEFAULT_TEST_DATA_PATH }}
    non_nvidia_test_data_path: ${{ vars.NON_NVIDIA_TEST_DATA_PATH }}
# Runs ruff (lint + format check) over the repository on a hosted runner.
linting:
  runs-on: ubuntu-latest
  steps:
    - name: Checkout
      uses: actions/checkout@v4

    - name: Setup Python
      uses: actions/setup-python@v5
      with:
        # Quoted so YAML does not read the version as the float 3.1.
        python-version: "3.10"

    - name: Set up UV
      uses: astral-sh/setup-uv@v1
      with:
        version: "0.8.22"

    - name: Install ruff
      env:
        # Makes `uv sync` install into ./venv.
        UV_PROJECT_ENVIRONMENT: ./venv
      run: |
        uv venv ${UV_PROJECT_ENVIRONMENT} --system-site-packages
        source ./venv/bin/activate
        export PATH="./bin/:$PATH"
        uv sync --link-mode copy --locked --group linting

    - name: Run ruff
      run: |
        # --active makes uv use the venv activated on the previous line.
        source ./venv/bin/activate
        uv run --active ruff --version
        uv run --active ruff check --verbose .
        uv run --active ruff format --check --verbose .
# Runs import-linter (`lint-imports`) to check the project's import contracts.
import_linting:
  runs-on: ubuntu-latest
  steps:
    - name: Checkout
      uses: actions/checkout@v4

    - name: Setup Python
      uses: actions/setup-python@v5
      with:
        # Quoted so YAML does not read the version as the float 3.1.
        python-version: "3.10"

    - name: Set up UV
      uses: astral-sh/setup-uv@v1
      with:
        version: "0.8.22"

    # Installs the whole `linting` dependency group (which provides
    # lint-imports), not just ruff.
    - name: Install linting dependencies
      env:
        # Makes `uv sync` install into ./venv.
        UV_PROJECT_ENVIRONMENT: ./venv
      run: |
        uv venv ${UV_PROJECT_ENVIRONMENT} --system-site-packages
        source ./venv/bin/activate
        export PATH="./bin/:$PATH"
        uv sync --link-mode copy --locked --group linting

    - name: Run import-linter
      run: |
        source ./venv/bin/activate
        # UV_PROJECT_ENVIRONMENT is not set in this step, so without --active
        # uv would ignore the activated ./venv and fall back to its default
        # project environment. --active matches the ruff job's invocation.
        uv run --active lint-imports --debug --verbose --no-cache
# Single status gate for both linting jobs: always runs, and fails unless
# `linting` and `import_linting` both finished with result "success".
Nemo_Linting_Test:
  if: always()
  needs: [linting, import_linting]
  runs-on: ubuntu-latest
  steps:
    - name: Main
      env:
        # Each `needs.<job>` object is serialized to JSON and read with jq below.
        LINTING_RESULT: ${{ toJson(needs.linting) }}
        IMPORT_LINTING_RESULT: ${{ toJson(needs.import_linting) }}
      run: |
        LINTING=$(jq -r '.result' <<< "$LINTING_RESULT")
        IMPORT_LINTING=$(jq -r '.result' <<< "$IMPORT_LINTING_RESULT")

        # Anything other than two successes is reported and fails the gate.
        if [[ "$LINTING" != "success" || "$IMPORT_LINTING" != "success" ]]; then
          echo "Some linting checks failed:"
          echo " - linting: $LINTING"
          echo " - import_linting: $IMPORT_LINTING"
          exit 1
        fi

        echo "All linting checks passed."
        exit 0
# Placeholder job bound to the `test` environment. It is skipped for CI
# workloads, deployment workflows and docs-only changes.
# NOTE(review): presumably the `test` environment carries the protection rule
# that makes runs wait here - confirm in the repository settings.
cicd-wait-in-queue:
  needs: [pre-flight]
  runs-on: ubuntu-latest
  environment: test
  # Runs only when none of the three pre-flight flags is 'true'.
  if: |
    needs.pre-flight.outputs.is_ci_workload != 'true'
    && needs.pre-flight.outputs.is_deployment_workflow != 'true'
    && needs.pre-flight.outputs.docs_only != 'true'
  steps:
    - name: Running CI tests
      run: |
        echo "Running CI tests"
# Builds the test container through the repository-local build-container
# action, on the `<runner_prefix>-gpu-x2` runner chosen by pre-flight.
cicd-container-build:
  needs: [pre-flight, cicd-wait-in-queue]
  runs-on: ${{ needs.pre-flight.outputs.runner_prefix }}-gpu-x2
  environment: nemo-ci
  # Runs when the upstream jobs succeeded, or when pre-flight flags the run as
  # a CI workload or force_run_all; never when the run was cancelled.
  if: |
    (
      success()
      || needs.pre-flight.outputs.is_ci_workload == 'true'
      || needs.pre-flight.outputs.force_run_all == 'true'
    )
    && !cancelled()
  steps:
    - name: Checkout
      uses: actions/checkout@v4

    - name: main
      uses: ./.github/actions/build-container
      with:
        repo-name: "Automodel"
        dockerfile-path: "./docker/Dockerfile"
        PAT: ${{ secrets.PAT }}
        # Azure credentials forwarded to the action.
        has-azure-credentials: "true"
        azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
        azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
        azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
# L0 unit tests: one matrix entry on a CPU runner, one on the
# `<runner_prefix>-gpu-x2` runner, both run through the test-template action
# against the container image tagged with the current commit SHA.
cicd-unit-tests:
  name: ${{ matrix.test-name }}
  needs: [pre-flight, cicd-wait-in-queue, cicd-container-build]
  runs-on: ${{ matrix.runner }}
  environment: nemo-ci
  # Runs when the upstream jobs succeeded, or when pre-flight flags the run as
  # a CI workload or force_run_all; never when the run was cancelled.
  if: |
    (
      success()
      || needs.pre-flight.outputs.is_ci_workload == 'true'
      || needs.pre-flight.outputs.force_run_all == 'true'
    )
    && !cancelled()
  strategy:
    fail-fast: false
    max-parallel: 5
    matrix:
      include:
        # No timeout here: the step below falls back to 10.
        - test-name: L0_Unit_Tests_CPU
          runner: linux-amd64-cpu16
          cpu-only: true
        - test-name: L0_Unit_Tests_GPU
          runner: ${{ needs.pre-flight.outputs.runner_prefix }}-gpu-x2
          cpu-only: false
          timeout: 30
  steps:
    - name: Checkout
      uses: actions/checkout@v4

    - name: main
      uses: ./.github/actions/test-template
      with:
        runner: ${{ matrix.runner }}
        container-image: ${{ env.container-registry }}/automodel:${{ github.sha }}
        test-data-path: ${{ needs.pre-flight.outputs.test_data_path }}
        timeout: ${{ matrix.timeout || 10 }}
        cpu-only: ${{ matrix.cpu-only || false }}
        # NOTE(review): the e2e job passes this input as `is_unit_test`
        # (underscore); confirm which spelling
        # .github/actions/test-template/action.yml declares.
        is-unit-test: "true"
        PAT: ${{ secrets.PAT }}
        # Azure credentials forwarded to the action.
        has-azure-credentials: "true"
        azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
        azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
        azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
# L2 end-to-end tests: one matrix entry per test folder, run through the
# test-template action against the container image tagged with the current
# commit SHA. Starts after the unit tests.
cicd-e2e-tests:
  name: ${{ matrix.test-name }}
  needs: [pre-flight, cicd-unit-tests]
  runs-on: ${{ matrix.runner }}
  environment: nemo-ci
  # Runs when the upstream jobs succeeded, or when pre-flight flags the run as
  # a CI workload or force_run_all; never when the run was cancelled.
  if: |
    (
      success()
      || needs.pre-flight.outputs.is_ci_workload == 'true'
      || needs.pre-flight.outputs.force_run_all == 'true'
    )
    && !cancelled()
  strategy:
    fail-fast: false
    max-parallel: 10
    matrix:
      include:
        # The only entry pinned to a fixed runner label and a fixed data path
        # instead of the values selected by pre-flight.
        - test-name: L2_Pretrain_and_KD
          test-folder: llm_pretrain_and_kd
          runner: self-hosted-nemo
          test-data-path: "/mnt/datadrive/TestData"
          timeout: 20
        - test-name: L2_HF_DCP
          test-folder: hf_dcp
          runner: ${{ needs.pre-flight.outputs.runner_prefix }}-gpu-x2
          test-data-path: ${{ needs.pre-flight.outputs.test_data_path }}
          timeout: 40
        - test-name: L2_HF_PEFT
          test-folder: hf_peft
          runner: ${{ needs.pre-flight.outputs.runner_prefix }}-gpu-x2
          test-data-path: ${{ needs.pre-flight.outputs.test_data_path }}
          timeout: 30
        - test-name: L2_HF_Transformer
          test-folder: hf_transformer
          runner: ${{ needs.pre-flight.outputs.runner_prefix }}-gpu-x2
          test-data-path: ${{ needs.pre-flight.outputs.test_data_path }}
          timeout: 20
        - test-name: L2_HF_Transformer_Finetune
          test-folder: hf_transformer_finetune
          runner: ${{ needs.pre-flight.outputs.runner_prefix }}-gpu-x2
          test-data-path: ${{ needs.pre-flight.outputs.test_data_path }}
          timeout: 40
        - test-name: L2_HF_Transformer_LLM
          test-folder: hf_transformer_llm
          runner: ${{ needs.pre-flight.outputs.runner_prefix }}-gpu-x2
          test-data-path: ${{ needs.pre-flight.outputs.test_data_path }}
          timeout: 40
        - test-name: L2_HF_Transformer_VLM
          test-folder: hf_transformer_vlm
          runner: ${{ needs.pre-flight.outputs.runner_prefix }}-gpu-x2
          test-data-path: ${{ needs.pre-flight.outputs.test_data_path }}
          timeout: 60
        - test-name: L2_Datasets
          test-folder: data
          runner: ${{ needs.pre-flight.outputs.runner_prefix }}-gpu-x2
          test-data-path: ${{ needs.pre-flight.outputs.test_data_path }}
          timeout: 20
  steps:
    - name: Checkout
      uses: actions/checkout@v4

    - name: main
      uses: ./.github/actions/test-template
      with:
        runner: ${{ matrix.runner }}
        container-image: ${{ env.container-registry }}/automodel:${{ github.sha }}
        test-folder: ${{ matrix.test-folder }}
        test-data-path: ${{ matrix.test-data-path }}
        timeout: ${{ matrix.timeout }}
        # NOTE(review): the unit-test job passes this input as `is-unit-test`
        # (hyphens); confirm which spelling
        # .github/actions/test-template/action.yml declares.
        is_unit_test: "false"
        PAT: ${{ secrets.PAT }}
        # Azure credentials forwarded to the action.
        has-azure-credentials: "true"
        azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
        azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
        azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
# Final status gate for the pipeline: counts the jobs of this run that have
# completed with a conclusion other than "success" and fails if there are any,
# unless pre-flight marked the run as docs-only, deployment or CI workload.
Nemo_CICD_Test:
  needs:
    - pre-flight
    - cicd-unit-tests
    - cicd-e2e-tests
  # `always()` makes the parenthesized part true regardless of the flags, so
  # in effect this job runs whenever the run was not cancelled.
  if: |
    (
      needs.pre-flight.outputs.docs_only == 'true'
      || needs.pre-flight.outputs.is_deployment_workflow == 'true'
      || needs.pre-flight.outputs.is_ci_workload == 'true'
      || always()
    )
    && !cancelled()
  runs-on: ubuntu-latest
  # NOTE(review): the steps below only check out the repo and read job results
  # via `gh run view`; write-all looks broader than needed - confirm and narrow.
  permissions: write-all
  steps:
    - name: Checkout
      uses: actions/checkout@v4

    - name: Get workflow result
      id: result
      shell: bash -x -e -u -o pipefail {0}
      env:
        GH_TOKEN: ${{ github.token }}
        RUN_ID: ${{ github.run_id }}
        # 'true' when failed/skipped jobs must not fail this gate.
        SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' || needs.pre-flight.outputs.is_ci_workload == 'true' }}
      run: |
        # Count jobs that completed with any conclusion other than "success".
        if ! FAILED_JOBS=$(gh run view "$RUN_ID" --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length'); then
          # Deliberately best-effort: a failed query is treated as "no
          # failures", but the fallback is assigned explicitly and surfaced
          # as a warning annotation instead of passing silently.
          echo "::warning::Unable to query job results for run $RUN_ID; assuming 0 failed jobs"
          FAILED_JOBS=0
        fi

        if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
          echo "✅ All previous jobs completed successfully"
          exit 0
        else
          echo "❌ Found $FAILED_JOBS failed job(s)"
          # Show which jobs failed
          gh run view "$RUN_ID" --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
          exit 1
        fi
# Posts a successful `codecov/patch` commit status for docs-only or deployment
# runs that are not CI workloads.
# NOTE(review): presumably this satisfies a required codecov check when no
# coverage is produced - confirm against the branch protection rules.
Coverage_Fake:
  needs: [Nemo_CICD_Test, pre-flight]
  runs-on: ubuntu-latest
  environment: nemo-ci
  if: |
    (
      needs.pre-flight.outputs.docs_only == 'true'
      || needs.pre-flight.outputs.is_deployment_workflow == 'true'
    )
    && needs.pre-flight.outputs.is_ci_workload == 'false'
    && !cancelled()
  steps:
    - name: Generate fake coverage report
      uses: actions/github-script@v6
      with:
        # Uses the PAT secret rather than the default workflow token.
        github-token: ${{ secrets.PAT }}
        script: |
          await github.rest.repos.createCommitStatus({
            owner: context.repo.owner,
            repo: context.repo.repo,
            sha: context.sha,
            state: 'success',
            description: 'No code changes - coverage check skipped',
            context: 'codecov/patch'
          });
# Aggregates coverage per flag (unit-test, e2e): downloads the matching
# coverage-<flag>-* artifacts, combines them into one .coverage file, uploads
# it to Codecov and stores the combined file as an artifact.
Coverage:
  needs: [pre-flight, Nemo_CICD_Test]
  runs-on: ubuntu-latest
  # Runs after a successful gate, or for CI workloads as long as nothing
  # failed; never when the run was cancelled.
  if: |
    (
      (needs.pre-flight.outputs.is_ci_workload == 'true' && !failure())
      || success()
    )
    && !cancelled()
  strategy:
    matrix:
      flag: [unit-test, e2e]
  steps:
    - name: Checkout
      uses: actions/checkout@v4

    - name: Download coverage reports of current branch
      uses: actions/download-artifact@v4
      with:
        pattern: coverage-${{ matrix.flag }}-*

    - name: Get total coverage of current branch
      shell: bash -x -e -u -o pipefail {0}
      if: always()
      run: |
        # Quoted so the shell cannot treat [toml] as a glob character class.
        pip install "coverage[toml]"
        ls -al .
        ls -al coverage-*/
        # Pass the data files via the glob itself rather than parsing `ls`.
        coverage combine --keep coverage-*/.coverage
        coverage report -i
        # Removes the downloaded directories; the combined ./.coverage stays.
        rm -rf coverage-*
        ls -al

    - name: Upload coverage reports to Codecov
      uses: codecov/codecov-action@v5
      with:
        token: ${{ secrets.CODECOV_TOKEN }}
        verbose: true
        flags: ${{ matrix.flag }}

    - name: Upload artifacts
      uses: actions/upload-artifact@v4
      with:
        name: coverage-${{ matrix.flag }}-aggregated
        path: |
          .coverage
        # .coverage is a dotfile, so hidden files must be included explicitly.
        include-hidden-files: true