NVIDIA-NeMo
diff --git a/‎.coderabbit.yaml‎
Lines changed: 102 additions & 0 deletions b/‎.coderabbit.yaml‎
Lines changed: 102 additions & 0 deletions
diff --git a/‎.dockerignore‎
Lines changed: 0 additions & 1 deletion b/‎.dockerignore‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎.github/actions/test-template/action.yml‎
Lines changed: 1 addition & 0 deletions b/‎.github/actions/test-template/action.yml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.github/workflows/build-test-publish-wheel.yml‎
Lines changed: 56 additions & 50 deletions b/‎.github/workflows/build-test-publish-wheel.yml‎
Lines changed: 56 additions & 50 deletions
diff --git a/‎.github/workflows/cicd-main.yml‎
Lines changed: 16 additions & 14 deletions b/‎.github/workflows/cicd-main.yml‎
Lines changed: 16 additions & 14 deletions
@@ -0,0 +1,102 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# yaml-language-server: $schema=https://coderabbit.ai/integrations/schema.v2.json
+# https://docs.coderabbit.ai/getting-started/configure-coderabbit/
+# Validator https://docs.coderabbit.ai/configuration/yaml-validator#yaml-validator
+# In PR, comment "@coderabbitai configuration" to get the full config including defaults
+# Set the language for reviews by using the corresponding ISO language code.
+# Default: "en-US"
+language: "en-US"
+# Settings related to reviews.
+# Default: {}
+reviews:
+  # Set the profile for reviews. The assertive profile yields more feedback, which may be considered nitpicky.
+  # Options: chill, assertive
+  # Default: "chill"
+  profile: chill
+  # Add this keyword in the PR/MR title to auto-generate the title.
+  # Default: "@coderabbitai"
+  auto_title_placeholder: '@coderabbitai title'
+  # Auto Title Instructions - Custom instructions for auto-generating the PR/MR title.
+  # Default: ""
+  auto_title_instructions: 'Format: "[{modules}] {type}: {description}". Modules: model, recipe, training, data, ckpt, peft, perf, ci, doc, test, build, misc. Use comma to separate multiple modules. Type must be one of: feat, fix, refactor, chore, test. Title should be concise (<= 80 chars). Example: "[model] feat: Add Qwen3 model bridge" or "[recipe, doc] feat: Add Llama 3.1 70B recipe".'
+  # Set the commit status to 'pending' when the review is in progress and 'success' when it is complete.
+  # Default: true
+  commit_status: false
+  # Generate walkthrough in a markdown collapsible section.
+  # Default: false
+  collapse_walkthrough: true
+  # Generate an assessment of how well the changes address the linked issues in the walkthrough.
+  # Default: true
+  assess_linked_issues: true
+  # Include possibly related issues in the walkthrough.
+  # Default: true
+  related_issues: true
+  # Related PRs - Include possibly related pull requests in the walkthrough.
+  # Default: true
+  related_prs: true
+  # Suggest labels based on the changes in the pull request in the walkthrough.
+  # Default: true
+  suggested_labels: true
+  # Suggest reviewers based on the changes in the pull request in the walkthrough.
+  # Default: true
+  suggested_reviewers: true
+  # Generate a poem in the walkthrough comment.
+  # Default: true
+  poem: false
+  # Post review details on each review. Additionally, post a review status when a review is skipped in certain cases.
+  # Default: true
+  review_status: false
+  # Configuration for pre merge checks
+  # Default: {}
+  pre_merge_checks:
+    # Custom Pre-merge Checks - Add unique checks to enforce your team's standards before merging a pull request. Each check must have a unique name (up to 50 characters) and clear instructions (up to 10000 characters). Use these to automatically verify coding, security, documentation, or business rules and maintain code quality.
+    # Default: []
+    custom_checks:
+      - name: "Test Results for Major Changes"
+        mode: "warning"  # or "error" to block merges
+        instructions: |
+          If this PR contains major changes (such as new features, breaking changes, or significant refactoring), verify that the PR description includes test results or testing information.
+          If a change could affect numerics or convergence, the PR description should include information demonstrating that there is no regression.
+          If a change could affect performance, the PR description should include before-and-after performance numbers, as well as the configuration and context in which they apply.
+          Pass if test results are documented or if the changes are minor.
+  # Configuration for auto review
+  # Default: {}
+  auto_review:
+    # Automatic Incremental Review - Automatic incremental code review on each push
+    # Default: true
+    auto_incremental_review: false
+    # Review draft PRs/MRs.
+    # Default: false
+    drafts: false
+    # Base branches (other than the default branch) to review. Accepts regex patterns. Use '.*' to match all branches.
+    # Default: []
+    base_branches: ["main", "r[0-9].*"]
+# Configuration for knowledge base
+# Default: {}
+knowledge_base:
+  # Code Guidelines - CodeRabbit will analyse and learn from your organization's code guidelines, which you can mention in the file patterns section. These guidelines will then be used to conduct thorough code reviews.
+  # Default: {}
+  code_guidelines:
+    # Enabled - Enable CodeRabbit to enforce your organization's coding standards during reviews.
+    # Default: true
+    enabled: true
+    # File Patterns - Specify files for your coding guideline documents in this section. CodeRabbit will scan these files to understand your team's standards and apply them during code reviews. Multiple files supported. File names are case-sensitive. Common files like: (**/.cursorrules, .github/copilot-instructions.md, .github/instructions/*.instructions.md, **/CLAUDE.md, **/GEMINI.md, **/.cursor/rules/*, **/.windsurfrules, **/.clinerules/*, **/.rules/*, **/AGENT.md, **/AGENTS.md) are included by default.
+    # Default: []
+    filePatterns:
+      - "**/CODING_GUIDELINES.md"
+      - "**/.cursor/rules/*"
+      - "**/CONTRIBUTING.md"
@@ -14,6 +14,5 @@ nosetests.xml
 coverage.xml
 *,cover
 *.log
-.git
 **/*.nemo
 **/*.ckpt
@@ -132,6 +132,7 @@ runs:
           --runtime=nvidia --gpus all \
           --shm-size=64g \
           --cpus=40 \
+          --env GHA_RUNNER=${{ inputs.runner }} \
           --env HYDRA_FULL_ERROR=1 \
           --env HF_HOME=/home/TestData/HF_HOME \
           --env NEMO_HOME=/home/TestData/nemo_home \
 
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 name: Build, test, and publish a PyPi wheel (to testpypi).
 
 on:
@@ -35,55 +34,62 @@ concurrency:
 
 jobs:
   pre-flight:
-    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.64.2
+    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.69.1
+    with:
+      default_runner_prefix: ${{ vars.DEFAULT_RUNNER_PREFIX }}
+      non_nvidia_runner_prefix: ${{ vars.NON_NVIDIA_RUNNER_PREFIX }}
+      default_test_data_path: ${{ vars.DEFAULT_TEST_DATA_PATH }}
+      non_nvidia_test_data_path: ${{ vars.NON_NVIDIA_TEST_DATA_PATH }}
+    secrets:
+      NVIDIA_MANAGEMENT_ORG_PAT: ${{ secrets.NVIDIA_MANAGEMENT_ORG_PAT }}
 
-  # build-test-publish-wheel:
-  #   needs: [pre-flight]
-  #   if: |
-  #     !(needs.pre-flight.outputs.docs_only == 'true'
-  #     || needs.pre-flight.outputs.is_deployment_workflow == 'true')
-  #   uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_test_publish_wheel.yml@v0.65.1
-  #   with:
-  #     dry-run: true
-  #     python-package: megatron.bridge
-  #     python-version: "3.10"
-  #     packaging: uv
-  #     no-publish: ${{ !(github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) }}
-  #     has-src-dir: true
-  #     skip-test-wheel: true
-  #     custom-container: nvcr.io/nvidia/pytorch:25.05-py3
-  #     runner: self-hosted-nemo
-  #     no-build-isolation: true
-  #     submodules: recursive
-  #     container-options: "--gpus all --runtime=nvidia"
-  #   secrets:
-  #     TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
-  #     TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
-  #     SLACK_WEBHOOK: ${{ secrets.SLACK_RELEASE_ENDPOINT }}
-  #     SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }}
-  #     GH_TOKEN: ${{ secrets.PAT }}
+  # Delegates to the FW-CI-templates `_build_test_publish_wheel.yml` reusable workflow.
+  # NOTE(review): the pre-flight job in this file pins FW-CI-templates at v0.69.1
+  # while this job pins v0.70.1 — confirm the mixed versions are intended.
+  build-test-publish-wheel:
+    needs: [pre-flight]
+    # Skipped when pre-flight reports a docs-only change or a deployment workflow.
+    if: |
+      !(needs.pre-flight.outputs.docs_only == 'true'
+      || needs.pre-flight.outputs.is_deployment_workflow == 'true')
+    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_test_publish_wheel.yml@v0.70.1
+    with:
+      dry-run: true
+      python-package: megatron.bridge
+      # Quoted so YAML keeps the string "3.10" instead of parsing the float 3.1.
+      python-version: "3.10"
+      packaging: uv
+      # `no-publish` is false only on `main` and on branches whose name starts with "r".
+      # NOTE(review): the prefix test also matches non-release names such as
+      # `refactor/...` — confirm the workflow triggers exclude those branches.
+      no-publish: ${{ !(github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) }}
+      has-src-dir: true
+      skip-test-wheel: true
+      custom-container: nvcr.io/nvidia/pytorch:25.11-py3
+      # Runner label = prefix selected by pre-flight + "-gpu-x2-container".
+      runner: ${{ needs.pre-flight.outputs.runner_prefix }}-gpu-x2-container
+      no-build-isolation: true
+      submodules: recursive
+      container-options: "--gpus all --runtime=nvidia"
+    secrets:
+      TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
+      TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_RELEASE_ENDPOINT }}
+      SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }}
+      GH_TOKEN: ${{ secrets.PAT }}
 
-  # build-test-publish-wheel-summary:
-  #   needs: [pre-flight, build-test-publish-wheel]
-  #   if: |
-  #     (
-  #       needs.pre-flight.outputs.docs_only == 'true'
-  #       || needs.pre-flight.outputs.is_deployment_workflow == 'true'
-  #       || always()
-  #     )
-  #     && !cancelled()
-  #   runs-on: ubuntu-latest
-  #   steps:
-  #     - name: Result
-  #       run: |
-  #         FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || echo 0
+  # Gate job: fails when any completed job of this run did not conclude with
+  # "success", unless skipping is explicitly allowed (docs-only / deployment runs).
+  build-test-publish-wheel-summary:
+    needs: [pre-flight, build-test-publish-wheel]
+    # `always()` stops GitHub from auto-skipping this job when an upstream job
+    # fails or is skipped; `!cancelled()` still lets a cancelled run end here.
+    if: |
+      (
+        needs.pre-flight.outputs.docs_only == 'true'
+        || needs.pre-flight.outputs.is_deployment_workflow == 'true'
+        || always()
+      )
+      && !cancelled()
+    runs-on: ubuntu-latest
+    steps:
+      - name: Result
+        env:
+          # The GitHub CLI needs a token in the environment inside Actions; without
+          # it every `gh` call below fails and the gate silently reports success.
+          GH_TOKEN: ${{ github.token }}
+          # The wheel job is skipped on docs-only / deployment runs, and a skipped
+          # job has a non-"success" conclusion, so those runs must be allowed.
+          SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' }}
+        run: |
+          # Best effort: if the query itself fails, fall back to 0 by assignment
+          # (the former `|| echo 0` only printed a zero and left the variable empty).
+          FAILED_JOBS=$(gh run view "$GITHUB_RUN_ID" --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || FAILED_JOBS=0
 
-  #         if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
-  #             echo "✅ All previous jobs completed successfully"
-  #             exit 0
-  #         else
-  #             echo "❌ Found $FAILED_JOBS failed job(s)"
-  #             # Show which jobs failed
-  #             gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
-  #             exit 1
-  #         fi
+          if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
+              echo "✅ All previous jobs completed successfully"
+              exit 0
+          else
+              echo "❌ Found $FAILED_JOBS failed job(s)"
+              # Show which jobs failed
+              gh run view "$GITHUB_RUN_ID" --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
+              exit 1
+          fi
@@ -25,32 +25,32 @@ on:
   workflow_dispatch:
     inputs:
       mcore_commit:
-        description: 'MCore commit SHA to test against'
+        description: "MCore commit SHA to test against"
         required: false
         type: string
       mcore_branch:
-        description: 'MCore branch name (for reference)'
+        description: "MCore branch name (for reference)"
         required: false
         type: string
       mcore_repo:
-        description: 'MCore repository URL (for fetching from forks)'
+        description: "MCore repository URL (for fetching from forks)"
         required: false
         type: string
-        default: 'https://github.com/NVIDIA/Megatron-LM.git'
+        default: "https://github.com/NVIDIA/Megatron-LM.git"
       test_suite:
-        description: 'Test suite to run'
+        description: "Test suite to run"
         required: false
         type: choice
         options:
-          - 'all'
-          - 'unit-only'
-          - 'functional-only'
-        default: 'all'
+          - "all"
+          - "unit-only"
+          - "functional-only"
+        default: "all"
       triggered_by:
-        description: 'Trigger source (for tracking)'
+        description: "Trigger source (for tracking)"
         required: false
         type: string
-        default: 'manual'
+        default: "manual"
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event.label.name || 'main' }}-${{ github.event_name }}
@@ -378,7 +378,7 @@ jobs:
       matrix:
         include:
           - script: L2_Launch_training
-            timeout: 40
+            timeout: 50
           - script: L2_Launch_converter
           - script: L2_Launch_models_deepseek
           - script: L2_Launch_models_gemma
@@ -393,19 +393,21 @@ jobs:
           - script: L2_Launch_models_nemotron_vl
           - script: L2_Launch_models_olmoe
           - script: L2_Launch_models_qwen
-          - script: L2_Launch_models_qwen_quantization
+          # - script: L2_Launch_models_qwen_quantization
           - script: L2_Launch_models_qwen_vl
+          - script: L2_Launch_recipes_gemma_vl
+          - script: L2_Launch_recipes_gpt_oss
           - script: L2_Launch_recipes_llama_1b
           - script: L2_Launch_recipes_llama_3b
           - script: L2_Launch_recipes_llama_distill
-          - script: L2_Launch_recipes_mamba
           - script: L2_Launch_recipes_nemotronh
           - script: L2_Launch_recipes_qwen
           - script: L2_Launch_data
           - script: L2_Launch_post_training_quantization
           - script: L2_Launch_quantization_aware_training
           - script: L2_Launch_quantization_export
           - script: L2_Launch_recipes_llama_cuda_graphs
+          - script: L2_Launch_utils
     needs: [pre-flight, cicd-unit-tests]
     runs-on: ${{ needs.pre-flight.outputs.runner_prefix }}-gpu-x2
     if: |