314 commits
ef48a13
ci: Update golden values dev
ko3n1g Oct 26, 2025
3281c01
ci: Approval gate
ko3n1g Oct 26, 2025
8fe0c3a
ci: Approval bot
ko3n1g Oct 26, 2025
a33936d
ci: Increase time limit for main tests
ko3n1g Oct 26, 2025
5176823
ci: Auto-assign milestone (#1952)
ko3n1g Oct 26, 2025
4b6ba60
ci: Run on push to release branch (#1960) (#1962)
ko3n1g Oct 26, 2025
221747d
[DEV] support split qkv in muon (#1915)
FDecaYed Oct 27, 2025
9069e12
[Dev] feat(moe): Fine-grained activation offloading (#1912)
lhb8125 Oct 27, 2025
a0a1866
ci: Add golden values for gpt_dynamic_inference_tp1_pp1_583m_cuda_gra…
ko3n1g Oct 27, 2025
c9fb78b
ci: Add more golden values
ko3n1g Oct 27, 2025
6f51284
ci: Aggregate throughput
ko3n1g Oct 27, 2025
eb07b69
Update dev branch codeowners (#1963)
chtruong814 Oct 27, 2025
fa384d2
[Dev] JIT for MoE router and preprocess (#1918)
yaox12 Oct 27, 2025
65c8f40
tests: Fix paths for test_cases
ko3n1g Oct 27, 2025
2155c47
Revert "[Dev] feat(moe): Fine-grained activation offloading (#1912)"
ko3n1g Oct 27, 2025
d95e86a
fix: Missing logger (#1966)
ko3n1g Oct 27, 2025
113cefb
ci: Update copyright checker (#1974)
ko3n1g Oct 27, 2025
d9e0806
[Dev] Update symmetric registration interface to sync-up with upstrea…
youngeunkwon0405 Oct 28, 2025
cc33e00
cp: `Megatron-FSDP Expert Parallel (DeepSeek-v3) Support` into `dev` …
chtruong814 Oct 28, 2025
13edb58
Revert "cp: `Megatron-FSDP Expert Parallel (DeepSeek-v3) Support` int…
ko3n1g Oct 28, 2025
c22c2aa
[Was PR1912][Dev] feat(moe): Fine-grained activation offloading (#1969)
lhb8125 Oct 28, 2025
bada8f9
ci(fix): `Run tests` label (#1970) (#2006)
ko3n1g Oct 29, 2025
ccf794e
Renaming golden values (#2020)
lhb8125 Oct 29, 2025
7342f67
Ko3n1g/chore/sync main to dev (#2018)
ko3n1g Oct 29, 2025
0d0f29c
Ko3n1g/fix/golden values (#2037)
ko3n1g Oct 29, 2025
1d1ac73
cp: `Megatron-FSDP Expert Parallel (DeepSeek-v3) Support` into `dev` …
chtruong814 Oct 30, 2025
2c85448
Update golden values due to PR #2007 (#2057)
chtruong814 Oct 31, 2025
402bc50
Add DeepSeek-V3 GB200 NVL72 optimization guide (#2059)
sbhavani Oct 31, 2025
4959f1f
ci(hotfix): Disable flaky tests
ko3n1g Oct 31, 2025
4008d3d
[DEV] feat(MoE): Refactor cuda_graph_scope (#1917)
buptzyb Oct 31, 2025
44539d1
Add mirror to main workflow (#2042)
chtruong814 Oct 31, 2025
41c6e66
ci: Remove cluster specific golden values (#2069)
ko3n1g Oct 31, 2025
9d05926
ci: Disable inference test
ko3n1g Nov 2, 2025
effebd8
[Dev] feat(moe): Support placing MTP layers into standalone stages (#…
BestJuly Nov 2, 2025
1d9edcf
[Dev] A Guide to Reproduce DeepSeek-V3 Pre-training Performance on GB…
yaox12 Nov 3, 2025
eb0783b
[Dev] Minor updates to the guide (#2103)
yaox12 Nov 3, 2025
ad226e4
Revert "[Dev] feat(moe): Support placing MTP layers into standalone s…
ko3n1g Nov 3, 2025
35eeab7
Reapply "[Dev] feat(moe): Support placing MTP layers into standalone …
ko3n1g Nov 3, 2025
21355dd
ci: Disable broken unit
ko3n1g Nov 3, 2025
4facf29
[Dev] Fixes for gpt-oss (#2116)
cuichenx Nov 4, 2025
bd199dc
[Dev] Support LayerWiseDistributedOptimizer with torch_dist checkpoin…
BoxiangW Nov 4, 2025
ee14b5b
[Dev] Nemotron nano v2 vl (#2115)
cuichenx Nov 4, 2025
09abfad
[DEV] remove training dependency from megatron core for fsdp checkpoi…
ananthsub Nov 4, 2025
ec37ae3
[Dev] Fix UT `TestPartialCudaGraph` which incorrectly set MTP in UT (…
BestJuly Nov 4, 2025
f3cbf03
ci: Restore `gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph`
ko3n1g Nov 4, 2025
ecbfe70
chore: Fix autoformatter (#2073) (#2134)
ko3n1g Nov 4, 2025
79e8592
add device and dtype to empty inv_dt init (#2137)
maanug-nv Nov 5, 2025
3c1b98e
Remove DS-V3 doc - draft being updated (#2155)
sbhavani Nov 6, 2025
b2fdd94
[DEV] torch_dist fixes, speed improvements and memory reduction for l…
FDecaYed Nov 6, 2025
6cc224d
[Dev] Fix Qwen3-Next hang on Blackwell, add a flag to control torch.c…
yuzhongw-nvidia Nov 6, 2025
f4bd87e
[dev] Fix cuda graph scope check in `language_model.py` (#2158)
ananthsub Nov 6, 2025
3207c23
[Dev] Remove experimental tags for fused kernels (#2172)
Victarry Nov 10, 2025
ae4cda5
Merge branch 'main' into dev
FDecaYed Nov 11, 2025
3cbe5c6
[DEV] pull changes from main(f150f42e3929f7f2171e3687e67990332a76285b…
ko3n1g Nov 11, 2025
6b59d71
fix(transformer_config): Initialize cuda_graph_scope if not set regar…
cuichenx Nov 11, 2025
442a7f2
[Dev] fix(offloading): Accuracy mismatch when offloading and recomput…
lhb8125 Nov 11, 2025
6b01330
Ko3n1g/chore/update dev release settings (#2099)
ko3n1g Nov 11, 2025
d7d71e0
Merge remote-tracking branch 'github/main' into dev
ko3n1g Nov 11, 2025
56019e6
[DEV] Cherry-pick: M4 + Dist Checkpoint: Replace global parallel stat…
yaoyu-33 Nov 11, 2025
2c2ee22
[Dev] Remove redundant reduce in aux_loss logging (#2094)
BestJuly Nov 12, 2025
b7c1e75
[DEV] Make CUDA graph compatible with FP8 params (tensorwise & blockw…
kunlunl Nov 12, 2025
ca68395
remove workflow
ko3n1g Nov 12, 2025
1d502cd
[Dev] Reduce Overhead in Timers (#2208)
yaox12 Nov 12, 2025
a2048c8
Revert "[DEV] Cherry-pick: M4 + Dist Checkpoint: Replace global paral…
ko3n1g Nov 12, 2025
a2a1c89
[Dev] replay: Cherry-pick: M4 + Dist Checkpoint: Replace global paral…
yaoyu-33 Nov 12, 2025
8427584
[Dev]Revert torch ckpt format change for LayerwiseDistOpt (#2228)
BoxiangW Nov 13, 2025
7020e1f
[Dev] Add more tests for LayerwiseDistOpt with dist_ckpt (#2132)
BoxiangW Nov 13, 2025
693587d
[Dev] Add muon golden value (#2247)
BoxiangW Nov 14, 2025
b55a544
bump deps
ko3n1g Nov 14, 2025
bfbf13f
Merge remote-tracking branch 'github/dev' into ko3n1g/chore/main-to-dev
ko3n1g Nov 14, 2025
658931e
ci: Create weekly dev branch (#2223)
ko3n1g Nov 14, 2025
1211348
[20251111] Ko3n1g/chore/main to dev (#2211)
ko3n1g Nov 14, 2025
71fa2e6
Revert "[20251111] Ko3n1g/chore/main to dev (#2211)" (#2266)
chtruong814 Nov 17, 2025
0bf9ff9
chroe: [Dev]Disable muon test for now (#2275)
BoxiangW Nov 17, 2025
565202f
Fixes of Merge main into dev
ko3n1g Nov 17, 2025
4b78163
Replay: [20251111] Ko3n1g/chore/main to dev (#2267)
ko3n1g Nov 17, 2025
d1a31a3
[Dev] MuonClip support (non-split version) on dev branch (#2194)
BoxiangW Nov 18, 2025
7968d5f
[dev] Add assertion for mxfp8 params without dp overlap (#2270)
kunlunl Nov 18, 2025
d09482c
[DEV] Save memory using main_param for moe in param_l2_norm (#2234)
BestJuly Nov 18, 2025
7da6e5b
[DEV][NVFP4] Fix NVFP4 Selective Activation Recompute (#2036)
zhongbozhu Nov 18, 2025
ca4c03e
[DEV] Fix aux loss scale when cp enabled (#2217)
Victarry Nov 18, 2025
157bec9
[Community][Dev] feat(moe): Adding context parallel support to eager …
nrailg Nov 18, 2025
5c1d294
[HOT FIX] Fix bug of hybrid-ep backend in flex-dispatcher (#2287)
Autumn1998 Nov 18, 2025
2782acf
Ko3n1g/ci/golden values weeklies (#2279)
ko3n1g Nov 18, 2025
dc9a38d
[DEV] Add support of fake distributed process group (#2254)
Victarry Nov 18, 2025
a8fc591
Cherrypick CI changes between 20251111 - 20251118 (#2292)
ko3n1g Nov 18, 2025
d547462
[DEV] Update emerging optimizers (#2261)
skyw Nov 18, 2025
056ebc5
ci(hotfix): Do not run on main/dev
ko3n1g Nov 18, 2025
5880674
[dev] ci(moe): Add a functional test case for Qwen3Next-specific feat…
yuzhongw-nvidia Nov 19, 2025
a4fce1d
[DEV] fix layerwise torch_dist checkpointing fails due to empty rank …
FDecaYed Nov 20, 2025
c6e2b29
[Dev] fix(megatron-fsdp): Resolve hang caused by non-deterministic re…
xuwchen Nov 20, 2025
c6f277a
ci: Disable flaky unit test (#2338)
ko3n1g Nov 20, 2025
716bb4a
feat: check: api backwards compatibility [dev] (#2341)
pablo-garay Nov 20, 2025
7b8e39e
Revert "[Dev] fix(megatron-fsdp): Resolve hang caused by non-determin…
ko3n1g Nov 20, 2025
cb88c6e
ci: Upload to testpypi only on main (#2342) (#2343)
ko3n1g Nov 21, 2025
c241d0c
Reapply "[Dev] fix(megatron-fsdp): Resolve hang caused by non-determi…
ko3n1g Nov 21, 2025
31f5049
feat: required check adjustment (#2349)
pablo-garay Nov 21, 2025
56682f8
[DEV] pull main Nov 25 (#2395)
FDecaYed Nov 28, 2025
b9c48ec
adding action for checking whether PR author is nvidia employee or no…
theothermike Nov 25, 2025
3aa0c4e
fix: exit failure when PR author is external contributor removed (#2410)
theothermike Nov 26, 2025
b750bdb
fix: adding k8s taints for ephermeral jobs (#2420)
theothermike Nov 27, 2025
c12909b
ci: Enable functional tests (#2419)
ko3n1g Nov 27, 2025
44933d7
Reapply "build: Upgrade deps (NVIDIA#2289)" (#2408)
ko3n1g Nov 27, 2025
98c64b2
fix: use a script to do node tainting in the cicd workflow (#2421)
theothermike Nov 27, 2025
c8fb49e
cp: CI changes until 20251128 (#2426)
ko3n1g Nov 28, 2025
03150b4
Revert "[DEV] pull main Nov 25 (#2395)"
ko3n1g Nov 28, 2025
6ca67bc
[Dev] Support packed seq in MTP (#2043)
BestJuly Dec 1, 2025
11caf01
Fix runaway Etpt in straggler detector by resetting FLOPs accumulator…
sbhavani Dec 1, 2025
92c8482
[Dev] feat(MoE): Refactor cuda_graph_scope - part2 (#2353)
buptzyb Dec 1, 2025
b0c96b3
[dev] DeepSeek V3.2 support (#2154)
kunlunl Dec 1, 2025
71357e2
Revert "[Dev] feat(MoE): Refactor cuda_graph_scope - part2 (#2353)"
ko3n1g Dec 1, 2025
fdcb0a4
Replay "[Dev] feat(MoE): Refactor cuda_graph_scope - part2 (#2353)" (…
buptzyb Dec 2, 2025
14b19b1
[Dev] Optimize TE cudagraph input memory (#2391)
buptzyb Dec 2, 2025
b0f5746
Fix HSDP Registering Device Mesh (#2388)
tomlifu Dec 2, 2025
5375ad4
fix: update baseline (#2468)
pablo-garay Dec 2, 2025
79660b7
fix: Add merge_group support with pre-flight pattern (#2469)
pablo-garay Dec 2, 2025
d72b218
DeepSeek V3 FSDP Fix for Precision-Aware Optimizer (#2204)
tomlifu Dec 3, 2025
436065a
[Dev] fix(moe): minor refactor for fine-grained activation offloading…
lhb8125 Dec 3, 2025
a4bee49
[Dev] feat: m4 leftover changes (#2226)
yaoyu-33 Dec 4, 2025
ad5a222
feat: add decorator: experimental_api (#2546)
pablo-garay Dec 4, 2025
7d17116
feat: API compat: ignore AttributeChangedValueBreakage (not a signatu…
pablo-garay Dec 4, 2025
274e04d
[Dev] Hybrid Data x Context Parallelism Feature (#2054)
parthmannan Dec 4, 2025
87ac13d
update API compat check baseline to 274e04d (#2548)
pablo-garay Dec 4, 2025
f0c1b55
feat: mcore trigger mbridge (#2340) (#2552)
pablo-garay Dec 5, 2025
8de5a7f
[Dev] Optimize TE CUDA Graph capturing time (#2483)
buptzyb Dec 5, 2025
1f08ceb
[Dev] Feature: linear cross entropy fusion (#2256)
Jianbing-D Dec 5, 2025
9cf6838
Fix gpt_layer_spec for frequently linear attention (#2481)
yuzhongw-nvidia Dec 5, 2025
89fe895
Skip trainloader when `args.skip_train` is True (#2501)
Niccolo-Ajroldi Dec 5, 2025
a6d86a6
[DEV] fixes for muon(qwen3-next, ep multi-adam) (#2564)
FDecaYed Dec 5, 2025
aee4a74
[Dev] remove fp16 assert in moe_grouped_gemm & EP (#2494)
HaochenYuan Dec 8, 2025
dfe4da2
Update tp support in muon (#2385)
skyw Dec 8, 2025
1d462bd
[DEV] Update GitHub MoE functional test cases (#2449)
Victarry Dec 8, 2025
23e092f
Fix: don't enter branch if mtp_num_layers == 0 (#2581)
rj42 Dec 9, 2025
c60d5c2
[Dev] fix(moe): Support HybridEP and reduce memory overhead for 1F1B …
lhb8125 Dec 10, 2025
4db2f11
Merge branch 'main' into dev
FDecaYed Dec 10, 2025
ed804b4
[dev] pull main 1201 (#2448)
ko3n1g Dec 11, 2025
2d398b4
chore: Bump baseline (#2626)
ko3n1g Dec 11, 2025
e8a9275
[Dev] Use the latest Hybrid-EP (#2424)
Autumn1998 Dec 12, 2025
305957a
API compat: ignore ParameterMovedBreakage for __init__ methods (#2649)
pablo-garay Dec 12, 2025
e93814b
[training migration] add training config dataclass and arg generation…
maanug-nv Dec 16, 2025
288b8ea
[Dev] Optimize TE CUDA Graph _get_sample_arguments() Time (#2568)
buptzyb Dec 17, 2025
0eec631
Reopen qwen3next functional test in lightweight mode (#2493)
yuzhongw-nvidia Dec 17, 2025
2ebff67
[Dev] Fix CUDA RNG Tracker (#2640)
buptzyb Dec 17, 2025
368e580
[Dev] Mark API backwards compatibility checks as OPTIONAL (non-blocki…
pablo-garay Dec 17, 2025
3714d81
[Dev] FP8 params support for megatron-fsdp (MXFP8/Blockwise) (#2086)
kunlunl Dec 18, 2025
a935008
[Dev] Feat(moe): Gated delta net context parallel (CP) (#2614)
yuzhongw-nvidia Dec 19, 2025
fd932c9
ci: Gridify test configs (#2707)
ko3n1g Dec 19, 2025
2b1fc70
Revert "[dev] Add assertion for mxfp8 params without dp overlap (#2270)"
ko3n1g Dec 22, 2025
4665be4
Revert "[Dev] Use the latest Hybrid-EP (#2424)" (#2732)
ko3n1g Dec 22, 2025
46b5505
[Dev] Fix ep overlap missing final layernorm (#2691)
Wohox Dec 23, 2025
0b6714e
[Dev] Remove calculation of padding token in moe routing loss (#2121)
HaochenYuan Dec 24, 2025
1068d77
Revert "[Dev] Remove calculation of padding token in moe routing loss…
chtruong814 Dec 24, 2025
9885ddb
[Dev] Disable ep overlap memory optimization (#2750)
Wohox Dec 30, 2025
14c35dc
Merge branch 'main' into dev
FDecaYed Dec 30, 2025
929e77f
feat: Cherry-pick PR of PR!2661 for dev branch (#2757)
youngeunkwon0405 Dec 30, 2025
b361561
Merge branch 'dev' into deyuf/dev_pull_main_1217_test
FDecaYed Dec 31, 2025
922e8e9
cp: Allow disabling external contributors (#2784) (#2786)
chtruong814 Dec 31, 2025
5455f0a
build: Pin down `nvidia-nvshmem-cu13` (#2798)
ko3n1g Jan 3, 2026
71d5c84
[dev] Fix bug of reuse_grad_buf_for_mxfp8_param_ag (#2801)
kunlunl Jan 5, 2026
8b93e0d
[Dev] Partial CUDA Graph support for EP Overlap (#2168)
Wohox Jan 5, 2026
c1045f6
Revert "[Dev] FP8 params support for megatron-fsdp (MXFP8/Blockwise) …
ko3n1g Jan 5, 2026
bd06945
Revert "[Dev] Partial CUDA Graph support for EP Overlap (#2168)"
ko3n1g Jan 5, 2026
29ffe43
Merge branch 'dev' into deyuf/dev_pull_main_1217_test
FDecaYed Jan 5, 2026
d8464fc
PR for testing pull main 1217 (#2716)
ko3n1g Jan 5, 2026
dfa6cc1
[Dev] Remove calculation of padding token in moe routing loss (#2754)
HaochenYuan Jan 6, 2026
5823534
[dev] Reapply fsdp mxfp8 (#2828)
kunlunl Jan 6, 2026
1ec0beb
[Dev] Partial CUDA Graph support for EP Overlap (#2810)
Wohox Jan 6, 2026
0bc4114
[Dev] fix EP Overlap Partial Cuda Graph Unit Test hang issue (#2838)
Wohox Jan 7, 2026
28c586e
build: Bump jet-client (#2877)
ko3n1g Jan 8, 2026
46d1f47
FP8 attention knob for nvFP4 recipe (#2818)
vasunvidia Jan 9, 2026
ed6ebff
[DEV][NVFP4][MOE] 128 Zero Padding for Grouped Quantization kernels a…
zhongbozhu Jan 9, 2026
ebe7079
Add check for full_iteration scope before instantiating CudaGraphMana…
vasunvidia Jan 9, 2026
736da3c
Reapply "[Dev] Use the latest Hybrid-EP (#2423)" (#2867)
ko3n1g Jan 9, 2026
9d741cf
build: Main dependency bump for 26.02 (#2682)
ko3n1g Jan 12, 2026
de866fa
ci(fix): Update golden values (#2921)
ko3n1g Jan 13, 2026
ae3dbc0
ci(hotfix): Re-add `gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8…
ko3n1g Jan 13, 2026
583dd58
ci: Skip broken tests after dependency update (#2935)
chtruong814 Jan 13, 2026
b0a702b
Cherry-pick optimizer override refactor from #2723 (#2835)
yaoyu-33 Jan 14, 2026
1964d39
ci(hotfix): Disable gpt_grpo_tp1_pp1_dp8_583m_throughputtest
ko3n1g Jan 14, 2026
383505c
[dev]: ci: Onboard GB200 (#2922)
ko3n1g Jan 14, 2026
ab3ae8a
ci(hotfix): Repair recipe
ko3n1g Jan 14, 2026
dce8e88
Fix clip_qk for virtual pipeline size > 1 (#2776)
juntaowww Jan 15, 2026
748ab80
ci(hotfix): GB200 to nightly
ko3n1g Jan 15, 2026
a32b198
ci(fix): GB200 racecondition (#2962)
ko3n1g Jan 15, 2026
7c6c4e9
Revert "ci(fix): GB200 racecondition (#2962)"
ko3n1g Jan 15, 2026
619115a
ci: Fix GB200 change (#2969) (#2974)
ko3n1g Jan 16, 2026
b395016
[Dev] TE cudagraph recompute (#2694)
buptzyb Jan 16, 2026
b927e1f
[Dev] docs(megatron-fsdp): add Megatron-FSDP user guide (#2397)
xuwchen Jan 16, 2026
6b157e0
[Dev] Optimizer State and Master Weight Offloading (#2760)
hxbai Jan 16, 2026
8ac3a9f
Revert "[Dev] Optimizer State and Master Weight Offloading (#2760)" (…
ko3n1g Jan 16, 2026
bd8411c
Forced load imbalance (#2917)
nanz-nv Jan 19, 2026
0a2e01f
[Dev] [Reapply] Optimizer State and Master Weight Offloading (#2987)
hxbai Jan 19, 2026
8abc086
ci(fix): CI_COMMIT_BRANCH on forks (#2982) (#2989)
ko3n1g Jan 19, 2026
5b17f19
[Dev] Update MoE readme. (#2808)
Victarry Jan 19, 2026
9ea50a9
feat: add routing replay for Mcore (#2693)
litianjian Jan 20, 2026
ac9f665
[dev] feat(moe): Support apply wd to qk layernorm for Qwen3-Next (#2825)
yuzhongw-nvidia Jan 21, 2026
6e2153b
[dev] feat(moe): Cherry-pick #1989 back to dev (#3011)
yuzhongw-nvidia Jan 21, 2026
68e5fec
[Dev]feat(moe): code refactor for fine grained activation offloading …
lhb8125 Jan 22, 2026
6807df4
[Dev] [fix] Bug fix for offloading in evaluate() (#3041)
lhb8125 Jan 22, 2026
b3bba3f
ci: Log node name (#3081) (#3082)
ko3n1g Jan 26, 2026
a4e3fb3
[dev] pull main 260122 (#3045)
FDecaYed Jan 27, 2026
420aa6a
ci: Skip test_precision_aware_optimizer (#3062)
thomasdhc Jan 23, 2026
da56650
Merge branch 'main' into deyuf/dev_pull_main_260122_fix_git
FDecaYed Jan 27, 2026
08357d8
[dev] fix git history for dev pull main 260122 (#3094)
ko3n1g Jan 27, 2026
0f82f05
[dev] fixes for pull main 260122 (#3103)
FDecaYed Jan 28, 2026
0ceb698
ci: Disable broken test (#3121)
ko3n1g Jan 28, 2026
f6f2abe
[Dev] Param offset in _ParamAndGradBucket should be aligned (#3010)
BestJuly Jan 29, 2026
d587dd1
[Dev] fix cg missing wgrad hook (#2999)
Wohox Jan 29, 2026
8f8f735
[Megatron-FSDP] Add fsdp_all_gather_in_start_param_sync option in DDP…
shjwudp Jan 29, 2026
bde9e32
[Dev] Support EP with HSDP (#2800)
wplf Jan 29, 2026
27fcfb2
Cherrypick CI improvements to dev branch (#3118)
ko3n1g Jan 29, 2026
a9fb6c8
Merge branch 'main' into deyuf/dev_pull_main_260130
FDecaYed Jan 30, 2026
55e3a0a
[dev] ci: Add DSv3 proxy (#3144)
ko3n1g Jan 30, 2026
a78ae49
[dev] ci: Fix DSv3 (#3187)
ko3n1g Jan 31, 2026
9375be4
Fix: nccl-ub in ddp path (#3181)
youngeunkwon0405 Feb 1, 2026
0f73a8a
[dev] perf(moe): Refine gated delta net implementation (#3040)
yuzhongw-nvidia Feb 2, 2026
5035cbe
[Dev] Add the missing part to support 1F1B overlap for Qwen3-Next (#2…
BestJuly Feb 2, 2026
4aac3fe
Use the latest hybrid-ep (#3092)
Autumn1998 Feb 2, 2026
bfa1d31
[BUG FIX] Try to enable cuda graph ut (#3192)
Autumn1998 Feb 2, 2026
13ad653
[Dev] Fix Linear-Cross-Entropy Convergence Issue (#2739)
shjwudp Feb 3, 2026
b8b8662
Revert "[Dev] Fix Linear-Cross-Entropy Convergence Issue (#2739)" (#3…
chtruong814 Feb 3, 2026
2ab74ab
Fix missing PackedSeqParams import (#3215)
parthmannan Feb 3, 2026
20e8ac8
fix merge main issues
FDecaYed Jan 30, 2026
77b5a3d
[dev] pull main 260130 (#3166)
ko3n1g Feb 3, 2026
c5b282b
ci(hotfix): Pin uv (#3233) (#3234)
ko3n1g Feb 3, 2026
8a29fd5
[DEV] Reapply fix Linear CE Fusion (#3226)
shjwudp Feb 4, 2026
dd17acc
Missing import fix (#3242)
parthmannan Feb 4, 2026
fa5bcf6
[Dev] Fix EP Overlap Bugs for Full-Iter CG (#3163)
Wohox Feb 4, 2026
a592819
[Refactor] Decouple topk and loss from DSA Indexer (#3013)
laixinn Feb 4, 2026
54f4feb
cp: Fix uv install for GH actions (#3259) (#3261)
chtruong814 Feb 5, 2026
ef336ca
[Dev] Fix EP Overlap missing record stream for shared expert (#3244)
Wohox Feb 5, 2026
ec94d63
Restore missing linear-cross-entropy option accidentally removed from…
shjwudp Feb 6, 2026
500e080
Fix reload_model_params failure when loading MoE models with explicit…
eternally-z Feb 9, 2026
433c169
ci: Disable moe20 tests (#3312)
ko3n1g Feb 9, 2026
fd4801e
ci: Pin down setuptools to lt 82 (#3316)
ko3n1g Feb 9, 2026
52eabf0
[None][Fix] Prevent resource leak warnings (#3216)
IanBoyanZhang Feb 10, 2026
c0030d6
[Dev] Fix backward dw dependency (#3338)
Wohox Feb 10, 2026
2c2e749
ci: Rely exclusively on GitHub CI (#3341)
ko3n1g Feb 10, 2026
98f6f81
[dev] ci: skip queue in merge-gate (#3344)
ko3n1g Feb 10, 2026
28b130f
Revert "[None][Fix] Prevent resource leak warnings (#3216)" (#3366)
ko3n1g Feb 11, 2026
e868e8f
ci: Fix dev branch merge queue (#3397)
chtruong814 Feb 13, 2026
c4b910f
[Dev] Add Qwen3-VL support with Megatron-FSDP (#2842)
xuwchen Feb 13, 2026
6059f36
Add absorbed-mla (#3193)
kunlunl Feb 13, 2026
9f2ca96
cp: Remove gpu sanity check (#3420) into dev (#3421)
chtruong814 Feb 13, 2026
1dcf0da
[dev] ci: Fix merge queue (#3385)
ko3n1g Feb 14, 2026
cd1c215
[dev] `cp: Cherrypick CI changes` (#3543)
ko3n1g Feb 23, 2026
aa86018
[Dev] Fix MoE aux loss tracker hang with MTP enabled (#3400)
Victarry Feb 25, 2026
2b4b9c4
ci: Remove multi-approval action from dev branch (#3576)
chtruong814 Feb 25, 2026
bf3cdb1
support GDN packed sequence
yuzhongw-nvidia Dec 12, 2025
e94395d
Fix several bugs
yuzhongw-nvidia Jan 22, 2026
55 changes: 5 additions & 50 deletions .github/CODEOWNERS
@@ -1,59 +1,14 @@
megatron/core/ @NVIDIA/core-adlr @NVIDIA/core-nemo

megatron/core/models/gpt/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/gpt

megatron/core/models/multimodal/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/multi-modal

megatron/core/models/mamba/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/hybrid-mamba
megatron/core/ssm/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/hybrid-mamba

megatron/core/datasets/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/datasets

megatron/core/distributed/fsdp/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/megatron-fsdp

megatron/core/transformer/fsdp_dtensor_checkpoint.py @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/megatron-fsdp

megatron/core/dist_checkpointing/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/dist-checkpointing

megatron/core/optimizer/distrib_optimizer/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/dist-optimizer

megatron/core/inference/modelopt_support @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/quantization-and-inference

megatron/core/datasets/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/datasets

megatron/core/pipeline_parallel/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/pipeline-parallelism

megatron/core/transformer/ @NVIDIA/core-adlr @NVIDIA/core-nemo

megatron/core/transformer/moe/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/mixture-of-experts-adlr @NVIDIA/mixture-of-experts-devtech

megatron/core/inference/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/inference

megatron/core/parallel_state.py @NVIDIA/core-adlr @NVIDIA/core-nemo

megatron/core/post_training/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/post-training

megatron/post_training/ @NVIDIA/post-training
* @NVIDIA/core-nemo @NVIDIA/core-devtech

megatron/core/transformer/cuda_graphs.py @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/cuda-graphs

.gitlab/ @NVIDIA/ci
.github/ @NVIDIA/ci
.gitlab-ci.yml @NVIDIA/ci
docker/ @NVIDIA/ci
tests/unit_tests/run_ci_test.sh @NVIDIA/ci
tests/test_utils/python_scripts/
tests/functional_tests/python_test_utils/ @NVIDIA/ci
tests/functional_tests/shell_test_utils/ @NVIDIA/ci
tests/test_utils/recipes/ @NVIDIA/ci
tests/unit_tests/run_ci_test.sh @NVIDIA/ci

# API Backwards Compatibility Check
scripts/check_api_backwards_compatibility.py @NVIDIA/ci @pablo-garay
scripts/README_API_COMPAT.md @NVIDIA/ci @pablo-garay
.github/workflows/check_api_backwards_compatibility_workflow.yml @NVIDIA/ci @pablo-garay
docs/api-backwards-compatibility-check.md @NVIDIA/ci @pablo-garay
tests/unit_tests/test_api_backwards_compat_setup.py @NVIDIA/ci @pablo-garay

megatron/rl/ @NVIDIA/reinforcement-learning
examples/rl/ @NVIDIA/reinforcement-learning
test/unit_tests/test_rl_utils.py @NVIDIA/reinforcement-learning
train_rl.py @NVIDIA/reinforcement-learning
pyproject.toml @NVIDIA/ci
uv.lock @NVIDIA/ci
55 changes: 17 additions & 38 deletions .github/actions/action.yml
@@ -48,46 +48,16 @@ inputs:
is_ci_workload:
description: "Is CI workload"
required: true

is_merge_group:
description: "Is merge group"
required: true
runs:
using: "composite"
steps:
- name: Print node name
shell: bash -x -e -u -o pipefail {0}
run: echo "node_name=$NODE_NAME" | tee -a "$GITHUB_OUTPUT"

- name: GPU Sanity Check
shell: bash -x -e -u -o pipefail {0}
run: |
echo "Starting GPU Sanity Check..."

# 1. Check for active Compute Processes
# query-compute-apps returns a list of PIDs using the GPU. If empty, we are good.
OPEN_PROCESSES=$(docker run --rm --gpus all ubuntu nvidia-smi --query-compute-apps=pid,process_name --format=csv,noheader)

if [ -n "$OPEN_PROCESSES" ]; then
echo "::error::❌ GPU is not clean! Found active processes:"
echo "$OPEN_PROCESSES"
else
echo "✅ No active compute processes found."
fi

# 2. Check VRAM Usage (Optional but recommended)
# We allow a small buffer (e.g., < 300MiB) for driver overhead/Xorg,
# though on headless K8s nodes this should be very close to 0.

MEMORY_USAGES=$(docker run --rm --gpus all ubuntu nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits)

# Check each GPU visible to the container
for MEMORY in $MEMORY_USAGES; do
if [ "$MEMORY" -gt 300 ]; then
echo "::error::❌ GPU VRAM usage is suspiciously high: ${MEMORY} MiB"
fi
done

echo "✅ GPU Memory is clean (all < 300 MiB)."
echo "Ready to start workflow."

- name: Checkout repository
uses: actions/checkout@v2

@@ -117,8 +87,10 @@ runs:
export PYTHONPATH=$(pwd)
export NEMORUN_HOME=$(pwd)
export NCCL_DEBUG=INFO
pip install --no-cache-dir uv
uv sync --only-group test
pip install --no-cache-dir "uv<0.9.29"
uv venv .venv
uv cache clean
uv sync --no-cache --only-group test
uv run python tests/test_utils/python_scripts/launch_nemo_run_workload.py \
--scope unit-tests \
--model unit-tests \
@@ -177,7 +149,12 @@ runs:
#!/bin/bash
set -euxo pipefail

if [ "${{ steps.has-run-tests-label.outputs.main }}" == "true" ]; then
if [ "${{ inputs.is_merge_group }}" == "true" ]; then
ARGS=(
--scope mr-github
--n-repeat 1
)
elif [ "${{ steps.has-run-tests-label.outputs.main }}" == "true" ]; then
ARGS=(
--scope mr-github
--enable-lightweight-mode
@@ -197,8 +174,10 @@ runs:

export PYTHONPATH=$(pwd)
export NEMORUN_HOME=$(pwd)
pip install --no-cache-dir uv
uv sync --only-group test
pip install --no-cache-dir "uv<0.9.29"
uv venv .venv
uv cache clean
uv sync --no-cache --only-group test
uv run python tests/test_utils/python_scripts/launch_nemo_run_workload.py \
${ARGS[@]} \
--model ${{ inputs.model }} \
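The new `is_merge_group` branch above decides which functional-test arguments are passed before launching the workload. A standalone sketch of the two visible branches of that selection (the function name is illustrative, and the remaining branches are truncated in the diff, so the fallback is left empty):

```shell
# Mirrors the visible branches of the composite action's scope selection:
# merge-group runs get a single full repeat, while PRs carrying the
# `Run tests` label run in lightweight mode.
select_scope_args() {
  local is_merge_group="$1" has_run_tests_label="$2"
  if [ "$is_merge_group" == "true" ]; then
    echo "--scope mr-github --n-repeat 1"
  elif [ "$has_run_tests_label" == "true" ]; then
    echo "--scope mr-github --enable-lightweight-mode"
  else
    # Remaining branches are elided in the diff above.
    echo ""
  fi
}
```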
2 changes: 1 addition & 1 deletion .github/copy-pr-bot.yaml
@@ -1,4 +1,4 @@
enabled: true
auto_sync_draft: false
auto_sync_ready: true
trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "ChenhanYu", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "Phlip79", "QiZhangNV", "ShriyaRishab", "Victarry", "Wohox", "ZhiyuLi-Nvidia", "ahmadki", "aklife97", "ananthsub", "asolergi-nv", "buptzyb", "chtruong814", "cspades", "cuichenx", "deepakn94", "dimapihtar", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "frsun-nvda", "gautham-kollu", "gdengk", "guyueh1", "hxbai", "jalbericiola", "janEbert", "jaredcasper", "jenchen13", "jiemingz", "jingqiny-99", "jkamalu", "jon-barker", "jstjohn", "kanz-nv", "kevalmorabia97", "ko3n1g", "kunlunl", "kvareddy", "kwyss-nvidia", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mehraakash", "mkhona-nvidia", "parthmannan", "prajwal1210", "pthombre", "rogerwaleffe", "sanandaraj5597", "sancha", "santhnm2", "sbak5", "shanmugamr1992", "shifangx", "shjwudp", "sidsingh-nvidia", "skyw", "sudhakarsingh27", "tdene", "theothermike", "thomasdhc", "trintamaki", "tylerpoon", "wdykas", "xiaoyao0115", "xuwchen", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yeyu-nvidia", "yobibyte", "youngeunkwon0405", "yuzhongw-nvidia", "zhongbozhu"]
trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "ChenhanYu", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "Phlip79", "QiZhangNV", "RPrenger", "ShriyaRishab", "Victarry", "Wohox", "ZhiyuLi-Nvidia", "ahmadki", "aklife97", "ananthsub", "asolergi-nv", "buptzyb", "chtruong814", "cspades", "cuichenx", "deepakn94", "dimapihtar", "dingqingy-nv", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "frsun-nvda", "gautham-kollu", "gdengk", "guyueh1", "hxbai", "ilml", "jalbericiola", "janEbert", "jaredcasper", "jenchen13", "jiemingz", "jingqiny-99", "jkamalu", "jon-barker", "jstjohn", "kanz-nv", "kevalmorabia97", "ko3n1g", "kunlunl", "kvareddy", "kwyss-nvidia", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mehraakash", "mkhona-nvidia", "parthmannan", "prajwal1210", "pthombre", "rogerwaleffe", "sajadn", "sanandaraj5597", "sancha", "santhnm2", "sbak5", "shanmugamr1992", "sharathts", "shengf-nv", "shifangx", "shjwudp", "sidsingh-nvidia", "skyw", "sudhakarsingh27", "tdene", "theothermike", "thomasdhc", "trintamaki", "tylerpoon", "wdykas", "xiaoyao0115", "xuwchen", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yeyu-nvidia", "yobibyte", "youngeunkwon0405", "yueshen2016", "yuzhongw-nvidia", "zhongbozhu"]
24 changes: 12 additions & 12 deletions .github/oncall_schedule.json
@@ -1,18 +1,6 @@
[
{
"user": "dimapihtar",
"date": "2026-01-28"
},
{
"user": "gautham-kollu",
"date": "2026-02-04"
},
{
"user": "janEbert",
"date": "2026-02-11"
},
{
"user": "Phlip79",
"date": "2026-02-18"
},
{
@@ -46,5 +34,17 @@
{
"user": "BoxiangW",
"date": "2026-04-15"
},
{
"user": "Phlip79",
"date": "2026-04-22"
},
{
"user": "asolergi-nv",
"date": "2026-04-29"
},
{
"user": "dimapihtar",
"date": "2026-05-06"
}
]
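The schedule above is a weekly rotation keyed by start date. Assuming each entry takes effect on its listed date, the active on-call is the last entry whose date is on or before today; since ISO-8601 dates compare correctly as strings, plain lexicographic comparison suffices. A sketch (the function and the flattened `user date` input format are illustrative, not part of the repository):

```shell
# Resolves the active on-call from date-sorted "user date" pairs using
# string comparison, which is valid for ISO-8601 dates.
current_oncall() {
  local today="$1" user="" u d
  while read -r u d; do
    [ -n "$u" ] || continue
    if [[ ! "$d" > "$today" ]]; then  # d <= today
      user="$u"
    fi
  done <<< "$2"
  echo "$user"
}

# Sample entries mirroring the tail of the schedule above.
schedule='BoxiangW 2026-04-15
Phlip79 2026-04-22
asolergi-nv 2026-04-29'
```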
65 changes: 65 additions & 0 deletions .github/scripts/readme.sh
@@ -0,0 +1,65 @@
#!/bin/bash

cat << 'EOF'
╔══════════════════════════════════════════════════════════════════════╗
║ ║
║ ███╗ ███╗██████╗ ██████╗ ██╗██████╗ ██████╗ ███████╗ ║
║ ████╗ ████║██╔══██╗██╔══██╗██║██╔══██╗██╔════╝ ██╔════╝ ║
║ ██╔████╔██║██████╔╝██████╔╝██║██║ ██║██║ ███╗█████╗ ║
║ ██║╚██╔╝██║██╔══██╗██╔══██╗██║██║ ██║██║ ██║██╔══╝ ║
║ ██║ ╚═╝ ██║██████╔╝██║ ██║██║██████╔╝╚██████╔╝███████╗ ║
║ ╚═╝ ╚═╝╚═════╝ ╚═╝ ╚═╝╚═╝╚═════╝ ╚═════╝ ╚══════╝ ║
║ ║
║ H O W T O : M B R I D G E T E S T I N G ║
╚══════════════════════════════════════════════════════════════════════╝

MBridge unit tests run automatically on every PR. To also trigger
functional tests, attach the label and re-run the workflow step.

┌─────────────────────────────────────────────────────────────────┐
│ DEFAULT │ Unit tests run on every PR (no action needed) │
├─────────────────────────────────────────────────────────────────┤
│ │
│ Every PR ──► cicd-mbridge-testing ──► unit tests only │
│ │
└─────────────────────────────────────────────────────────────────┘

┌─────────────────────────────────────────────────────────────────┐
│ STEP 1 │ Attach the label to your PR (for functional tests) │
├─────────────────────────────────────────────────────────────────┤
│ │
│ PR Labels ──► [ + Add label ] ──► "Run MBridge tests" │
│ │
└─────────────────────────────────────────────────────────────────┘

┌─────────────────────────────────────────────────────────────────┐
│ STEP 2 │ Re-run this workflow step │
├─────────────────────────────────────────────────────────────────┤
│ │
│ Actions ──► [ Re-run jobs ] ──► Re-run failed jobs │
│ │
└─────────────────────────────────────────────────────────────────┘

┌─────────────────────────────────────────────────────────────────┐
│ RESULT │ Unit + functional tests run! │
├─────────────────────────────────────────────────────────────────┤
│ │
│ cicd-mbridge-testing ◄── unit + functional tests │
│ │
│ Tests run against MBridge using the merge commit │
│ SHA of your pull request. │
│ │
└─────────────────────────────────────────────────────────────────┘

┌────────────────────────────────────┐
│ Label present? NO → unit │
│ Label present? YES → unit + │
│ functional│
└────────────────────────────────────┘

NOTE: The label must be present BEFORE the re-run is triggered.
The CI checks for "Run MBridge tests" at runtime.

NOTE: All MBridge test results are optional — failures do not
block merging your PR.
EOF
5 changes: 1 addition & 4 deletions .github/workflows/_build_test_publish_wheel.yml
@@ -17,8 +17,6 @@ on:
type: boolean
default: true
secrets:
TWINE_USERNAME:
required: true
TWINE_PASSWORD:
required: true

@@ -147,7 +145,6 @@ jobs:
needs: [build-and-test-wheels]
runs-on: ubuntu-latest
if: inputs.no-publish == false
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) && 'main' || 'public' }}
strategy:
fail-fast: false
matrix:
@@ -170,7 +167,7 @@

- name: Publish wheels
env:
TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
TWINE_USERNAME: __token__
TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
TWINE_REPOSITORY: ${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) && 'pypi' || 'testpypi' }}
PLATFORM: ${{ matrix.PLATFORM }}
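The `TWINE_REPOSITORY` expression above routes uploads by branch: pushes to `main` and release (`r*`) branches publish to PyPI, everything else goes to TestPyPI. A bash sketch of that selection (the function name is an assumption for illustration):

```shell
# Mirrors the workflow expression:
# (ref == 'refs/heads/main' || startsWith(ref, 'refs/heads/r'))
#   ? 'pypi' : 'testpypi'
twine_repository() {
  case "$1" in
    refs/heads/main) echo "pypi" ;;
    refs/heads/r*)   echo "pypi" ;;
    *)               echo "testpypi" ;;
  esac
}
```

Together with the hard-coded `TWINE_USERNAME: __token__` change, this moves the workflow to token-based authentication while keeping only the password in secrets.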