
Commit ab4e6d9

Merge branch 'main' into cjluo-nv-patch-1
2 parents: 501d8f0 + b233ad1


7 files changed (+108, -49 lines)


.github/CODEOWNERS

Lines changed: 21 additions & 21 deletions
@@ -32,24 +32,24 @@ modelopt/torch/utils @NVIDIA/modelopt-torch-utils-codeowners
 # Examples
 /docker @NVIDIA/modelopt-docker-codeowners
 /README.md @NVIDIA/modelopt-examples-codeowners
-examples @NVIDIA/modelopt-examples-codeowners
-examples/chained_optimizations @NVIDIA/modelopt-torch-nas-prune-codeowners
-examples/cnn_qat @NVIDIA/modelopt-examples-cnn_qat-codeowners
-examples/deepseek @NVIDIA/modelopt-deploy-codeowners
-examples/diffusers @NVIDIA/modelopt-examples-diffusers-codeowners
-examples/gpt-oss @NVIDIA/modelopt-examples-gpt-oss-codeowners
-examples/llm_autodeploy @NVIDIA/modelopt-deploy-codeowners
-examples/llm_distill @NVIDIA/modelopt-torch-distill-codeowners
-examples/llm_eval @NVIDIA/modelopt-examples-llm_ptq-codeowners
-examples/llm_ptq @NVIDIA/modelopt-examples-llm_ptq-codeowners
-examples/llm_qat @NVIDIA/modelopt-examples-llm_qat-codeowners
-examples/llm_sparsity @NVIDIA/modelopt-torch-sparsity-codeowners
-examples/megatron-lm @NVIDIA/modelopt-examples-megatron-codeowners
-examples/model_hub @NVIDIA/modelopt-examples-model_hub-codeowners
-examples/nemo_run @NVIDIA/modelopt-examples-megatron-codeowners
-examples/onnx_ptq @NVIDIA/modelopt-onnx-codeowners
-examples/pruning @NVIDIA/modelopt-torch-nas-prune-codeowners
-examples/speculative_decoding @NVIDIA/modelopt-torch-speculative-codeowners
-examples/vlm_eval @NVIDIA/modelopt-examples-vlm-codeowners
-examples/vlm_ptq @NVIDIA/modelopt-examples-vlm-codeowners
-examples/windows @NVIDIA/modelopt-windows-codeowners
+/examples @NVIDIA/modelopt-examples-codeowners
+/examples/chained_optimizations @NVIDIA/modelopt-torch-nas-prune-codeowners
+/examples/cnn_qat @NVIDIA/modelopt-examples-cnn_qat-codeowners
+/examples/deepseek @NVIDIA/modelopt-deploy-codeowners
+/examples/diffusers @NVIDIA/modelopt-examples-diffusers-codeowners
+/examples/gpt-oss @NVIDIA/modelopt-examples-gpt-oss-codeowners
+/examples/llm_autodeploy @NVIDIA/modelopt-deploy-codeowners
+/examples/llm_distill @NVIDIA/modelopt-torch-distill-codeowners
+/examples/llm_eval @NVIDIA/modelopt-examples-llm_ptq-codeowners
+/examples/llm_ptq @NVIDIA/modelopt-examples-llm_ptq-codeowners
+/examples/llm_qat @NVIDIA/modelopt-examples-llm_qat-codeowners
+/examples/llm_sparsity @NVIDIA/modelopt-torch-sparsity-codeowners
+/examples/megatron-lm @NVIDIA/modelopt-examples-megatron-codeowners
+/examples/model_hub @NVIDIA/modelopt-examples-model_hub-codeowners
+/examples/nemo_run @NVIDIA/modelopt-examples-megatron-codeowners
+/examples/onnx_ptq @NVIDIA/modelopt-onnx-codeowners
+/examples/pruning @NVIDIA/modelopt-torch-nas-prune-codeowners
+/examples/speculative_decoding @NVIDIA/modelopt-torch-speculative-codeowners
+/examples/vlm_eval @NVIDIA/modelopt-examples-vlm-codeowners
+/examples/vlm_ptq @NVIDIA/modelopt-examples-vlm-codeowners
+/examples/windows @NVIDIA/modelopt-windows-codeowners
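Note: the only change in this file is the leading "/" on each examples entry. CODEOWNERS uses gitignore-style matching, so a bare "examples" pattern can match a directory of that name at any depth, while "/examples" is anchored to the repository root. A rough Python illustration of the difference (simplified sketch, not GitHub's actual matcher and not part of this commit):

    def owns(pattern: str, path: str) -> bool:
        # Simplified CODEOWNERS-style matching: a leading "/" anchors the pattern
        # to the repository root; without it, the pattern may match at any depth.
        if pattern.startswith("/"):
            root = pattern[1:]
            return path == root or path.startswith(root + "/")
        parts = path.split("/")
        suffixes = ("/".join(parts[i:]) for i in range(len(parts)))
        return any(s == pattern or s.startswith(pattern + "/") for s in suffixes)

    assert owns("examples", "docs/examples/readme.md")       # unanchored: matches anywhere
    assert not owns("/examples", "docs/examples/readme.md")  # anchored: repository root only
    assert owns("/examples/llm_ptq", "examples/llm_ptq/script.py")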

.github/workflows/gpu_tests.yml

Lines changed: 21 additions & 1 deletion
@@ -22,20 +22,31 @@ jobs:
       any_changed: ${{ steps.changed-tests.outputs.any_changed }}
     steps:
       - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
       - id: get-pr-info
         uses: nv-gha-runners/get-pr-info@main
+      # Get commit from main branch that is present in the PR to use as base for changed files
+      - id: calculate-merge-base
+        env:
+          PR_SHA: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.sha }}
+          BASE_SHA: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}
+        run: |
+          (echo -n "merge-base="; git merge-base "$BASE_SHA" "$PR_SHA") | tee --append "${GITHUB_OUTPUT}"
       - name: Check for changes in test-relevant directories
         id: changed-tests
         uses: step-security/[email protected]
         with:
+          base_sha: ${{ steps.calculate-merge-base.outputs.merge-base }}
+          sha: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.sha }}
           files: |
            .github/workflows/gpu_tests.yml
            modelopt/**
            tests/gpu/**
            tox.ini
            pyproject.toml
            setup.py
-          base_sha: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.ref }}
+          fail_on_initial_diff_error: true
   wait-checks:
     needs: [check-file-changes]
     if: needs.check-file-changes.outputs.any_changed == 'true'
@@ -70,3 +81,12 @@ jobs:
     timeout-minutes: 90
     container: *gpu_container
     steps: *gpu_steps
+  gpu-pr-required-check:
+    # Run even if gpu-tests-pr is skipped
+    if: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && always() }}
+    needs: [check-file-changes, gpu-tests-pr]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Required GPU tests did not succeed
+        if: ${{ needs.check-file-changes.result != 'success' || (needs.check-file-changes.outputs.any_changed == 'true' && needs.gpu-tests-pr.result != 'success') }}
+        run: exit 1
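Two behavioral changes in this workflow: the changed-files step now diffs against the merge-base of the PR head and its base branch (computed from full history via fetch-depth: 0) instead of the base branch ref, and the new gpu-pr-required-check job gives branch protection a single status that still passes when the GPU tests are legitimately skipped. A standalone Python sketch of that gating predicate (illustration only, not part of the workflow):

    def gpu_required_check_fails(check_result: str, any_changed: bool, gpu_result: str) -> bool:
        # Mirrors the `if:` condition on the failing step: fail when change detection
        # itself failed, or when GPU-relevant files changed but the tests did not succeed.
        return check_result != "success" or (any_changed and gpu_result != "success")

    assert not gpu_required_check_fails("success", False, "skipped")  # nothing relevant changed
    assert not gpu_required_check_fails("success", True, "success")   # tests ran and passed
    assert gpu_required_check_fails("success", True, "failure")       # tests ran and failed
    assert gpu_required_check_fails("failure", False, "skipped")      # change detection broke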

.github/workflows/unit_tests.yml

Lines changed: 6 additions & 7 deletions
@@ -4,13 +4,6 @@ name: Unit tests
 on:
   pull_request:
     branches: [main, release/*]
-    paths:
-      - ".github/workflows/unit_tests.yml"
-      - "modelopt/**"
-      - "tests/unit/**"
-      - "pyproject.toml"
-      - "setup.py"
-      - "tox.ini"
   push:
     branches: [main, release/*]
     paths:
@@ -126,3 +119,9 @@ jobs:
           python-version: "3.12"
       - name: Run unit tests
         run: pip install tox && tox -e py312-partial-unit-${{ matrix.test-env }}
+  unit-pr-required-check:
+    if: github.event_name == 'pull_request'
+    needs: [linux, windows, multi-py, multi-torch, multi-transformers, partial-install]
+    runs-on: ubuntu-latest
+    steps:
+      - run: echo "All PR unit test jobs completed"
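With the paths filter removed from the pull_request trigger, the workflow runs on every PR, and unit-pr-required-check becomes the single job to mark as required in branch protection. Because it lists all test jobs in needs: and has no always() condition, it only runs once every one of them has succeeded. A small Python sketch of that needs semantics (illustration only, not part of the workflow):

    def aggregator_job_runs(needed_results: dict[str, str]) -> bool:
        # A job with `needs:` and no `if: always()` is skipped unless every needed
        # job succeeded, so a trivial echo step is enough to serve as the single
        # required status check.
        return all(result == "success" for result in needed_results.values())

    assert aggregator_job_runs({"linux": "success", "windows": "success"})
    assert not aggregator_job_runs({"linux": "success", "windows": "failure"})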

modelopt/onnx/quantization/qdq_utils.py

Lines changed: 9 additions & 4 deletions
@@ -790,8 +790,10 @@ def remove_input_dq_and_output_q(
                 if cons_idx in quantizable_custom_ops[consumer.op_type]["inp"]:
                     consumer.input[cons_idx] = q_node.output[0]
                 else:
-                    q_node_prev = tensor_producers[q_node.input[0]]
-                    consumer.input[cons_idx] = q_node_prev.output[0]
+                    q_node_prev = tensor_producers.get(q_node.input[0], None)
+                    consumer.input[cons_idx] = (
+                        q_node_prev.output[0] if q_node_prev else q_node.input[0]
+                    )
                 break
 
     # Track DequantizeLinear node indices for cleanup
@@ -828,8 +830,11 @@ def remove_input_dq_and_output_q(
             if quantizable_custom_ops[producer.op_type]["out"]:
                 dq_node[0].input[0] = producer.output[0]
             else:
-                dq_node_next = tensor_consumers[dq_node[0].output[0]]
-                dq_node_next[0].input[0] = producer.output[0]
+                dq_node_next = tensor_consumers.get(dq_node[0].output[0], None)
+                if dq_node_next:
+                    dq_node_next[0].input[0] = producer.output[0]
+                else:
+                    dq_node[0].input[0] = producer.output[0]
 
     # Track QuantizeLinear node indices for cleanup
     q_indices.append(node_idx)
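Both hunks swap direct dictionary indexing for .get() with a fallback, so Q/DQ removal no longer raises a KeyError when a QuantizeLinear input has no producer node (e.g. it is a graph input) or a DequantizeLinear output has no consumer (e.g. it is a graph output). A minimal sketch of the pattern with hypothetical names, not the module's real data structures:

    # Hypothetical producer map: tensor name -> name of the node that produces it.
    tensor_producers = {"conv_out": "conv_node"}

    def resolve_rewire_target(tensor_name: str) -> str:
        # Fall back to the tensor itself when it has no producer (a graph input),
        # instead of raising KeyError as plain indexing would.
        producer = tensor_producers.get(tensor_name, None)
        return producer if producer else tensor_name

    assert resolve_rewire_target("conv_out") == "conv_node"       # rewire to the producer
    assert resolve_rewire_target("model_input") == "model_input"  # graph input: keep as-is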

modelopt/onnx/trt_utils.py

Lines changed: 5 additions & 4 deletions
@@ -416,9 +416,10 @@ def interpret_trt_plugins_precision_flag(
             # Will add Q/DQ nodes in the requested I/O indices
             inp_precision_quant = [i for i, p in enumerate(inp_precision) if p in ["int8", "fp8"]]
             out_precision_quant = [i for i, p in enumerate(out_precision) if p in ["int8", "fp8"]]
-            custom_ops_to_quantize[op_type] = {
-                "inp": inp_precision_quant,
-                "out": out_precision_quant,
-            }
+            if inp_precision_quant or out_precision_quant:
+                custom_ops_to_quantize[op_type] = {
+                    "inp": inp_precision_quant,
+                    "out": out_precision_quant,
+                }
 
     return custom_ops_to_cast, custom_ops_to_quantize
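The new guard only registers a custom op for quantization when at least one input or output was requested in int8/fp8, instead of recording entries with two empty index lists. A self-contained sketch of the filtering step with made-up plugin data (illustration only):

    requested = {
        "MyPlugin": {"inp": ["fp16", "fp8"], "out": ["fp16"]},
        "OtherPlugin": {"inp": ["fp16"], "out": ["fp32"]},
    }

    custom_ops_to_quantize = {}
    for op_type, precisions in requested.items():
        inp_quant = [i for i, p in enumerate(precisions["inp"]) if p in ["int8", "fp8"]]
        out_quant = [i for i, p in enumerate(precisions["out"]) if p in ["int8", "fp8"]]
        # Only record the op if there is at least one I/O index to quantize.
        if inp_quant or out_quant:
            custom_ops_to_quantize[op_type] = {"inp": inp_quant, "out": out_quant}

    assert custom_ops_to_quantize == {"MyPlugin": {"inp": [1], "out": []}}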

modelopt/torch/speculative/plugins/megatron_eagle.py

Lines changed: 4 additions & 2 deletions
@@ -524,9 +524,11 @@ def __init__(
         if self._num_aux_hidden_states > 0:
             # Register forward hook to the last EAGLE3 layer to extract the pre-norm hidden_state
             # for eagle3 auto regression.
-            layer = self.decoder.layers[-1]
-            layer.register_forward_hook(self._eagle3_layer_forward_hook)
+            last_layer = self.decoder.layers[-1]
+            last_layer.register_forward_hook(self._eagle3_layer_forward_hook)
 
+            # The first EAGLE3 layer needs to be specialized.
+            layer = self.decoder.layers[0]
             self_attention = layer.self_attention
             if not isinstance(self_attention, SelfAttention):
                 raise ValueError("EAGLE-3 only support SelfAttention (MHA, GQA).")
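The fix keeps the forward hook on the last EAGLE-3 decoder layer (to capture the pre-norm hidden state) while the attention specialization that follows now targets the first layer instead of the last. A generic PyTorch sketch of capturing a module's output with a forward hook, independent of the Megatron classes used here:

    import torch
    import torch.nn as nn

    captured = {}

    def capture_hook(module, inputs, output):
        # Stash the layer's output so a later step can reuse it.
        captured["hidden_state"] = output

    layers = nn.ModuleList([nn.Linear(8, 8) for _ in range(3)])
    layers[-1].register_forward_hook(capture_hook)  # hook only the last layer

    x = torch.randn(2, 8)
    for layer in layers:
        x = layer(x)

    assert torch.equal(captured["hidden_state"], x)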

tests/gpu/torch/speculative/plugins/test_speculative_megatron_modules.py

Lines changed: 42 additions & 10 deletions
@@ -32,6 +32,12 @@
 from modelopt.torch.speculative.plugins.megatron_medusa import _DynamicMedusaGPTModel
 from modelopt.torch.speculative.utils import Tree, get_default_attention_mask_and_position_ids
 
+ALGO_TO_CONFIG = {
+    "eagle1": mtsp.config.EAGLE1_DEFAULT_CFG,
+    "eagle3": mtsp.config.EAGLE3_DEFAULT_CFG,
+    "eagle-mtp": mtsp.config.EAGLE_MTP_DEFAULT_CFG,
+}
+
 
 def _test_speculative_gpt_model(
     algo, num_medusa_heads_or_eagle_layers, activation_func, normalization, rank, size
@@ -64,18 +70,42 @@ def _test_speculative_gpt_model(
 
         # Type checking
         assert isinstance(model, _DynamicMedusaGPTModel)
-    elif algo == "eagle":
-        config = {"eagle_architecture_config": deepcopy(default_eagle_config)}
-        config["eagle_architecture_config"]["hidden_size"] = model.config.hidden_size
-        config["eagle_architecture_config"]["vocab_size"] = model.vocab_size
-        config["eagle_architecture_config"]["draft_vocab_size"] = model.vocab_size
+    elif algo in {"eagle1", "eagle3"}:
+        mtsp_config = ALGO_TO_CONFIG[algo]
+
+        mtsp_config["config"]["eagle_architecture_config"]["num_hidden_layers"] = (
+            num_medusa_heads_or_eagle_layers
+        )
+        mtsp_config["config"]["eagle_architecture_config"]["hidden_size"] = model.config.hidden_size
+        mtsp_config["config"]["eagle_architecture_config"]["vocab_size"] = model.vocab_size
+        mtsp_config["config"]["eagle_architecture_config"]["draft_vocab_size"] = model.vocab_size
 
-        model = mtsp.convert(model, [("eagle", config)])
+        model = mtsp.convert(model, mtsp_config)
 
         # Type checking
         assert isinstance(model, _DynamicEagleGPTModel)
     else:
-        raise ValueError("Only algo={eagle, medusa} are supported!")
+        raise ValueError("Only algo={eagle1, eagle3, medusa} are supported!")
+
+    if algo == "eagle3":
+        first_layer = model.eagle_module.decoder.layers[0]
+        last_layer = model.eagle_module.decoder.layers[-1]
+        # Eagle3 QKV input_dim is 2x of hidden_size
+        assert (
+            first_layer.self_attention.linear_qkv.weight.shape[-1] == model.config.hidden_size * 2
+        )
+        # Eagle3 attention has a forward_pre_hook to handle additional features to be concatenated
+        assert len(first_layer.self_attention._forward_pre_hooks) > 0
+        # Eagle3 last layer has a forward hook to extrat the pre_norm hidden_state
+        assert len(last_layer._forward_hooks) > 0
+    elif algo == "eagle1":
+        first_layer = model.eagle_module.decoder.layers[0]
+        last_layer = model.eagle_module.decoder.layers[-1]
+        # Eagle1 QKV input_dim the same as hidden_size
+        assert first_layer.self_attention.linear_qkv.weight.shape[-1] == model.config.hidden_size
+        # No forward_hook or forward_pre_hook are needed
+        assert len(first_layer.self_attention._forward_pre_hooks) == 0
+        assert len(last_layer._forward_hooks) == 0
 
     # Bfloat16
     model = model.to(torch.bfloat16)
@@ -104,7 +134,7 @@ def _test_speculative_gpt_model(
 
         assert medusa_loss.shape[0] == batch_size
         assert medusa_loss.shape[1] == max_sequence_length
-    elif algo == "eagle":
+    elif algo in {"eagle1", "eagle3"}:
         labels = torch.randint(0, vocab_size, (batch_size, max_sequence_length)).cuda()
         eagle_loss = model(prompt_tokens, position_ids, attention_mask, labels=labels)
 
@@ -115,8 +145,10 @@ def _test_speculative_gpt_model(
 @pytest.mark.parametrize(
     ("algo", "num_medusa_heads_or_eagle_layers", "activation_func", "normalization"),
     [
-        ("eagle", 1, "squared_relu", "LayerNorm"),  # MHA
-        ("eagle", 2, "swiglu", "RMSNorm"),  # GQA
+        ("eagle1", 1, "squared_relu", "LayerNorm"),  # MHA
+        ("eagle1", 2, "swiglu", "RMSNorm"),  # GQA
+        ("eagle3", 1, "swiglu", "RMSNorm"),  # GQA
+        ("eagle3", 2, "swiglu", "RMSNorm"),  # GQA
         ("medusa", 1, "squared_relu", "LayerNorm"),  # MHA
         ("medusa", 2, "swiglu", "RMSNorm"),  # GQA
     ],
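The test now selects a default conversion config from ALGO_TO_CONFIG and overrides the nested "config" -> "eagle_architecture_config" fields before calling mtsp.convert. A minimal, self-contained sketch of that override pattern with a made-up default (copying first so the shared default stays untouched; illustration only, not the commit's code):

    from copy import deepcopy

    # Made-up stand-in for a default conversion config such as EAGLE3_DEFAULT_CFG.
    DEFAULT_CFG = {
        "algorithm": "eagle",
        "config": {"eagle_architecture_config": {"num_hidden_layers": 1, "hidden_size": 0}},
    }

    def build_cfg(num_layers: int, hidden_size: int) -> dict:
        cfg = deepcopy(DEFAULT_CFG)  # copy so repeated test cases do not share mutations
        arch = cfg["config"]["eagle_architecture_config"]
        arch["num_hidden_layers"] = num_layers
        arch["hidden_size"] = hidden_size
        return cfg

    cfg = build_cfg(num_layers=2, hidden_size=256)
    assert cfg["config"]["eagle_architecture_config"]["hidden_size"] == 256
    assert DEFAULT_CFG["config"]["eagle_architecture_config"]["hidden_size"] == 0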
