
Commit a6d0dec

Merge branch 'main' into chenhany/fix_eagle3_multi_layer_hook
2 parents 18a61e8 + 0d279f1 commit a6d0dec

File tree

5 files changed: +39 -49 lines changed

.github/workflows/gpu_tests.yml
.github/workflows/unit_tests.yml
modelopt/torch/prune/plugins/mcore_minitron.py
modelopt/torch/speculative/plugins/megatron_eagle.py
tests/gpu/torch/quantization/backends/test_gemm_common.py

.github/workflows/gpu_tests.yml

Lines changed: 21 additions & 1 deletion
@@ -22,20 +22,31 @@ jobs:
       any_changed: ${{ steps.changed-tests.outputs.any_changed }}
     steps:
       - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
       - id: get-pr-info
         uses: nv-gha-runners/get-pr-info@main
+      # Get commit from main branch that is present in the PR to use as base for changed files
+      - id: calculate-merge-base
+        env:
+          PR_SHA: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.sha }}
+          BASE_SHA: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}
+        run: |
+          (echo -n "merge-base="; git merge-base "$BASE_SHA" "$PR_SHA") | tee --append "${GITHUB_OUTPUT}"
       - name: Check for changes in test-relevant directories
         id: changed-tests
         uses: step-security/[email protected]
         with:
+          base_sha: ${{ steps.calculate-merge-base.outputs.merge-base }}
+          sha: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.sha }}
           files: |
             .github/workflows/gpu_tests.yml
             modelopt/**
             tests/gpu/**
             tox.ini
             pyproject.toml
             setup.py
-          base_sha: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.ref }}
+          fail_on_initial_diff_error: true
   wait-checks:
     needs: [check-file-changes]
     if: needs.check-file-changes.outputs.any_changed == 'true'

@@ -70,3 +81,12 @@ jobs:
     timeout-minutes: 90
     container: *gpu_container
     steps: *gpu_steps
+  gpu-pr-required-check:
+    # Run even if gpu-tests-pr is skipped
+    if: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && always() }}
+    needs: [check-file-changes, gpu-tests-pr]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Required GPU tests did not succeed
+        if: ${{ needs.check-file-changes.result != 'success' || (needs.check-file-changes.outputs.any_changed == 'true' && needs.gpu-tests-pr.result != 'success') }}
+        run: exit 1
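
Why the new step: the changed-files action previously diffed against the PR's base.ref, which drifts as main advances; the calculate-merge-base step instead uses the merge base, the last main commit already contained in the PR (as the inline comment notes), which is why the checkout now fetches full history. A minimal Python sketch of the same lookup, assuming git is on PATH and the code runs inside a clone with full history; the refs in the example call are placeholders:

import subprocess

def merge_base(base_ref: str, head_ref: str) -> str:
    """Return the most recent common ancestor of two refs, as the workflow's
    git merge-base "$BASE_SHA" "$PR_SHA" step does."""
    result = subprocess.run(
        ["git", "merge-base", base_ref, head_ref],
        check=True, capture_output=True, text=True,
    )
    return result.stdout.strip()

# Placeholder refs; the workflow passes the PR's base and head SHAs instead
# and appends the result to $GITHUB_OUTPUT as "merge-base=<sha>".
print(f"merge-base={merge_base('origin/main', 'HEAD')}")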

.github/workflows/unit_tests.yml

Lines changed: 6 additions & 0 deletions
@@ -126,3 +126,9 @@ jobs:
           python-version: "3.12"
       - name: Run unit tests
         run: pip install tox && tox -e py312-partial-unit-${{ matrix.test-env }}
+  unit-pr-required-check:
+    if: github.event_name == 'pull_request'
+    needs: [linux, windows, multi-py, multi-torch, multi-transformers, partial-install]
+    runs-on: ubuntu-latest
+    steps:
+      - run: echo "All PR unit test jobs completed"

modelopt/torch/prune/plugins/mcore_minitron.py

Lines changed: 1 addition & 43 deletions
@@ -59,38 +59,6 @@
 }


-def get_supported_models():
-    """Get the supported models for Minitron pruning.
-
-    NOTE: Keep inside function to avoid circular import issues.
-    """
-    supported_models = set()
-
-    try:
-        from megatron.core.models.gpt import GPTModel
-
-        supported_models.add(GPTModel)
-    except Exception:
-        pass
-
-    try:
-        from megatron.core.models.mamba import MambaModel
-
-        supported_models.add(MambaModel)
-    except Exception:
-        pass
-
-    try:
-        from nemo.collections import llm
-
-        # NOTE: llm.MambaModel is a subclass of llm.GPTModel
-        supported_models.add(llm.GPTModel)
-    except Exception:
-        pass
-
-    return supported_models
-
-
 class MCoreMinitronSearcher(BaseSearcher):
     """Searcher for Minitron pruning algorithm."""


@@ -158,17 +126,6 @@ def before_search(self) -> None:
     def run_search(self) -> None:
         """Run actual search."""
         # Run forward loop to collect activations and sort parameters
-        model_cfg = None
-        supported_models = get_supported_models()
-        for m_type in supported_models:
-            if isinstance(self.model, m_type):
-                model_cfg = self.model.config
-                break
-        if model_cfg is None:
-            raise NotImplementedError(
-                f"Only {supported_models} models are supported! Got: {type(self.model)}"
-            )
-
         assert self.forward_loop is not None
         is_training = self.model.training
         self.model.eval()

@@ -187,6 +144,7 @@ def run_search(self) -> None:
             hp.active = export_config[hp_name]

         # kv_channels can be None so we need to save original from original hidden_size and num_attention_heads
+        model_cfg = self.model.config
         orig_kv_channels = getattr(model_cfg, "kv_channels")
         if orig_kv_channels is None:
             orig_kv_channels = getattr(model_cfg, "hidden_size") // getattr(
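
Net effect of this file's change: run_search no longer gates on a hand-maintained set of supported model classes; it reads self.model.config directly and, per the comment above, falls back to hidden_size // num_attention_heads when kv_channels is unset. A small self-contained sketch of that fallback, using a SimpleNamespace stand-in for a real Megatron/NeMo model config (the num_attention_heads continuation is inferred from the comment, since the hunk is cut off mid-call):

from types import SimpleNamespace

def resolve_kv_channels(model_cfg) -> int:
    """Mirror the fallback in run_search: derive kv_channels when the config leaves it None."""
    orig_kv_channels = getattr(model_cfg, "kv_channels")
    if orig_kv_channels is None:
        orig_kv_channels = getattr(model_cfg, "hidden_size") // getattr(
            model_cfg, "num_attention_heads"
        )
    return orig_kv_channels

cfg = SimpleNamespace(kv_channels=None, hidden_size=4096, num_attention_heads=32)
assert resolve_kv_channels(cfg) == 128  # 4096 // 32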

modelopt/torch/speculative/plugins/megatron_eagle.py

Lines changed: 1 addition & 1 deletion
@@ -90,7 +90,7 @@ def dict_to_config(
         fp16=fp16,
         bf16=bf16,
         params_dtype=getattr(torch, architecture_config["torch_dtype"]),
-        pipeline_dtype=None,
+        pipeline_dtype=getattr(torch, architecture_config["torch_dtype"]),
         num_layers=architecture_config.get("num_hidden_layers"),
         hidden_size=architecture_config.get("hidden_size"),
         ffn_hidden_size=architecture_config.get("intermediate_size"),
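
The one-line fix makes pipeline_dtype follow the checkpoint's torch_dtype string, matching params_dtype instead of staying None. A quick sketch of that string-to-dtype lookup; the architecture_config dict below is a hypothetical stand-in for the config the function receives:

import torch

architecture_config = {"torch_dtype": "bfloat16"}  # hypothetical HF-style config entry
dtype = getattr(torch, architecture_config["torch_dtype"])
assert dtype is torch.bfloat16
# After this commit, dict_to_config passes this same value for both params_dtype and pipeline_dtype.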

tests/gpu/torch/quantization/backends/test_gemm_common.py

Lines changed: 10 additions & 4 deletions
@@ -29,6 +29,12 @@
 set_seed()


+@pytest.fixture(autouse=True)
+def setup_seed():
+    """Set seed before each test function."""
+    set_seed()
+
+
 @pytest.mark.parametrize(
     ("config", "gemm_forward", "atol", "rtol"),
     [

@@ -257,9 +263,9 @@ def forward_loop(model, run_backward=False):

     # The way the compression of the weights and inputs might be different.
     # E.g. we may use torch.compile in the gemms.
-    assert torch.allclose(output_dynamic_quant_gemm, output_dynamic_quant, atol=atol / 3)
-    assert torch.allclose(output_calib_quant_gemm, output_calib_quant, atol=atol / 3)
+    assert torch.allclose(output_dynamic_quant_gemm, output_dynamic_quant, atol=atol / 2)
+    assert torch.allclose(output_calib_quant_gemm, output_calib_quant, atol=atol / 2)
     assert torch.allclose(
-        output_dynamic_quant_gemm, output_dynamic_quant_compressed, atol=atol / 3
+        output_dynamic_quant_gemm, output_dynamic_quant_compressed, atol=atol / 2
     )
-    assert torch.allclose(output_calib_quant_gemm, output_calib_quant_compressed, atol=atol / 3)
+    assert torch.allclose(output_calib_quant_gemm, output_calib_quant_compressed, atol=atol / 2)
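
Two adjustments in this test file: an autouse fixture now reseeds before every test instead of relying only on the single module-level set_seed() call, and the GEMM-vs-reference tolerances are loosened from atol / 3 to atol / 2. A standalone sketch of the autouse pattern, with torch.manual_seed standing in for the test suite's set_seed helper:

import pytest
import torch

@pytest.fixture(autouse=True)
def setup_seed():
    """Runs before every test in the module, so each test starts from the same RNG state."""
    torch.manual_seed(0)  # stand-in for the repo's set_seed()

def test_reseeded_draws_match():
    x = torch.randn(4)   # produced from the fixture's seed
    torch.manual_seed(0)
    y = torch.randn(4)   # identical draw after reseeding manually
    assert torch.allclose(x, y)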
