@@ -2,7 +2,7 @@
 
 from __future__ import annotations
 
-from typing import Any, Union
+from typing import Any, Optional, Union
 
 import pytest
 import torch
@@ -15,7 +15,7 @@
 from ..utils import create_new_process_for_each_test
 
 
-def models_list(all: bool):
+def models_list(*, all: bool = True, keywords: Optional[list[str]] = None):
     TEST_MODELS: list[tuple[str, dict[str, Any]]] = [
         ("facebook/opt-125m", {}),
         ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
@@ -32,47 +32,50 @@ def models_list(all: bool):
3232 ("meta-llama/Llama-3.2-1B-Instruct" , {}),
3333 ]
3434
35- if not all :
36- return TEST_MODELS
35+ if all :
36+ if is_quant_method_supported ("aqlm" ):
37+ TEST_MODELS .append (("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf" , {
38+ "quantization" : "aqlm"
39+ }))
40+
41+ # TODO: figure out why this fails.
42+ if False and is_quant_method_supported ("gguf" ): # noqa: SIM223
43+ TEST_MODELS .append (("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF" , {
44+ "quantization" : "gguf"
45+ }))
46+
47+ if is_quant_method_supported ("gptq" ):
48+ TEST_MODELS .append (("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ" , {
49+ "quantization" : "gptq"
50+ }))
51+
52+ if is_quant_method_supported ("gptq_marlin" ):
53+ TEST_MODELS .append (("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ" , {
54+ "quantization" : "gptq_marlin"
55+ }))
3756
38- if is_quant_method_supported ("aqlm" ):
39- TEST_MODELS .append (("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf" , {
40- "quantization" : "aqlm"
41- }))
42-
43- # TODO: figure out why this fails.
44- if False and is_quant_method_supported ("gguf" ): # noqa: SIM223
45- TEST_MODELS .append (("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF" , {
46- "quantization" : "gguf"
47- }))
48-
49- if is_quant_method_supported ("gptq" ):
50- TEST_MODELS .append (("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ" , {
51- "quantization" : "gptq"
52- }))
53-
54- if is_quant_method_supported ("gptq_marlin" ):
55- TEST_MODELS .append (("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ" , {
56- "quantization" : "gptq_marlin"
57- }))
58-
59- if is_quant_method_supported ("gptq_marlin_24" ):
60- TEST_MODELS .append (("alexm-nm/tinyllama-24-marlin24-4bit-g128" , {
61- "quantization" : "gptq_marlin_24"
62- }))
63-
64- if is_quant_method_supported ("marlin" ):
65- TEST_MODELS .append (
66- ("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin" , {
67- "quantization" : "marlin"
57+ if is_quant_method_supported ("gptq_marlin_24" ):
58+ TEST_MODELS .append (("alexm-nm/tinyllama-24-marlin24-4bit-g128" , {
59+ "quantization" : "gptq_marlin_24"
6860 }))
6961
70- if not current_platform .is_rocm () and is_quant_method_supported ("awq" ):
71- TEST_MODELS .append (("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ" , {
72- "quantization" : "AWQ"
73- }))
62+ if is_quant_method_supported ("marlin" ):
63+ TEST_MODELS .append (
64+ ("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin" , {
65+ "quantization" : "marlin"
66+ }))
7467
75- return TEST_MODELS
68+ if not current_platform .is_rocm () and is_quant_method_supported ("awq" ):
69+ TEST_MODELS .append (("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ" , {
70+ "quantization" : "AWQ"
71+ }))
72+
73+ if keywords is None :
74+ return TEST_MODELS
75+
76+ # filter by keywords
77+ pred = lambda model : any (keyword in model [0 ] for keyword in keywords )
78+ return list (filter (pred , TEST_MODELS ))
7679
7780
7881@pytest .mark .parametrize (
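How the new `keywords` filter is meant to behave, per the lambda above: a model entry is kept when any keyword appears as a substring of its name (element 0 of the tuple). A standalone sketch with a trimmed-down list (the names are taken from the diff, the two-entry list is illustrative):

```python
# Standalone sketch of the keyword filter: keep a model entry when any
# keyword is a substring of the model name (element 0 of the tuple).
from typing import Any

TEST_MODELS: list[tuple[str, dict[str, Any]]] = [
    ("facebook/opt-125m", {}),
    ("meta-llama/Llama-3.2-1B-Instruct", {}),
]
keywords = ["opt"]

pred = lambda model: any(keyword in model[0] for keyword in keywords)
assert list(filter(pred, TEST_MODELS)) == [("facebook/opt-125m", {})]
```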
@@ -96,20 +99,30 @@ def test_full_graph(
     run_model(optimization_level, model, model_kwargs)
 
 
+PassConfig = CompilationConfig.PassConfig
+
+
 # TODO(luka) add other supported compilation config scenarios here
 @pytest.mark.parametrize(
-    "compilation_config",
-    # additional compile sizes
+    "compilation_config, model_info",
     [
-        CompilationConfig(level=CompilationLevel.PIECEWISE,
-                          compile_sizes=[1, 2])
+        # additional compile sizes, only some of the models
+        (CompilationConfig(level=CompilationLevel.PIECEWISE,
+                           compile_sizes=[1, 2]), model)
+        for model in models_list(all=False)
+    ] + [
+        # RMSNorm + quant fusion, only 8-bit quant models
+        (CompilationConfig(level=CompilationLevel.PIECEWISE,
+                           custom_ops=["+rms_norm"],
+                           pass_config=PassConfig(enable_fusion=True,
+                                                  enable_noop=True)), model)
+        for model in models_list(keywords=["FP8-dynamic", "quantized.w8a8"])
     ])
 # only test some of the models
-@pytest.mark.parametrize("model_info", models_list(all=False))
 @create_new_process_for_each_test()
 def test_custom_compile_config(
-    model_info: tuple[str, dict[str, Any]],
     compilation_config: CompilationConfig,
+    model_info: tuple[str, dict[str, Any]],
 ):
     model, model_kwargs = model_info
     print(f"MODEL={model}")
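The move from two stacked `@pytest.mark.parametrize` decorators to a single list of `(compilation_config, model_info)` pairs is what allows each config to run against a different model subset: stacked decorators always produce the full cartesian product. A minimal illustration of the difference (test and parameter names are invented for the sketch):

```python
import pytest

# Stacked decorators: every config x every model (4 test cases).
@pytest.mark.parametrize("config", ["a", "b"])
@pytest.mark.parametrize("model", ["m1", "m2"])
def test_product(config: str, model: str):
    ...

# Single combined list: each config paired only with its own model (2 cases).
@pytest.mark.parametrize("config, model", [("a", "m1"), ("b", "m2")])
def test_pairs(config: str, model: str):
    ...
```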