vllm-project · rahul-tuli · Mar 19, 2025 · Mar 18, 2025 · Mar 18, 2025 · Mar 18, 2025
diff --git a/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py b/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py
@@ -5,7 +5,6 @@
 
 from llmcompressor import oneshot
 from llmcompressor.modifiers.obcq import SparseGPTModifier
-from llmcompressor.modifiers.pruning import ConstantPruningModifier
 from llmcompressor.modifiers.quantization import QuantizationModifier
 
 # Configuration
@@ -52,29 +51,23 @@ def get_recipe(fp8_enabled):
     save_dir = MODEL_ID.split("/")[1] + "2of4-sparse"
 
     if fp8_enabled:
-        base_recipe.extend(
-            [
-                QuantizationModifier(
-                    targets=["Linear"],
-                    ignore=["lm_head"],
-                    scheme="FP8_DYNAMIC",
-                ),
-                ConstantPruningModifier(
-                    targets=[
-                        r"re:.*q_proj.weight",
-                        r"re:.*k_proj.weight",
-                        r"re:.*v_proj.weight",
-                        r"re:.*o_proj.weight",
-                        r"re:.*gate_proj.weight",
-                        r"re:.*up_proj.weight",
-                        r"re:.*down_proj.weight",
-                    ],
-                    start=0,
-                ),
-            ]
+        base_recipe.append(
+            QuantizationModifier(
+                targets=["Linear"],
+                ignore=["lm_head"],
+                scheme="FP8_DYNAMIC",
+            )
         )
         save_dir = MODEL_ID.split("/")[1] + "2of4-W8A8-FP8-Dynamic-Per-Token"
 
+    # check that asymmetric quantization is not being used
+    q_scheme = base_recipe[1].scheme
+    if not isinstance(q_scheme, str) and not q_scheme["weights"].symmetric:
+        raise ValueError(
+            "Asymmetric quantization with 2of4 sparsity is not supported by vLLM. "
+            "Please use symmetric quantization"
+        )
+
     return base_recipe, save_dir
 
 

diff --git a/tests/llmcompressor/pytorch/modifiers/pruning/constant/test_pytorch.py b/tests/llmcompressor/pytorch/modifiers/pruning/constant/test_pytorch.py
@@ -74,7 +74,6 @@ def test_constant_pruning_modifier_e2e(model, optimizer):
     state.update(
         model=model,
         optimizer=optimizer(model.parameters(), lr=0.1),
-        start=0,
     )
     modifier = ConstantPruningModifier(
         targets="__ALL_PRUNABLE__",