```diff
@@ -6,20 +6,6 @@ sparsity_stage:
       mask_structure: "2:4"
       targets: ["Linear"]
       ignore: ["re:.*lm_head"]
-finetuning_stage:
-  run_type: train
-  finetuning_modifiers:
-    ConstantPruningModifier:
-      targets: [
-        're:.*q_proj.weight',
-        're:.*k_proj.weight',
-        're:.*v_proj.weight',
-        're:.*o_proj.weight',
-        're:.*gate_proj.weight',
-        're:.*up_proj.weight',
-        're:.*down_proj.weight',
-      ]
-      start: 0
 quantization_stage:
   run_type: oneshot
   quantization_modifiers:
```
examples/quantization_2of4_sparse_w4a16/2of4_w4a16_recipe.yaml (0 additions, 14 deletions)

```diff
@@ -6,20 +6,6 @@ sparsity_stage:
       mask_structure: "2:4"
       targets: ["Linear"]
       ignore: ["re:.*lm_head"]
-finetuning_stage:
-  run_type: train
-  finetuning_modifiers:
-    ConstantPruningModifier:
-      targets: [
-        're:.*q_proj.weight',
-        're:.*k_proj.weight',
-        're:.*v_proj.weight',
-        're:.*o_proj.weight',
-        're:.*gate_proj.weight',
-        're:.*up_proj.weight',
-        're:.*down_proj.weight',
-      ]
-      start: 0
 quantization_stage:
   run_type: oneshot
   quantization_modifiers:
```
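With the finetuning stage removed, both remaining stages run in one shot, so the staged recipe no longer needs a training pass to hold the 2:4 mask in place. A minimal sketch of applying the trimmed recipe, assuming a single `oneshot` call is sufficient; the model ID, dataset, and calibration settings below are illustrative, not taken from this PR:

```python
# Hedged usage sketch: apply the sparsity + quantization stages in one pass.
# The model ID, dataset, and calibration settings are assumptions.
from llmcompressor import oneshot

oneshot(
    model="meta-llama/Meta-Llama-3-8B-Instruct",  # assumed model choice
    dataset="open_platypus",                      # assumed calibration dataset
    recipe="2of4_w4a16_recipe.yaml",              # the recipe edited above
    output_dir="Meta-Llama-3-8B-Instruct-2of4-w4a16",
    max_seq_length=2048,
    num_calibration_samples=512,
)
```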
examples/sparse_2of4_quantization_fp8/README.md (2 additions, 10 deletions)

````diff
@@ -63,21 +63,13 @@ recipe = [
 ]
 
 if fp8_enabled:
-    recipe.extend([
+    recipe.append(
         QuantizationModifier(
             targets=["Linear"],
             ignore=["lm_head"],
             scheme="FP8_DYNAMIC",
         ),
-        ConstantPruningModifier(
-            targets=[
-                r"re:.*q_proj.weight", r"re:.*k_proj.weight", r"re:.*v_proj.weight",
-                r"re:.*o_proj.weight", r"re:.*gate_proj.weight", r"re:.*up_proj.weight",
-                r"re:.*down_proj.weight",
-            ],
-            start=0,
-        ),
-    ])
+    )
 ```
 
 2. **Apply Compression**
````
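After this edit the README's recipe builds a single-element list and appends the quantizer only when FP8 is requested. For reference, a sketch of the block as it reads post-change; the `SparseGPTModifier` arguments are assumed from the example's surrounding context and are not part of this diff:

```python
# Post-change recipe from the README, sketched end to end.
# The SparseGPTModifier arguments are assumptions for illustration.
from llmcompressor.modifiers.obcq import SparseGPTModifier
from llmcompressor.modifiers.quantization import QuantizationModifier

fp8_enabled = True  # illustration only

recipe = [
    SparseGPTModifier(
        sparsity=0.5,
        mask_structure="2:4",
        targets=[r"re:model.layers.\d*$"],  # assumed target pattern
    ),
]

if fp8_enabled:
    recipe.append(
        QuantizationModifier(
            targets=["Linear"],
            ignore=["lm_head"],
            scheme="FP8_DYNAMIC",
        )
    )
```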
examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py (14 additions, 21 deletions)

```diff
@@ -5,7 +5,6 @@
 
 from llmcompressor import oneshot
 from llmcompressor.modifiers.obcq import SparseGPTModifier
-from llmcompressor.modifiers.pruning import ConstantPruningModifier
 from llmcompressor.modifiers.quantization import QuantizationModifier
 
 # Configuration
@@ -52,29 +51,23 @@ def get_recipe(fp8_enabled):
     save_dir = MODEL_ID.split("/")[1] + "2of4-sparse"
 
     if fp8_enabled:
-        base_recipe.extend(
-            [
-                QuantizationModifier(
-                    targets=["Linear"],
-                    ignore=["lm_head"],
-                    scheme="FP8_DYNAMIC",
-                ),
-                ConstantPruningModifier(
-                    targets=[
-                        r"re:.*q_proj.weight",
-                        r"re:.*k_proj.weight",
-                        r"re:.*v_proj.weight",
-                        r"re:.*o_proj.weight",
-                        r"re:.*gate_proj.weight",
-                        r"re:.*up_proj.weight",
-                        r"re:.*down_proj.weight",
-                    ],
-                    start=0,
-                ),
-            ]
+        base_recipe.append(
+            QuantizationModifier(
+                targets=["Linear"],
+                ignore=["lm_head"],
+                scheme="FP8_DYNAMIC",
+            )
         )
         save_dir = MODEL_ID.split("/")[1] + "2of4-W8A8-FP8-Dynamic-Per-Token"
 
+    # check that asymmetric quantization is not being used
+    q_scheme = base_recipe[1].scheme
+    if not isinstance(q_scheme, str) and not q_scheme["weights"].symmetric:
+        raise ValueError(
+            "Asymmetric quantization with 2of4 sparsity is not supported by vLLM. "
+            "Please use symmetric quantization"
+        )
+
     return base_recipe, save_dir
```
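The new guard works because preset schemes are plain strings (e.g. `"FP8_DYNAMIC"`) and are symmetric by construction, so only dict-style schemes need their weight args inspected; note it indexes `base_recipe[1]`, assuming the quantizer sits right after the sparsifier. A self-contained illustration of the rejection path, where `_Weights` is a hypothetical stand-in for the library's weight-quantization args object:

```python
# Hypothetical illustration of the new symmetric-quantization guard.
# _Weights stands in for the library's weight-quantization args object.
class _Weights:
    symmetric = False  # an asymmetric override, which vLLM cannot run with 2:4

q_scheme = {"weights": _Weights()}  # dict-style scheme; presets are plain strings

try:
    if not isinstance(q_scheme, str) and not q_scheme["weights"].symmetric:
        raise ValueError(
            "Asymmetric quantization with 2of4 sparsity is not supported by vLLM. "
            "Please use symmetric quantization"
        )
except ValueError as err:
    print(err)  # the recipe is rejected before compression starts
```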
```diff
@@ -9,17 +9,6 @@ sparsity_stage:
 quantization_stage:
   run_type: oneshot
   quantization_modifiers:
-    ConstantPruningModifier:
-      targets: [
-        're:.*q_proj.weight',
-        're:.*k_proj.weight',
-        're:.*v_proj.weight',
-        're:.*o_proj.weight',
-        're:.*gate_proj.weight',
-        're:.*up_proj.weight',
-        're:.*down_proj.weight',
-      ]
-      start: 0
     QuantizationModifier:
       targets: ["Linear"]
       ignore: ["lm_head"]
```
```diff
@@ -74,7 +74,6 @@ def test_constant_pruning_modifier_e2e(model, optimizer):
     state.update(
         model=model,
         optimizer=optimizer(model.parameters(), lr=0.1),
-        start=0,
     )
     modifier = ConstantPruningModifier(
         targets="__ALL_PRUNABLE__",
```
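For context, ConstantPruningModifier itself is unchanged by this PR: during training it freezes whatever sparsity pattern already exists on the targeted weights. A toy PyTorch sketch of that mask-freezing idea, not the modifier's actual implementation:

```python
import torch

# Toy illustration: freeze an existing sparsity pattern during updates.
weight = torch.randn(8, 8)
weight[weight.abs() < 0.5] = 0.0       # pre-existing sparsity mask
mask = weight != 0

update = 0.01 * torch.randn(8, 8)      # stand-in for an optimizer step
weight = (weight - update) * mask      # pruned entries stay exactly zero

assert int((weight == 0).sum()) >= int((~mask).sum())
print(f"{int((~mask).sum())} pruned entries kept at zero")
```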