diff --git a/examples/quantization_2of4_sparse_w4a16/2of4_w4a16_group-128_recipe.yaml b/examples/quantization_2of4_sparse_w4a16/2of4_w4a16_group-128_recipe.yaml index e59cf8a96f..92cc85ae73 100644 --- a/examples/quantization_2of4_sparse_w4a16/2of4_w4a16_group-128_recipe.yaml +++ b/examples/quantization_2of4_sparse_w4a16/2of4_w4a16_group-128_recipe.yaml @@ -6,20 +6,6 @@ sparsity_stage: mask_structure: "2:4" targets: ["Linear"] ignore: ["re:.*lm_head"] -finetuning_stage: - run_type: train - finetuning_modifiers: - ConstantPruningModifier: - targets: [ - 're:.*q_proj.weight', - 're:.*k_proj.weight', - 're:.*v_proj.weight', - 're:.*o_proj.weight', - 're:.*gate_proj.weight', - 're:.*up_proj.weight', - 're:.*down_proj.weight', - ] - start: 0 quantization_stage: run_type: oneshot quantization_modifiers: diff --git a/examples/quantization_2of4_sparse_w4a16/2of4_w4a16_recipe.yaml b/examples/quantization_2of4_sparse_w4a16/2of4_w4a16_recipe.yaml index 4ff5ff26e2..dc7e18b6e7 100644 --- a/examples/quantization_2of4_sparse_w4a16/2of4_w4a16_recipe.yaml +++ b/examples/quantization_2of4_sparse_w4a16/2of4_w4a16_recipe.yaml @@ -6,20 +6,6 @@ sparsity_stage: mask_structure: "2:4" targets: ["Linear"] ignore: ["re:.*lm_head"] -finetuning_stage: - run_type: train - finetuning_modifiers: - ConstantPruningModifier: - targets: [ - 're:.*q_proj.weight', - 're:.*k_proj.weight', - 're:.*v_proj.weight', - 're:.*o_proj.weight', - 're:.*gate_proj.weight', - 're:.*up_proj.weight', - 're:.*down_proj.weight', - ] - start: 0 quantization_stage: run_type: oneshot quantization_modifiers: diff --git a/examples/sparse_2of4_quantization_fp8/README.md b/examples/sparse_2of4_quantization_fp8/README.md index 99fc3c5455..75dd38f2e0 100644 --- a/examples/sparse_2of4_quantization_fp8/README.md +++ b/examples/sparse_2of4_quantization_fp8/README.md @@ -63,21 +63,13 @@ recipe = [ ] if fp8_enabled: - recipe.extend([ + recipe.append( QuantizationModifier( targets=["Linear"], ignore=["lm_head"], scheme="FP8_DYNAMIC", ), - ConstantPruningModifier( - targets=[ - r"re:.*q_proj.weight", r"re:.*k_proj.weight", r"re:.*v_proj.weight", - r"re:.*o_proj.weight", r"re:.*gate_proj.weight", r"re:.*up_proj.weight", - r"re:.*down_proj.weight", - ], - start=0, - ), - ]) + ) ``` 2. **Apply Compression** diff --git a/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py b/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py index ad878a3cef..872f2fa77f 100644 --- a/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py +++ b/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py @@ -5,7 +5,6 @@ from llmcompressor import oneshot from llmcompressor.modifiers.obcq import SparseGPTModifier -from llmcompressor.modifiers.pruning import ConstantPruningModifier from llmcompressor.modifiers.quantization import QuantizationModifier # Configuration @@ -52,29 +51,23 @@ def get_recipe(fp8_enabled): save_dir = MODEL_ID.split("/")[1] + "2of4-sparse" if fp8_enabled: - base_recipe.extend( - [ - QuantizationModifier( - targets=["Linear"], - ignore=["lm_head"], - scheme="FP8_DYNAMIC", - ), - ConstantPruningModifier( - targets=[ - r"re:.*q_proj.weight", - r"re:.*k_proj.weight", - r"re:.*v_proj.weight", - r"re:.*o_proj.weight", - r"re:.*gate_proj.weight", - r"re:.*up_proj.weight", - r"re:.*down_proj.weight", - ], - start=0, - ), - ] + base_recipe.append( + QuantizationModifier( + targets=["Linear"], + ignore=["lm_head"], + scheme="FP8_DYNAMIC", + ) ) save_dir = MODEL_ID.split("/")[1] + "2of4-W8A8-FP8-Dynamic-Per-Token" + # check that asymmetric quantization is not being used + q_scheme = base_recipe[1].scheme + if not isinstance(q_scheme, str) and not q_scheme["weights"].symmetric: + raise ValueError( + "Asymmetric quantization with 2of4 sparsity is not supported by vLLM. " + "Please use symmetric quantization" + ) + return base_recipe, save_dir diff --git a/tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4_fp8_dynamic.yaml b/tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4_fp8_dynamic.yaml index 93ba938675..ebf2d9d4b7 100644 --- a/tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4_fp8_dynamic.yaml +++ b/tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4_fp8_dynamic.yaml @@ -9,17 +9,6 @@ sparsity_stage: quantization_stage: run_type: oneshot quantization_modifiers: - ConstantPruningModifier: - targets: [ - 're:.*q_proj.weight', - 're:.*k_proj.weight', - 're:.*v_proj.weight', - 're:.*o_proj.weight', - 're:.*gate_proj.weight', - 're:.*up_proj.weight', - 're:.*down_proj.weight', - ] - start: 0 QuantizationModifier: targets: ["Linear"] ignore: ["lm_head"] diff --git a/tests/llmcompressor/pytorch/modifiers/pruning/constant/test_pytorch.py b/tests/llmcompressor/pytorch/modifiers/pruning/constant/test_pytorch.py index 433e9ae8d9..11ee61598f 100644 --- a/tests/llmcompressor/pytorch/modifiers/pruning/constant/test_pytorch.py +++ b/tests/llmcompressor/pytorch/modifiers/pruning/constant/test_pytorch.py @@ -74,7 +74,6 @@ def test_constant_pruning_modifier_e2e(model, optimizer): state.update( model=model, optimizer=optimizer(model.parameters(), lr=0.1), - start=0, ) modifier = ConstantPruningModifier( targets="__ALL_PRUNABLE__",