- Uses block-wise quantization to compress weights to FP8 (commonly in 128×128 tiles) and dynamic per-token-group quantization (group size 128) for activations. Does not require a calibration dataset. Activation quantization is carried out at inference time in vLLM.
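A minimal PyTorch sketch of the numerics described above, assuming 128×128 weight tiles, per-token groups of 128 channels, and the e4m3 FP8 format; this illustrates the scaling scheme only and is not the library's or vLLM's actual kernel implementation:

```python
import torch
import torch.nn.functional as F

FP8_DTYPE = torch.float8_e4m3fn          # assumed FP8 format
FP8_MAX = torch.finfo(FP8_DTYPE).max     # 448.0 for e4m3

def quantize_weight_blockwise(weight: torch.Tensor, block: int = 128):
    """Static weight quantization: one absmax scale per 128x128 tile."""
    rows, cols = weight.shape
    # Pad so both dimensions divide evenly into tiles.
    w = F.pad(weight, (0, (-cols) % block, 0, (-rows) % block))
    tiles = w.reshape(w.shape[0] // block, block, w.shape[1] // block, block)
    scales = tiles.abs().amax(dim=(1, 3), keepdim=True).clamp(min=1e-12) / FP8_MAX
    q = (tiles / scales).clamp(-FP8_MAX, FP8_MAX).to(FP8_DTYPE)
    return q, scales.squeeze(1).squeeze(-1)  # scales: (rows/block, cols/block)

def quantize_activation_per_token_group(x: torch.Tensor, group: int = 128):
    """Dynamic activation quantization: one scale per 128-channel group of each token."""
    tokens, hidden = x.shape
    g = x.reshape(tokens, hidden // group, group)
    scales = g.abs().amax(dim=-1, keepdim=True).clamp(min=1e-12) / FP8_MAX
    q = (g / scales).clamp(-FP8_MAX, FP8_MAX).to(FP8_DTYPE)
    return q.reshape(tokens, hidden), scales.squeeze(-1)  # scales: (tokens, hidden/group)
```

Because the activation scales depend only on the incoming tensor, they are computed on the fly at inference time, which is why no calibration dataset is needed for this scheme.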
## Sparsification
Sparsification reduces model complexity by pruning selected weight values to zero while retaining essential weights in a subset of parameters. Supported formats include: