1 file changed: +3 −3 lines changed

vllm/model_executor/layers/quantization

@@ -450,10 +450,10 @@ def process_weights_after_loading(self, layer: Module) -> None:
         # Activations not quantized for marlin.
         del layer.input_scale

-        # On B200, if E8M0 for DeepGemm is used, we need to
+        # On Blackwell or Hopper, if E8M0 for DeepGemm is used, we need to
         # requantize the weight and input to the specific scale
         # at the same time.
-        if is_deep_gemm_e8m0_used():
+        if is_deep_gemm_e8m0_used() and self.block_quant:
             assert layer.weight_block_size is not None
             block_sz = tuple(layer.weight_block_size)
             requant_weight_ue8m0_inplace(
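The added `and self.block_quant` guard matters because `weight_block_size` is only populated for block-quantized checkpoints; a per-tensor FP8 model would otherwise reach the `assert` with `None` and crash. For context, UE8M0 scales are pure powers of two (8 exponent bits, no mantissa bits), so the re-quantisation conceptually snaps each block scale up to the next power of two and rescales the weight block to compensate. Below is a minimal sketch of that idea, not vLLM's actual `requant_weight_ue8m0_inplace` implementation; the function name, tensor layout, and rounding rule here are assumptions:

```python
import torch

def requant_to_ue8m0_sketch(weight: torch.Tensor,
                            scale: torch.Tensor,
                            block_sz: tuple[int, int]) -> None:
    # Snap each per-block scale up to the nearest power of two, since
    # E8M0 has no mantissa bits and can only encode exact powers of two.
    new_scale = torch.exp2(torch.ceil(torch.log2(scale)))
    # Expand the per-block ratio to element granularity and rescale the
    # weight so that (weight * scale) is preserved after the snap.
    bm, bn = block_sz
    m, n = weight.shape
    ratio = (scale / new_scale).repeat_interleave(bm, 0).repeat_interleave(bn, 1)
    weight.copy_((weight.float() * ratio[:m, :n]).to(weight.dtype))
    scale.copy_(new_scale)
```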
@@ -905,7 +905,7 @@ def process_weights_after_loading(self, layer: Module) -> None:
         del layer.w13_input_scale
         del layer.w2_input_scale

-        if is_deep_gemm_e8m0_used():
+        if is_deep_gemm_e8m0_used() and self.block_quant:
             assert layer.weight_block_size is not None
             # Re-quantise the expert weights so their scales are UE8M0.
             block_sz = tuple(layer.weight_block_size)
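In the MoE path the same guarded re-quantisation is applied to each expert's fused weights. A hedged sketch of how the helper would be invoked there, assuming `w13_weight`/`w2_weight` and matching `*_weight_scale_inv` attribute names (the diff only confirms the input-scale attributes it deletes):

```python
# Sketch only: apply the UE8M0 snap to both fused expert weight tensors.
if is_deep_gemm_e8m0_used() and self.block_quant:
    assert layer.weight_block_size is not None
    block_sz = tuple(layer.weight_block_size)
    for w, s in ((layer.w13_weight, layer.w13_weight_scale_inv),
                 (layer.w2_weight, layer.w2_weight_scale_inv)):
        requant_weight_ue8m0_inplace(w, s, block_sz)
```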