
Commit 21802c4 (parent 652907b)

[ROCm][Bugfix][FP8] Make fp8 quant respect fused modules mapping (#16031)
Signed-off-by: mgoin <[email protected]>

File tree: 1 file changed, +3 −1 lines
  • vllm/model_executor/layers/quantization/fp8.py


vllm/model_executor/layers/quantization/fp8.py

Lines changed: 3 additions & 1 deletion
@@ -116,7 +116,9 @@ def get_quant_method(self, layer: torch.nn.Module,
         from vllm.attention.layer import Attention  # Avoid circular import

         if isinstance(layer, LinearBase):
-            if is_layer_skipped(prefix, self.ignored_layers):
+            if is_layer_skipped(prefix=prefix,
+                                ignored_layers=self.ignored_layers,
+                                fused_mapping=self.packed_modules_mapping):
                 return UnquantizedLinearMethod()
             return Fp8LinearMethod(self)
         elif isinstance(layer, FusedMoE):
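
Why the mapping matters: vLLM builds fused modules such as qkv_proj out of checkpoint shards that keep their unfused names (q_proj, k_proj, v_proj), and an FP8 checkpoint's ignored_layers list typically refers to those unfused names. A skip check that only compares the fused prefix against ignored_layers would miss the entry and quantize a layer that should stay in high precision; passing packed_modules_mapping lets is_layer_skipped expand the fused name first. The sketch below illustrates that idea only; the function name is_layer_skipped_sketch, the example layer paths, and the any() policy are illustrative assumptions, not vLLM's actual implementation.

# Illustrative sketch of a fused-mapping-aware skip check (not vLLM's code).
from typing import Dict, List


def is_layer_skipped_sketch(prefix: str,
                            ignored_layers: List[str],
                            fused_mapping: Dict[str, List[str]]) -> bool:
    """Return True if the layer at `prefix` should stay unquantized.

    When the last path component is a fused module (e.g. "qkv_proj"), the
    ignore list usually names the unfused shards, so the prefix is expanded
    through `fused_mapping` before the membership check.
    """
    parent, _, last = prefix.rpartition(".")
    if last in fused_mapping:
        shard_prefixes = [
            f"{parent}.{shard}" if parent else shard
            for shard in fused_mapping[last]
        ]
        # Policy choice for this sketch: skip if any shard is ignored.
        return any(p in ignored_layers for p in shard_prefixes)
    return prefix in ignored_layers


# Example: a checkpoint that leaves the attention projections unquantized.
ignored = ["model.layers.0.self_attn.q_proj",
           "model.layers.0.self_attn.k_proj",
           "model.layers.0.self_attn.v_proj"]
fused = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}

# A prefix-only check would return False here and quantize the fused layer;
# with the mapping, the fused module is correctly skipped.
print(is_layer_skipped_sketch("model.layers.0.self_attn.qkv_proj",
                              ignored, fused))  # True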
