
Commit d567353

fix_vllm_quant (ROCm#342)
1 parent: 466334a


vllm/model_executor/layers/quantization/fp8.py

Lines changed: 20 additions & 0 deletions
@@ -423,6 +423,26 @@ def process_weights_after_loading(self, layer: Module) -> None:
                                                   requires_grad=False)
             layer.w2_weight = torch.nn.Parameter(w2_weight,
                                                  requires_grad=False)
+
+            if envs.VLLM_MOE_SHUFFLE:
+                layer.w13_weight.data = permute_weight_fp8(layer.w13_weight.data)
+                layer.w2_weight.data = permute_weight_fp8(layer.w2_weight.data)
+
+            if envs.VLLM_MOE_PADDING:
+                pad_size = 256
+                layer.w13_weight = torch.nn.Parameter(
+                    F.pad(layer.w13_weight.data, (0, pad_size), "constant",
+                          0)[..., :-pad_size],
+                    requires_grad=False,
+                )
+                torch.cuda.empty_cache()
+                layer.w2_weight = torch.nn.Parameter(
+                    F.pad(layer.w2_weight.data, (0, pad_size), "constant",
+                          0)[..., :-pad_size],
+                    requires_grad=False,
+                )
+                torch.cuda.empty_cache()
+
             return
 
         # If checkpoint is fp8, we need to handle that the
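
As a side note for readers of this change: the sketch below isolates the F.pad(...)[..., :-pad_size] idiom used in the VLLM_MOE_PADDING branch, in plain PyTorch and with a made-up weight shape. Padding the last dimension by pad_size and then slicing the padding back off leaves the logical shape and values untouched, but the returned view sits in storage whose row pitch is K + pad_size, which is presumably the layout the ROCm MoE GEMM kernels benefit from.

# Hypothetical standalone illustration; the shape below is invented.
import torch
import torch.nn.functional as F

pad_size = 256
w = torch.randn(8, 1024, 512)  # e.g. (num_experts, N, K); not the real model shape

# Pad the last dim on the right by pad_size, then slice the padding off again.
padded = F.pad(w, (0, pad_size), "constant", 0)[..., :-pad_size]

print(padded.shape)            # torch.Size([8, 1024, 512]): logical shape unchanged
print(padded.stride())         # (786432, 768, 1): row pitch is now K + pad_size = 768
print(torch.equal(padded, w))  # True: values are identical to the unpadded weight

Read this way, VLLM_MOE_SHUFFLE appears to reorder the fp8 expert weights via permute_weight_fp8 into the layout the fused MoE kernel expects, and VLLM_MOE_PADDING trades a small amount of extra memory for the padded stride; the torch.cuda.empty_cache() calls after each reassignment release the memory held by the replaced, unpadded parameters.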
