Commit c5b0cf9

add aiter fp8 block scaled moe and w8a8 block gemm kernels
Signed-off-by: vllmellm <[email protected]>
1 parent 5c46937 commit c5b0cf9

File tree

4 files changed: +90 -8 lines

  Dockerfile.rocm_base
  vllm/envs.py
  vllm/model_executor/layers/quantization/fp8.py
  vllm/model_executor/layers/quantization/utils/fp8_utils.py
Dockerfile.rocm_base

Lines changed: 1 addition & 1 deletion

@@ -12,7 +12,7 @@ ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
 ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
 ARG FA_BRANCH="1a7f4dfa"
 ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
-ARG AITER_BRANCH="0508c8df"
+ARG AITER_BRANCH="e1ec015"
 ARG AITER_REPO="https://github.com/ROCm/aiter.git"
 
 FROM ${BASE_IMAGE} AS base

vllm/envs.py

Lines changed: 14 additions & 0 deletions

@@ -25,6 +25,8 @@
     VLLM_USE_AITER_PAGED_ATTN: bool = False
     VLLM_USE_AITER_LINEAR: bool = False
     VLLM_USE_AITER_NORM: bool = False
+    VLLM_USE_AITER_FP8_BLOCK_SCALED_MOE: bool = False
+    VLLM_USE_AITER_W8A8_BLOCK_GEMM: bool = False
     RANK: int = 0
     VLLM_FLASH_ATTN_VERSION: Optional[int] = None
     LOCAL_RANK: int = 0
@@ -322,6 +324,18 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:
              ("true", "1") and os.getenv("VLLM_USE_AITER_NORM", "True").lower() in
              ("true", "1")),
 
+    # use aiter fp8 block scaled moe kernel op if aiter ops are enabled.
+    "VLLM_USE_AITER_FP8_BLOCK_SCALED_MOE":
+    lambda: (os.getenv("VLLM_USE_AITER", "False").lower() in
+             ("true", "1") and os.getenv("VLLM_USE_AITER_FP8_BLOCK_SCALED_MOE",
+                                         "False").lower() in ("true", "1")),
+
+    # use aiter w8a8 block gemm kernel op if aiter ops are enabled.
+    "VLLM_USE_AITER_W8A8_BLOCK_GEMM":
+    lambda: (os.getenv("VLLM_USE_AITER", "False").lower() in
+             ("true", "1") and os.getenv("VLLM_USE_AITER_W8A8_BLOCK_GEMM",
+                                         "False").lower() in ("true", "1")),
+
     # rank of the process in the distributed setting, used to determine
     # the driver worker
     "RANK":

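Both new flags follow the existing AITER gating pattern: they default to off and only take effect when the umbrella VLLM_USE_AITER flag is also truthy. A minimal sketch of turning them on, assuming a ROCm build of vLLM with AITER installed (only the flag names come from this diff; the rest is illustrative):

import os

# Each per-kernel flag is ANDed with the umbrella VLLM_USE_AITER flag,
# so both must be set for the AITER path to be taken.
os.environ["VLLM_USE_AITER"] = "1"
os.environ["VLLM_USE_AITER_FP8_BLOCK_SCALED_MOE"] = "1"
os.environ["VLLM_USE_AITER_W8A8_BLOCK_GEMM"] = "1"

import vllm.envs as envs

# The lambdas registered above are evaluated lazily on attribute access.
assert envs.VLLM_USE_AITER_FP8_BLOCK_SCALED_MOE
assert envs.VLLM_USE_AITER_W8A8_BLOCK_GEMM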
vllm/model_executor/layers/quantization/fp8.py

Lines changed: 58 additions & 1 deletion

@@ -35,7 +35,8 @@
 from vllm.utils import is_navi
 
 if envs.VLLM_USE_AITER_MOE:
-    from aiter.fused_moe_bf16_asm import asm_moe
+    import aiter
+    from aiter.fused_moe_bf16_asm import asm_moe, moe_sorting_ck
     from aiter.ops.shuffle import shuffle_weight
 
 ACTIVATION_SCHEMES = ["static", "dynamic"]
@@ -608,6 +609,14 @@ def process_weights_after_loading(self, layer: Module) -> None:
             layer.w2_weight = Parameter(w2_weight, requires_grad=False)
             layer.w2_weight_scale_inv = Parameter(w2_weight_scale_inv,
                                                   requires_grad=False)
+            if envs.VLLM_USE_AITER_FP8_BLOCK_SCALED_MOE:
+                layer.w13_weight = torch.nn.Parameter(shuffle_weight(
+                    layer.w13_weight.data),
+                    requires_grad=False)
+                layer.w2_weight = torch.nn.Parameter(shuffle_weight(
+                    layer.w2_weight.data),
+                    requires_grad=False)
+
             return
 
         # If checkpoint is fp16, quantize in place.
@@ -798,6 +807,8 @@ def apply(
         e_score_correction_bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         from vllm.model_executor.layers.fused_moe import fused_experts
+        from vllm.model_executor.layers.fused_moe.fused_moe import (
+            per_token_group_quant_fp8)
 
         topk_weights, topk_ids = FusedMoE.select_experts(
             hidden_states=x,
@@ -812,6 +823,52 @@
             e_score_correction_bias=e_score_correction_bias,
         )
 
+        if envs.VLLM_USE_AITER_FP8_BLOCK_SCALED_MOE:
+            w1 = layer.w13_weight
+            w2 = layer.w2_weight
+            w1_scale = (layer.w13_weight_scale_inv
+                        if self.block_quant else layer.w13_weight_scale)
+            w2_scale = (layer.w2_weight_scale_inv
+                        if self.block_quant else layer.w2_weight_scale)
+
+            block_shape = self.quant_config.weight_block_size
+            # The default block sizes are 128 in AITER.
+            if block_shape is None:
+                block_shape = [128, 128]
+
+            local_E = E = w1.shape[0]
+            topk = topk_ids.shape[1]
+            model_dim = w1.shape[-1]
+            dtype = x.dtype
+            scale_blk_k = block_shape[1]
+
+            (
+                sorted_token_ids,
+                sorted_weight_buf,
+                sorted_expert_ids,
+                num_valid_ids,
+                out_asm,
+            ) = moe_sorting_ck(topk_ids, topk_weights, E, model_dim, dtype)
+            a1, a1_scale = per_token_group_quant_fp8(x, scale_blk_k)
+            aiter.fmoe_fp8_blockscale_g1u1(
+                out_asm,
+                a1,
+                w1,
+                w2,
+                sorted_token_ids,
+                sorted_weight_buf,
+                sorted_expert_ids,
+                num_valid_ids,
+                topk,
+                w1_scale.view(local_E, -1),
+                w2_scale.view(local_E, -1),
+                a1_scale.t().contiguous(),
+                block_shape[0],
+                block_shape[1],
+                None,
+            )
+            return out_asm
+
         if envs.VLLM_USE_AITER_MOE:
             return asm_moe(hidden_states=x,
                            w1=layer.w13_weight,
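The new branch in apply() sorts the routed tokens with moe_sorting_ck, quantizes the activations per 128-wide group with per_token_group_quant_fp8, and passes flattened block scales to aiter.fmoe_fp8_blockscale_g1u1. The torch-only sketch below illustrates the scale reshapes that the call performs; the tensor sizes are made-up examples, not values from this commit:

import torch

# Illustrative sizes only (assumed): 8 experts, fused w13 output dim 14336,
# model dim 4096, 16 tokens, and the default 128x128 scale blocks.
E, N, K, M, blk = 8, 14336, 4096, 16, 128

# Block-quantized expert weights carry one scale per 128x128 block, giving a
# per-expert scale tensor of shape [E, N/blk, K/blk]; the kernel call above
# flattens it per expert via w1_scale.view(local_E, -1).
w1_scale = torch.rand(E, N // blk, K // blk)
print(w1_scale.view(E, -1).shape)        # torch.Size([8, 3584])

# per_token_group_quant_fp8(x, blk) yields one scale per 128-element group of
# each token's activations, i.e. shape [M, K/blk]; the diff hands it to the
# kernel as a1_scale.t().contiguous().
a1_scale = torch.rand(M, K // blk)
print(a1_scale.t().contiguous().shape)   # torch.Size([32, 16])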

vllm/model_executor/layers/quantization/utils/fp8_utils.py

Lines changed: 17 additions & 6 deletions

@@ -10,6 +10,7 @@
 import triton
 import triton.language as tl
 
+import vllm.envs as envs
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
@@ -71,12 +72,22 @@ def apply_w8a8_block_fp8_linear(
     q_input, x_scale = per_token_group_quant_fp8(input_2d,
                                                  block_size[1],
                                                  column_major_scales=False)
-    output = w8a8_block_fp8_matmul(q_input,
-                                   weight,
-                                   x_scale,
-                                   weight_scale,
-                                   block_size,
-                                   output_dtype=input.dtype)
+    if envs.VLLM_USE_AITER_W8A8_BLOCK_GEMM:
+        import aiter
+        output = torch.zeros(
+            [q_input.shape[0], weight.shape[0]],
+            dtype=input.dtype,
+            device=q_input.device,
+        )
+        aiter.gemm_a8w8_blockscale(q_input, weight, x_scale, weight_scale,
+                                   output)
+    else:
+        output = w8a8_block_fp8_matmul(q_input,
+                                       weight,
+                                       x_scale,
+                                       weight_scale,
+                                       block_size,
+                                       output_dtype=input.dtype)
     if bias is not None:
         output = output + bias
     return output.to(dtype=input.dtype).view(*output_shape)
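Unlike the Triton w8a8_block_fp8_matmul fallback, which returns its result, aiter.gemm_a8w8_blockscale writes into a caller-allocated buffer, which is why the new branch preallocates output with torch.zeros. A small torch-only sketch of that shape bookkeeping; sizes and stand-in dtypes are illustrative assumptions, not values from this commit:

import torch

# Illustrative sizes (assumed): 32 tokens, hidden size 4096, 11008 output features.
M, K, N = 32, 4096, 11008
q_input = torch.empty(M, K, dtype=torch.int8)   # stand-in for the fp8-quantized activations
weight = torch.empty(N, K, dtype=torch.int8)    # stand-in for the fp8 block-quantized weight

# The AITER branch allocates the destination as [num_tokens, out_features] in
# the unquantized activation dtype before calling the kernel, matching the
# torch.zeros([q_input.shape[0], weight.shape[0]], ...) call in the diff.
output = torch.zeros([q_input.shape[0], weight.shape[0]],
                     dtype=torch.bfloat16, device=q_input.device)
print(output.shape)  # torch.Size([32, 11008])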
