
Commit 8826599

Authored by gshtrast, with co-authors tjtanaa and hongxiayang
Fix fused moe (ROCm#506)
* Added the extra use_irope parameter in

  Co-authored-by: Hongxia Yang <[email protected]>
  Signed-off-by: tjtanaa <[email protected]>

* Fix ROCm V1 Engine Fused MoE Bug

  Signed-off-by: tjtanaa <[email protected]>

* Add warning message that V0 do not support irope

  Signed-off-by: tjtanaa <[email protected]>

---------

Signed-off-by: tjtanaa <[email protected]>
Co-authored-by: tjtanaa <[email protected]>
Co-authored-by: Hongxia Yang <[email protected]>
1 parent d17d4df commit 8826599


2 files changed (+19, -1 lines)


vllm/attention/backends/rocm_flash_attn.py

Lines changed: 5 additions & 1 deletion

@@ -462,11 +462,15 @@ def __init__(
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
     ) -> None:
         if blocksparse_params is not None:
             raise ValueError(
                 "ROCmFlashAttention does not support blocksparse attention.")
-
+        if use_irope:
+            logger.warning(
+                "Using irope in V0 is not supported yet, it will fall back "
+                "to global attention for long context.")
         if logits_soft_cap is None:
             # In flash-attn, setting logits_soft_cap as 0 means no soft cap.
             self.logits_soft_cap = 0.0
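The change here is additive: the ROCm V0 backend now accepts the `use_irope` keyword that callers may pass and only warns that interleaved-RoPE local attention is not implemented, falling back to global attention. Below is a minimal, hypothetical sketch of that pattern; `ToyAttentionImpl` and its signature are illustrative stand-ins, not the real vLLM class.

import logging

logger = logging.getLogger(__name__)


class ToyAttentionImpl:
    """Illustrative stand-in for an attention backend constructor."""

    def __init__(self, attn_type: str = "decoder",
                 use_irope: bool = False) -> None:
        if use_irope:
            # Accept the flag but warn: this toy V0-style path has no
            # interleaved-RoPE support and behaves as global attention.
            logger.warning(
                "Using irope in V0 is not supported yet, it will fall back "
                "to global attention for long context.")
        self.attn_type = attn_type


# Without `use_irope` in the signature, a caller passing it would fail with
# TypeError: __init__() got an unexpected keyword argument 'use_irope'.
impl = ToyAttentionImpl(use_irope=True)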

vllm/model_executor/layers/fused_moe/fused_moe.py

Lines changed: 14 additions & 0 deletions

@@ -461,6 +461,20 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
                             use_int4_w4a16: bool,
                             block_shape: Optional[List[int]] = None) -> None:
     assert topk_weights is not None or not mul_routed_weight
+    if current_platform.is_rocm() and topk_weights is not None:
+        # This is to handle the bug https://github.com/ROCm/pytorch/issues/2020
+        # where, in the HIPGraph, it can occur that the `topk_weights`
+        # tensor has the following properties:
+        #   .shape: [1024, 1]
+        #   .is_contiguous(): True
+        #   .stride(): [1, 1024]
+        #   .is_contiguous(memory_format=torch.channels_last) is False
+        #   .is_contiguous(memory_format=torch.contiguous_format) is True
+        # This only happens when using the V1 Engine on ROCm with HIPGraph
+        # with torch.compile Dynamo.
+        # The V1 Engine on ROCm with eager mode is fine.
+        # The V0 Engine on ROCm with HIPGraph is fine.
+        topk_weights = topk_weights.view(-1).reshape(topk_weights.shape)
     assert topk_weights is None or topk_weights.stride(1) == 1
     assert sorted_token_ids.stride(0) == 1
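The comment block above describes a stride anomaly rather than a shape change: a [1024, 1] tensor whose trailing stride is 1024 instead of 1, yet which PyTorch still reports as contiguous because size-1 dimensions are skipped by the contiguity check. The sketch below reproduces the same stride pattern via a transpose (only a stand-in for the HIPGraph/torch.compile behaviour described in the commit) and shows why `view(-1).reshape(...)` restores a canonical stride so the downstream `stride(1) == 1` assertion holds.

import torch

# Hypothetical reproduction of the stride pattern described in the comment.
# Transposing a [1, 1024] tensor yields shape [1024, 1] with stride (1, 1024),
# which PyTorch still reports as contiguous (size-1 dims are ignored by the
# contiguity check).
topk_weights = torch.randn(1, 1024).t()
print(topk_weights.shape)            # torch.Size([1024, 1])
print(topk_weights.stride())         # (1, 1024)
print(topk_weights.is_contiguous())  # True
# topk_weights.stride(1) == 1024, so `assert topk_weights.stride(1) == 1` fails.

# The workaround from the patch: flatten, then reshape back to the same shape.
# This re-derives the canonical stride (1, 1); both ops return views here.
topk_weights = topk_weights.view(-1).reshape(topk_weights.shape)
print(topk_weights.stride())         # (1, 1)
assert topk_weights.stride(1) == 1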

0 commit comments
