
Commit 2976dc2

Authored by tjtanaa, hongxiayang, and kliuae
[Bug] [ROCm] Fix Llama 4 Enablement Bug on ROCm: V0 ROCmFlashAttentionImpl and Triton Fused MoE bugs (#16198)
Signed-off-by: tjtanaa <[email protected]>
Signed-off-by: kliuae <[email protected]>
Co-authored-by: Hongxia Yang <[email protected]>
Co-authored-by: kliuae <[email protected]>
1 parent 102bf96 commit 2976dc2

3 files changed: +15 −9 lines


vllm/attention/backends/rocm_flash_attn.py

Lines changed: 4 additions & 1 deletion
@@ -471,7 +471,10 @@ def __init__(
         if blocksparse_params is not None:
             raise ValueError(
                 "ROCmFlashAttention does not support blocksparse attention.")
-
+        if use_irope:
+            logger.warning(
+                "Using irope in V0 is not supported yet, it will fall back "
+                "to global attention for long context.")
         if logits_soft_cap is None:
             # In flash-attn, setting logits_soft_cap as 0 means no soft cap.
             self.logits_soft_cap = 0.0
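For orientation, a minimal standalone sketch (not vLLM code; the function name and logger setup are illustrative) of the behaviour this hunk adds: when irope is requested on the V0 ROCm backend, the layer only logs a warning and continues with global attention rather than raising.

import logging

logger = logging.getLogger(__name__)


def warn_if_irope_unsupported(use_irope: bool) -> None:
    # Illustrative stand-in for the guard added to
    # ROCmFlashAttentionImpl.__init__ above.
    if use_irope:
        logger.warning(
            "Using irope in V0 is not supported yet, it will fall back "
            "to global attention for long context.")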

vllm/model_executor/layers/fused_moe/fused_moe.py

Lines changed: 2 additions & 0 deletions
@@ -1002,6 +1002,7 @@ def inplace_fused_experts_fake(
     op_func=inplace_fused_experts,
     mutates_args=["hidden_states"],
     fake_impl=inplace_fused_experts_fake,
+    tags=(torch.Tag.needs_fixed_stride_order, ),
 )


@@ -1060,6 +1061,7 @@ def outplace_fused_experts_fake(
     op_func=outplace_fused_experts,
     mutates_args=[],
     fake_impl=outplace_fused_experts_fake,
+    tags=(torch.Tag.needs_fixed_stride_order, ),
 )


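The new tags argument matters under torch.compile: torch.Tag.needs_fixed_stride_order asks the compiler to pass the op its inputs in the same stride order as in eager mode, which the Triton fused-MoE path relies on. Below is a self-contained sketch of attaching the tag; the library name and op are made up for illustration, and it assumes a PyTorch build whose Library.define accepts a tags keyword, the same assumption the patched helper in vllm/utils.py below makes.

import torch
from torch.library import Library

# Made-up fragment library and op, purely to show a tag being attached
# at definition time.
_demo_lib = Library("tag_demo", "FRAGMENT")
_demo_lib.define("copy_like(Tensor x) -> Tensor",
                 tags=(torch.Tag.needs_fixed_stride_order, ))
_demo_lib.impl("copy_like", lambda x: x.clone(), dispatch_key="CPU")

# The tag is visible on the registered overload, where the compiler
# consults it before altering input layouts.
print(torch.ops.tag_demo.copy_like.default.tags)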
vllm/utils.py

Lines changed: 9 additions & 8 deletions
@@ -40,7 +40,7 @@
 from functools import cache, lru_cache, partial, wraps
 from types import MappingProxyType
 from typing import (TYPE_CHECKING, Any, Callable, Generic, Literal, NamedTuple,
-                    Optional, Type, TypeVar, Union, cast, overload)
+                    Optional, Tuple, Type, TypeVar, Union, cast, overload)
 from uuid import uuid4

 import cachetools
@@ -1935,12 +1935,13 @@ def __getattr__(self, key: str):


 def direct_register_custom_op(
-        op_name: str,
-        op_func: Callable,
-        mutates_args: list[str],
-        fake_impl: Optional[Callable] = None,
-        target_lib: Optional[Library] = None,
-        dispatch_key: str = "CUDA",
+    op_name: str,
+    op_func: Callable,
+    mutates_args: list[str],
+    fake_impl: Optional[Callable] = None,
+    target_lib: Optional[Library] = None,
+    dispatch_key: str = "CUDA",
+    tags: Tuple[torch.Tag, ...] = (),
 ):
     """
     `torch.library.custom_op` can have significant overhead because it
@@ -1979,7 +1980,7 @@ def direct_register_custom_op(
     import torch._custom_op.impl
     schema_str = torch._custom_op.impl.infer_schema(op_func, mutates_args)
     my_lib = target_lib or vllm_lib
-    my_lib.define(op_name + schema_str)
+    my_lib.define(op_name + schema_str, tags=tags)
     my_lib.impl(op_name, op_func, dispatch_key=dispatch_key)
     if fake_impl is not None:
         my_lib._register_fake(op_name, fake_impl)
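Putting the three files together, here is a caller-side sketch of how the extended helper is intended to be used, in the same shape as the fused_moe registrations above. The op name and both functions are hypothetical stand-ins, not vLLM kernels.

import torch

from vllm.utils import direct_register_custom_op


def weighted_sum(hidden_states: torch.Tensor,
                 weights: torch.Tensor) -> torch.Tensor:
    # Hypothetical "real" implementation (registered under the default
    # "CUDA" dispatch key by direct_register_custom_op).
    return (hidden_states * weights).sum(dim=-1)


def weighted_sum_fake(hidden_states: torch.Tensor,
                      weights: torch.Tensor) -> torch.Tensor:
    # Fake (meta) implementation used during tracing/compilation.
    return hidden_states.new_empty(hidden_states.shape[:-1])


direct_register_custom_op(
    op_name="weighted_sum_demo",  # hypothetical op name
    op_func=weighted_sum,
    mutates_args=[],
    fake_impl=weighted_sum_fake,
    tags=(torch.Tag.needs_fixed_stride_order, ),
)
# With the change above, the tuple reaches my_lib.define(), so the tag is
# attached to the op that torch.compile sees.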
