 from .moe_silu_and_mul import silu_and_mul_fwd
 from .moe_sum_reduce import moe_sum_reduce
 from lightllm.common.quantization.triton_quant.fp8.fp8act_quant_kernel import per_token_group_quant_fp8
+from lightllm.utils.torch_ops_utils import direct_register_custom_op
 
-FFN_MOE_CHUNK_SIZE = 8 * 1024
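+# Upper bound on token rows processed per chunk in fused_experts_impl; this
+# also bounds the size of the intermediate activation caches.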
+FFN_MOE_CHUNK_SIZE = 32 * 1024
 
 logger = init_logger(__name__)
 
@@ -355,7 +356,7 @@ def grouped_matmul_kernel(
     tile_n_idx = pid_n
 
     # get the gemm size of the current problem
-    cur_m = tl.load(expert_to_token_num + expert_id, eviction_policy="evict_last")
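+    # number of tokens routed to this expert, i.e. the row count of this GEMM problem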
+    cur_m = tl.load(expert_to_token_num + expert_id)
 
     # do regular gemm here
     offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
@@ -461,9 +462,8 @@ def grouped_matmul(
     out: torch.Tensor,
     mul_routed_weight: bool,
     use_fp8_w8a8: bool,
-    alloc_tensor_func=torch.empty,
     reused_mblock_infos=None,
-    **run_config,
+    run_config: Optional[dict] = None,
 ):
468468 """
469469 token_num_mul_topk_num is int equal token_num * topk_num,
@@ -493,7 +493,8 @@ def grouped_matmul(
     if expert_to_weights_scale.ndim == 3:
         block_size_n = expert_weights.shape[1] // expert_to_weights_scale.shape[1]
         block_size_k = expert_weights.shape[2] // expert_to_weights_scale.shape[2]
-    if not run_config:
+
+    if run_config is None:
         run_config = MoeGroupedGemmKernelConfig.try_to_get_best_config(
             M=token_inputs.shape[0],
             N=n,
525526 else :
526527 _m , _k = token_inputs .shape
527528 assert _k % block_size_k == 0
528- input_scale = alloc_tensor_func ((_m , _k // block_size_k ), dtype = torch .float32 , device = token_inputs .device )
529- qinput_tensor = alloc_tensor_func ((_m , _k ), dtype = expert_weights .dtype , device = token_inputs .device )
529+ input_scale = torch . empty ((_m , _k // block_size_k ), dtype = torch .float32 , device = token_inputs .device )
530+ qinput_tensor = torch . empty ((_m , _k ), dtype = expert_weights .dtype , device = token_inputs .device )
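+        # per-token-group FP8 quantization: one fp32 scale per block_size_k-wide group of each row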
         per_token_group_quant_fp8(token_inputs, block_size_k, qinput_tensor, input_scale)
         token_inputs, token_input_scale = qinput_tensor, input_scale
 
@@ -611,8 +612,7 @@ def fused_experts_impl(
     w2_scale: Optional[torch.Tensor] = None,
     a1_scale: Optional[torch.Tensor] = None,
     a2_scale: Optional[torch.Tensor] = None,
-    alloc_tensor_func=torch.empty,
-    **run_config,
+    run_config: Optional[dict] = None,
 ):
     # Check constraints.
     assert hidden_states.shape[1] == w1.shape[2], "Hidden size mismatch"
@@ -627,18 +627,16 @@ def fused_experts_impl(
     topk_num = topk_ids.shape[1]
     M = min(num_tokens, CHUNK_SIZE)
 
-    intermediate_cache1 = alloc_tensor_func((M, topk_num, N), device=hidden_states.device, dtype=hidden_states.dtype)
-    intermediate_cache2 = alloc_tensor_func(
-        (M, topk_num, N // 2), device=hidden_states.device, dtype=hidden_states.dtype
-    )
-    intermediate_cache3 = alloc_tensor_func(
-        (M, topk_num, w2.shape[1]), device=hidden_states.device, dtype=hidden_states.dtype
-    )
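+    # intermediate_cache1 and intermediate_cache3 are never live at the same
+    # time, so both view into a single buffer sized for the larger of the two.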
+    cache = torch.empty(M * topk_num * max(N, w2.shape[1]), device=hidden_states.device, dtype=hidden_states.dtype)
+    intermediate_cache1 = cache[: M * topk_num * N].view(M, topk_num, N)
+    intermediate_cache2 = torch.empty((M, topk_num, N // 2), device=hidden_states.device, dtype=hidden_states.dtype)
+    intermediate_cache3 = cache[: M * topk_num * w2.shape[1]].view(M, topk_num, w2.shape[1])
+
 
     if inplace:
         out_hidden_states = hidden_states
     else:
-        out_hidden_states = alloc_tensor_func(
+        out_hidden_states = torch.empty(
             hidden_states.shape, device=hidden_states.device, dtype=hidden_states.dtype
         )
 
@@ -647,9 +645,10 @@ def fused_experts_impl(
         curr_hidden_states = hidden_states[begin_chunk_idx:end_chunk_idx]
         tokens_in_chunk, _ = curr_hidden_states.shape
 
-        intermediate_cache1 = intermediate_cache1[:tokens_in_chunk]
-        intermediate_cache2 = intermediate_cache2[:tokens_in_chunk]
-        intermediate_cache3 = intermediate_cache3[:tokens_in_chunk]
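+        # the caches hold min(num_tokens, CHUNK_SIZE) rows, so only a trailing
+        # partial chunk needs re-slicing; a single short chunk is already exact.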
+        if tokens_in_chunk < CHUNK_SIZE and chunk > 0:
+            intermediate_cache1 = intermediate_cache1[:tokens_in_chunk]
+            intermediate_cache2 = intermediate_cache2[:tokens_in_chunk]
+            intermediate_cache3 = intermediate_cache3[:tokens_in_chunk]
 
         curr_topk_ids = topk_ids[begin_chunk_idx:end_chunk_idx]
         curr_topk_weights = topk_weights[begin_chunk_idx:end_chunk_idx]
@@ -673,8 +672,7 @@ def fused_experts_impl(
             out=intermediate_cache1.view(-1, N),
             mul_routed_weight=False,
             use_fp8_w8a8=use_fp8_w8a8,
-            alloc_tensor_func=alloc_tensor_func,
-            **run_config,
+            run_config=run_config,
         )
 
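+        # gated activation: cache1 (width N) is split into two halves which are
+        # combined as silu(one_half) * other_half into cache2 (width N // 2)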
         silu_and_mul_fwd(intermediate_cache1.view(-1, N), intermediate_cache2.view(-1, N // 2))
@@ -692,12 +690,156 @@ def fused_experts_impl(
             out=intermediate_cache3.view(-1, w2.shape[1]),
             mul_routed_weight=True,
             use_fp8_w8a8=use_fp8_w8a8,
-            alloc_tensor_func=alloc_tensor_func,
             reused_mblock_infos=reused_mblock_infos,
-            **run_config,
+            run_config=run_config,
         )
 
         moe_sum_reduce(
             intermediate_cache3.view(*intermediate_cache3.shape), out_hidden_states[begin_chunk_idx:end_chunk_idx]
         )
     return out_hidden_states
+
+
+def inplace_fused_experts_impl(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    use_fp8_w8a8: bool = False,
+    use_int8_w8a16: bool = False,
+    w1_scale: Optional[torch.Tensor] = None,
+    w2_scale: Optional[torch.Tensor] = None,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+) -> None:
+    fused_experts_impl(
+        hidden_states,
+        w1,
+        w2,
+        topk_weights,
+        topk_ids,
+        True,
+        use_fp8_w8a8,
+        use_int8_w8a16,
+        w1_scale,
+        w2_scale,
+        a1_scale,
+        a2_scale,
+    )
+
+
+def inplace_fused_experts_impl_fake(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    use_fp8_w8a8: bool = False,
+    use_int8_w8a16: bool = False,
+    w1_scale: Optional[torch.Tensor] = None,
+    w2_scale: Optional[torch.Tensor] = None,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+) -> None:
745+
+direct_register_custom_op(
+    "inplace_fused_experts_impl",
+    inplace_fused_experts_impl,
+    ["hidden_states"],
+    inplace_fused_experts_impl_fake,
+)
+
+
+def outplace_fused_experts_impl(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    use_fp8_w8a8: bool = False,
+    use_int8_w8a16: bool = False,
+    w1_scale: Optional[torch.Tensor] = None,
+    w2_scale: Optional[torch.Tensor] = None,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    return fused_experts_impl(
+        hidden_states,
+        w1,
+        w2,
+        topk_weights,
+        topk_ids,
+        False,
+        use_fp8_w8a8,
+        use_int8_w8a16,
+        w1_scale,
+        w2_scale,
+        a1_scale,
+        a2_scale,
+    )
+
+
+def outplace_fused_experts_impl_fake(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    use_fp8_w8a8: bool = False,
+    use_int8_w8a16: bool = False,
+    w1_scale: Optional[torch.Tensor] = None,
+    w2_scale: Optional[torch.Tensor] = None,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    return torch.empty_like(hidden_states)
+
+
+direct_register_custom_op(
+    "outplace_fused_experts_impl",
+    outplace_fused_experts_impl,
+    [],
+    outplace_fused_experts_impl_fake,
+)
+
+
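+# Public entry point: dispatch to the registered custom ops so graph capture
+# sees a single opaque call and the in-place mutation stays visible.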
+def fused_experts(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    inplace: bool = False,
+    use_fp8_w8a8: bool = False,
+    use_int8_w8a16: bool = False,
+    w1_scale: Optional[torch.Tensor] = None,
+    w2_scale: Optional[torch.Tensor] = None,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    if inplace:
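+        # the registered op mutates hidden_states in place; return it for chaining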
+        torch.ops.lightllm.inplace_fused_experts_impl(
+            hidden_states,
+            w1,
+            w2,
+            topk_weights,
+            topk_ids,
+            use_fp8_w8a8,
+            use_int8_w8a16,
+            w1_scale,
+            w2_scale,
+            a1_scale,
+            a2_scale,
+        )
+        return hidden_states
+    else:
+        return torch.ops.lightllm.outplace_fused_experts_impl(
+            hidden_states,
+            w1,
+            w2,
+            topk_weights,
+            topk_ids,
+            use_fp8_w8a8,
+            use_int8_w8a16,
+            w1_scale,
+            w2_scale,
+            a1_scale,
+            a2_scale,
+        )