Commit 552900a

Author: sangchengmeng
Commit message: Merge remote-tracking branch 'origin/main' into grouped_topk_cuda
2 parents 0a900c8 + c181e7a, commit 552900a

File tree: 6 files changed, +412 -35 lines changed
lightllm/common/fused_moe/grouped_topk.py (new file)

Lines changed: 231 additions & 0 deletions
@@ -0,0 +1,231 @@
# adapted from https://github.com/triton-lang/triton/issues/3698#issuecomment-2067681396
import torch
import triton
import triton.language as tl
from triton.language.standard import _log2, sum, zeros_like


@triton.jit
def _compare_and_swap(x, ids, flip, i: tl.core.constexpr, n_dims: tl.core.constexpr):
    n_outer: tl.core.constexpr = x.numel >> n_dims
    shape: tl.core.constexpr = [n_outer * 2 ** i, 2, 2 ** (n_dims - i - 1)]
    y = tl.core.reshape(x, shape)
    # slice left/right with 'stride' 2**(n_dims - i - 1)
    mask = tl.core.arange(0, 2)[None, :, None]
    left = tl.core.broadcast_to(sum(y * (1 - mask), 1)[:, None, :], shape)
    right = tl.core.broadcast_to(sum(y * mask, 1)[:, None, :], shape)
    left = tl.core.reshape(left, x.shape)
    right = tl.core.reshape(right, x.shape)

    # the indices travel together with the values
    y_idx = tl.core.reshape(ids, shape)
    left_idx = tl.core.broadcast_to(sum(y_idx * (1 - mask), 1)[:, None, :], shape)
    right_idx = tl.core.broadcast_to(sum(y_idx * mask, 1)[:, None, :], shape)
    left_idx = tl.core.reshape(left_idx, x.shape)
    right_idx = tl.core.reshape(right_idx, x.shape)

    # actual compare-and-swap, done branchlessly on the integer bit patterns
    idtype = tl.core.get_int_dtype(bitwidth=x.dtype.primitive_bitwidth, signed=True)
    ileft = left.to(idtype, bitcast=True)
    iright = right.to(idtype, bitcast=True)
    ix = x.to(idtype, bitcast=True)

    cond = (left > right) ^ flip

    ret = ix ^ tl.core.where(cond, ileft ^ iright, zeros_like(ix))

    new_ids = ids ^ tl.core.where(cond, left_idx ^ right_idx, zeros_like(ids))

    return ret.to(x.dtype, bitcast=True), new_ids
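
# How the XOR trick above swaps a pair without branching: for a pair (l, r)
# reinterpreted as integers, v ^ (bits(l) ^ bits(r)) maps l to r and r to l,
# so ix ^ where(cond, bits(l) ^ bits(r), 0) swaps the two lanes exactly when
# cond is set. Small numeric example for one pair with flip = 0:
#   left = 5.0, right = 1.0  ->  cond = (5.0 > 1.0) ^ 0 = True
#   left lane:  bits(5.0) ^ (bits(5.0) ^ bits(1.0)) = bits(1.0)  ->  1.0
#   right lane: bits(1.0) ^ (bits(5.0) ^ bits(1.0)) = bits(5.0)  ->  5.0
# The same XOR is applied to `ids`, so indices move together with their values.
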
@triton.jit
def _bitonic_merge(x, ids, stage: tl.core.constexpr, order: tl.core.constexpr, n_dims: tl.core.constexpr):
    """
    order 0 == ascending
    order 1 == descending
    order 2 == alternating
    """
    n_outer: tl.core.constexpr = x.numel >> n_dims
    tl.core.static_assert(stage <= n_dims)
    # flip denotes whether to re-arrange sub-sequences of elements in ascending or
    # descending order.
    # if flip = 00000000... then all elements are re-arranged in ascending order at this stage
    # if flip = 00110011... then the elements are re-arranged alternately (with
    # a stride of 2) at this stage
    if order == 2:
        shape: tl.core.constexpr = [n_outer * 2 ** (n_dims - 1 - stage), 2, 2 ** stage]
        flip = tl.core.reshape(tl.core.broadcast_to(tl.core.arange(0, 2)[None, :, None], shape), x.shape)
    else:
        flip = order
    # perform `stage` rounds of `compare-and-swap`
    for i in tl.core.static_range(stage):
        x, ids = _compare_and_swap(x, ids, flip, i + (n_dims - stage), n_dims)
    return x, ids


@triton.jit
def argsort(x, ids, dim: tl.core.constexpr = None, descending: tl.core.constexpr = tl.core.CONSTEXPR_0):
    # handle the default dimension, and check that it is the innermost (most minor) dim
    _dim: tl.core.constexpr = len(x.shape) - 1 if dim is None else dim
    tl.core.static_assert(_dim == len(x.shape) - 1, "only minor dimension is currently supported")
    # iteratively run bitonic merge-sort steps
    n_dims: tl.core.constexpr = _log2(x.shape[_dim])

    for i in tl.core.static_range(1, n_dims + 1):
        x, ids = _bitonic_merge(x, ids, i, 2 if i < n_dims else descending, n_dims)
    return x, ids

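
# Example trace for one length-4 row with descending=True:
#   x = [3, 1, 4, 2], ids = [0, 1, 2, 3], n_dims = 2
#   stage 1 (order=2, alternating): adjacent pairs are sorted in opposite
#     directions -> x = [1, 3, 4, 2], a bitonic sequence
#   stage 2 (order=descending): compare-and-swap with stride 2, then stride 1
#     stride 2 -> x = [4, 3, 1, 2]; stride 1 -> x = [4, 3, 2, 1], ids = [2, 0, 3, 1]
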
@triton.jit
def grouped_topk_kernel(
    gating_output_ptr,
    gating_output_stride_m,
    gating_output_stride_n,
    correction_bias_ptr,
    scores_buffer_ptr,  # [token_num, total_expert_num]
    scores_stride_m,
    scores_stride_n,
    scores_stride_token_m,
    scores_stride_group,
    scores_stride_group_v,
    out_topk_weights,
    out_topk_weights_stride_m,
    out_topk_weights_stride_n,
    out_topk_ids,
    out_topk_ids_stride_m,
    out_topk_ids_stride_n,
    group_num,
    group_expert_num,
    total_expert_num,  # group_num * group_expert_num == total_expert_num
    topk_num,
    group_topk_num,
    IS_SIGMOID: tl.constexpr,
    HAS_CORRECTION_BIAS: tl.constexpr,
    EXPERT_BLOCK_SIZE: tl.constexpr,  # triton.next_power_of_2(total_expert_num)
    EXPERT_GROUP_NUM: tl.constexpr,  # triton.next_power_of_2(group_num)
    EXPERT_GROUP_SIZE: tl.constexpr,  # triton.next_power_of_2(group_expert_num)
    RENORMALIZE: tl.constexpr,
):
    # one program handles the routing for one token
    token_index = tl.program_id(axis=0)
    offs_n = tl.arange(0, EXPERT_BLOCK_SIZE)
    hidden_states = tl.load(
        gating_output_ptr + token_index * gating_output_stride_m + offs_n,
        mask=offs_n < total_expert_num,
        other=-10000000.0,
    )
    # 1) score every expert
    if IS_SIGMOID:
        scores = tl.sigmoid(hidden_states)
    else:
        scores = tl.softmax(hidden_states)

    if HAS_CORRECTION_BIAS:
        scores += tl.load(correction_bias_ptr + offs_n, mask=offs_n < total_expert_num, other=-10000000.0)

    # 2) stage the scores and reload them as [group_num, group_expert_num]
    offs_group = tl.arange(0, EXPERT_GROUP_NUM)
    offs_group_v = tl.arange(0, EXPERT_GROUP_SIZE)
    tl.store(scores_buffer_ptr + scores_stride_m * token_index + offs_n, scores, mask=offs_n < total_expert_num)
    group_scores = tl.load(
        scores_buffer_ptr
        + scores_stride_token_m * token_index
        + offs_group[:, None] * scores_stride_group
        + offs_group_v[None, :] * scores_stride_group_v,
        mask=(offs_group < group_num)[:, None] & (offs_group_v < group_expert_num)[None, :],
        other=-10000000.0,
    )  # [group, group_size]

    # 3) keep the group_topk_num best groups, ranked by each group's max score
    group_value = tl.max(group_scores, axis=1)  # [group,]
    sorted_group_value = tl.sort(group_value, descending=True)
    group_topk_value = tl.sum(tl.where(offs_group == group_topk_num - 1, sorted_group_value, 0.0))
    mask_group_scores = tl.where(
        ((group_value >= group_topk_value)[:, None]) & ((offs_group_v < group_expert_num)[None, :]),
        group_scores,
        -10000000.0,
    )

    tl.store(
        scores_buffer_ptr
        + scores_stride_token_m * token_index
        + offs_group[:, None] * scores_stride_group
        + offs_group_v[None, :] * scores_stride_group_v,
        mask_group_scores,
        mask=((offs_group < group_num)[:, None]) & ((offs_group_v < group_expert_num)[None, :]),
    )  # [group, group_size]

    # 4) sort the masked scores and emit the global top-k experts
    mask_scores = tl.load(
        scores_buffer_ptr + scores_stride_m * token_index + offs_n, mask=offs_n < total_expert_num, other=-10000000.0
    )
    sorted_scores, sorted_indexes = argsort(mask_scores, offs_n, descending=True)

    if RENORMALIZE:
        sum_scores = tl.sum(tl.where(offs_n < topk_num, sorted_scores, 0.0))
        renormalized_scores = sorted_scores / sum_scores

        tl.store(
            out_topk_weights + token_index * out_topk_weights_stride_m + offs_n,
            renormalized_scores,
            mask=offs_n < topk_num,
        )
        tl.store(out_topk_ids + token_index * out_topk_ids_stride_m + offs_n, sorted_indexes, mask=offs_n < topk_num)
    else:
        tl.store(
            out_topk_weights + token_index * out_topk_weights_stride_m + offs_n, sorted_scores, mask=offs_n < topk_num
        )
        tl.store(out_topk_ids + token_index * out_topk_ids_stride_m + offs_n, sorted_indexes, mask=offs_n < topk_num)
    return


def triton_grouped_topk(
    hidden_states: torch.Tensor,
    gating_output: torch.Tensor,
    correction_bias: torch.Tensor,
    topk: int,
    renormalize: bool,
    num_expert_group: int = 0,
    topk_group: int = 0,
    scoring_func: str = "softmax",
):
    if correction_bias is not None:
        has_correction_bias = True
    else:
        has_correction_bias = False

    token_num, total_expert_num = gating_output.shape
    if gating_output.dtype == torch.float64:
        dtype = torch.float64
    else:
        dtype = torch.float32

    scores_buffer = torch.empty((token_num, total_expert_num), dtype=dtype, device="cuda")
    out_topk_weights = torch.empty((token_num, topk), dtype=torch.float32, device="cuda")
    out_topk_ids = torch.empty((token_num, topk), dtype=torch.int32, device="cuda")

    assert total_expert_num % num_expert_group == 0

    # one program per token; block sizes are padded to powers of two for tl.arange
    grouped_topk_kernel[(token_num,)](
        gating_output,
        *gating_output.stride(),
        correction_bias,
        scores_buffer,
        *scores_buffer.stride(),
        *scores_buffer.view(token_num, num_expert_group, -1).stride(),
        out_topk_weights,
        *out_topk_weights.stride(),
        out_topk_ids,
        *out_topk_ids.stride(),
        group_num=num_expert_group,
        group_expert_num=total_expert_num // num_expert_group,
        total_expert_num=total_expert_num,
        topk_num=topk,
        group_topk_num=topk_group,
        IS_SIGMOID=scoring_func == "sigmoid",
        HAS_CORRECTION_BIAS=has_correction_bias,
        EXPERT_BLOCK_SIZE=triton.next_power_of_2(total_expert_num),
        EXPERT_GROUP_NUM=triton.next_power_of_2(num_expert_group),
        EXPERT_GROUP_SIZE=triton.next_power_of_2(total_expert_num // num_expert_group),
        RENORMALIZE=renormalize,
        num_warps=1,
        num_stages=1,
    )
    return out_topk_weights, out_topk_ids
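
A minimal usage sketch for the wrapper above, with illustrative sizes only; the import path follows the import added to topk_select.py in this commit, and only gating_output, correction_bias and the integer arguments are actually consumed by the wrapper:

    import torch
    from lightllm.common.fused_moe.grouped_topk import triton_grouped_topk

    token_num, num_expert_group, experts_per_group = 4, 8, 32
    total_expert_num = num_expert_group * experts_per_group  # 256
    hidden_states = torch.randn(token_num, 16, device="cuda", dtype=torch.float16)  # unused by this wrapper
    gating_output = torch.randn(token_num, total_expert_num, device="cuda", dtype=torch.float32)
    correction_bias = torch.zeros(total_expert_num, device="cuda", dtype=torch.float32)

    topk_weights, topk_ids = triton_grouped_topk(
        hidden_states=hidden_states,
        gating_output=gating_output,
        correction_bias=correction_bias,
        topk=8,
        renormalize=True,
        num_expert_group=num_expert_group,
        topk_group=4,
        scoring_func="sigmoid",
    )
    # topk_weights: [4, 8] float32, topk_ids: [4, 8] int32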

lightllm/common/fused_moe/topk_select.py

Lines changed: 24 additions & 22 deletions
@@ -24,6 +24,7 @@
 
 use_cuda_grouped_topk = os.environ.get("GROUPED_TOPK_CUDA", "false").lower()
 
+
 def fused_topk(
     hidden_states: torch.Tensor,
     gating_output: torch.Tensor,
@@ -63,7 +64,7 @@ def grouped_topk(
     topk_group: int = 0,
     scoring_func: str = "softmax",
 ):
-
+
     assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
     if scoring_func == "sigmoid":
         scores = torch.sigmoid(gating_output)
@@ -91,8 +92,9 @@ def grouped_topk(
 
     return topk_weights.to(torch.float32), topk_ids.to(torch.int32)
 
+
 # This is used by the Deepseek-V2 model
-def grouped_topk_cuda(
+def cuda_grouped_topk(
     hidden_states: torch.Tensor,
     gating_output: torch.Tensor,
     correction_bias: torch.Tensor,
@@ -105,27 +107,26 @@ def grouped_topk_cuda(
 
     assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
     num_tokens = gating_output.shape[0]
-    num_experts = gating_output.shape[-1]
     topk_weights = torch.empty(num_tokens, topk, device=hidden_states.device, dtype=torch.float32)
     topk_indices = torch.empty(num_tokens, topk, device=hidden_states.device, dtype=torch.int32)
     token_expert_indices = torch.empty(num_tokens, topk_group, device=hidden_states.device, dtype=torch.int32)
-    group_scores = torch.empty(num_tokens, num_expert_group, device=hidden_states.device, dtype=torch.float32)
-    if correction_bias is None:
-        correction_bias = torch.zeros_like(gating_output,dtype=torch.float32)
+    group_scores = torch.empty(num_tokens, num_expert_group, device=hidden_states.device, dtype=torch.float32)
+    if correction_bias is None:
+        correction_bias = torch.zeros_like(gating_output, dtype=torch.float32)
     ops.grouped_topk(
-        topk_weights,
-        correction_bias,
-        topk_indices,
-        token_expert_indices,
-        gating_output.float(),
-        num_expert_group,
-        topk_group,
-        topk,
-        renormalize,
-        scoring_func,
-        group_scores
+        topk_weights,
+        correction_bias,
+        topk_indices,
+        token_expert_indices,
+        gating_output.float(),
+        num_expert_group,
+        topk_group,
+        topk,
+        renormalize,
+        scoring_func,
+        group_scores,
     )
-
+
     return topk_weights, topk_indices
 
 
@@ -141,14 +142,15 @@ def select_experts(
     scoring_func: str = "softmax",
     custom_routing_function: Optional[Callable] = None,
 ):
-    from lightllm.common.fused_moe.topk_select import fused_topk, grouped_topk
+    from lightllm.common.fused_moe.topk_select import fused_topk
+    from lightllm.common.fused_moe.grouped_topk import triton_grouped_topk
+
     # DeekSeekv2 uses grouped_top_k
     if use_grouped_topk:
         assert topk_group is not None
         assert num_expert_group is not None
         if use_cuda_grouped_topk == "true":
-            from lightllm.common.vllm_kernel import _custom_ops as ops
-            topk_weights, topk_ids = grouped_topk_cuda(
+            topk_weights, topk_ids = cuda_grouped_topk(
                 hidden_states=hidden_states,
                 gating_output=router_logits,
                 correction_bias=correction_bias,
@@ -159,7 +161,7 @@ def select_experts(
                 scoring_func=scoring_func,
             )
         else:
-            topk_weights, topk_ids = grouped_topk(
+            topk_weights, topk_ids = triton_grouped_topk(
                 hidden_states=hidden_states,
                 gating_output=router_logits,
                 correction_bias=correction_bias,
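
Because the `use_cuda_grouped_topk` flag is read once at import time from the GROUPED_TOPK_CUDA environment variable, the variable has to be set before topk_select is imported. A minimal sketch of toggling the CUDA path:

    import os

    # must be set before lightllm.common.fused_moe.topk_select is imported
    os.environ["GROUPED_TOPK_CUDA"] = "true"  # any other value falls back to the Triton kernel

    from lightllm.common.fused_moe import topk_select
    # a select_experts call with grouped top-k enabled now routes through cuda_grouped_topk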

lightllm/common/vllm_kernel/_ops.py

Lines changed: 14 additions & 3 deletions
@@ -760,6 +760,7 @@ def topk_softmax(
 ) -> None:
     torch.ops.vllm_moe.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output)
 
+
 def grouped_topk(
     topk_weights: torch.Tensor,
     correction_bias: torch.Tensor,
@@ -771,13 +772,23 @@ def grouped_topk(
     topk: int,
     renormalize: bool,
     scoring_func: str,
-    group_scores: torch.Tensor = None
+    group_scores: torch.Tensor = None,
 ) -> None:
     torch.ops.vllm_moe.grouped_topk(
-        topk_weights, correction_bias, topk_indices, group_indices, gating_output, num_expert_group,
-        topk_group, topk, renormalize, scoring_func, group_scores
+        topk_weights,
+        correction_bias,
+        topk_indices,
+        group_indices,
+        gating_output,
+        num_expert_group,
+        topk_group,
+        topk,
+        renormalize,
+        scoring_func,
+        group_scores,
     )
 
+
 def reshape_and_cache(
     key: torch.Tensor,
     value: torch.Tensor,

lightllm/models/deepseek2/layer_infer/transformer_layer_infer.py

Lines changed: 3 additions & 5 deletions
@@ -154,7 +154,7 @@ def _decompress_kv(self, kv, infer_state: Deepseek2InferStateInfo, layer_weight:
         )
 
         # CC
-        compressed_kv = compressed_kv.view(-1, layer_weight.kv_lora_rank)
+        compressed_kv = compressed_kv.view(-1, layer_weight.kv_lora_rank).contiguous()
         k_nope = self.alloc_tensor(
             [compressed_kv.shape[0], self.tp_q_head_num_, self.qk_nope_head_dim],
             dtype=compressed_kv.dtype,
@@ -163,10 +163,8 @@ def _decompress_kv(self, kv, infer_state: Deepseek2InferStateInfo, layer_weight:
             k_nope.shape,
             dtype=compressed_kv.dtype,
         )
-        wk = layer_weight.k_b_proj_.weight.view(-1, layer_weight.kv_lora_rank).T
-        wv = layer_weight.v_b_proj_.weight.transpose(0, 1).reshape(layer_weight.kv_lora_rank, -1)
-        torch.mm(compressed_kv, wk, out=k_nope.reshape(compressed_kv.shape[0], -1))
-        torch.mm(compressed_kv, wv, out=v.reshape(compressed_kv.shape[0], -1))
+        layer_weight.cc_k_b_proj_.mm(compressed_kv, out=k_nope.reshape(compressed_kv.shape[0], -1))
+        layer_weight.cc_v_b_proj_.mm(compressed_kv, out=v.reshape(compressed_kv.shape[0], -1))
         return k_nope, k_rope, v
 
     def _context_attention_kernel_with_CC(
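
The change above assumes layer_weight.cc_k_b_proj_.mm / cc_v_b_proj_.mm reproduce the two removed torch.mm calls with the weight reshaping folded into the projection objects. A rough sketch of the computation the removed lines performed, with illustrative dimensions only and a stand-in weight tensor:

    import torch

    num_tokens, kv_lora_rank, heads, head_dim = 3, 512, 16, 128
    compressed_kv = torch.randn(num_tokens, kv_lora_rank)
    k_b_weight = torch.randn(heads * head_dim, kv_lora_rank)  # stand-in for k_b_proj_.weight
    k_nope = torch.empty(num_tokens, heads, head_dim)

    wk = k_b_weight.view(-1, kv_lora_rank).T  # [kv_lora_rank, heads * head_dim]
    torch.mm(compressed_kv, wk, out=k_nope.reshape(num_tokens, -1))
    # the new code replaces this with layer_weight.cc_k_b_proj_.mm(compressed_kv, out=...)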
