Commit f7bf076: merge main
2 parents: a0e1742 + 2d4f7d4
31 files changed: +406 -167 lines


lightllm/common/basemodel/basemodel.py

Lines changed: 1 addition & 1 deletion
@@ -175,7 +175,7 @@ def _init_kv_move_buffer(self):
 
     def _check_mem_size(self):
        self.max_total_token_num = self.mem_manager.size
-        assert self.max_seq_length < self.max_total_token_num
+        assert self.max_seq_length <= self.max_total_token_num
        return
 
    def _init_req_manager(self):
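
The relaxed check admits the boundary case where the longest supported sequence exactly fills the KV pool. A toy illustration with made-up sizes, not taken from any real config:

    max_total_token_num = 8192   # mem_manager.size, total KV-cache token slots
    max_seq_length = 8192        # longest sequence the server must hold at once
    assert max_seq_length <= max_total_token_num   # passes now; the old strict `<` rejected equality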

lightllm/common/basemodel/layer_weights/meta_weights/fused_moe_weight_tp.py

Lines changed: 2 additions & 2 deletions
@@ -3,7 +3,7 @@
 import threading
 from typing import Optional, Tuple, List, Dict, Any
 from .base_weight import BaseWeight
-from lightllm.utils.dist_utils import get_global_rank, get_current_device_id
+from lightllm.utils.dist_utils import get_current_rank_in_dp, get_current_device_id
 from lightllm.common.quantization import Quantcfg
 
 
@@ -37,7 +37,7 @@ def __init__(
         self.n_routed_experts = n_routed_experts
         self.split_inter_size = split_inter_size
         self.data_type_ = data_type
-        self.tp_rank_ = get_global_rank()
+        self.tp_rank_ = get_current_rank_in_dp()
         self.experts_up_projs = [None] * self.n_routed_experts
         self.experts_gate_projs = [None] * self.n_routed_experts
         self.experts_up_proj_scales = [None] * self.n_routed_experts
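
With data parallelism, a worker's global rank no longer equals its tensor-parallel shard index, so the weight class now reads its rank from inside the current DP group. A minimal sketch of how that rank would select an expert shard along the intermediate dimension (the tensor and function names below are illustrative assumptions, not lightllm's loader code):

    import torch
    from lightllm.utils.dist_utils import get_current_rank_in_dp

    def slice_expert_up_proj(full_up_proj: torch.Tensor, split_inter_size: int) -> torch.Tensor:
        # Pick this worker's slice of the intermediate dimension. Using get_global_rank()
        # here would compute the wrong offset whenever the DP size is greater than 1.
        tp_rank = get_current_rank_in_dp()
        return full_up_proj[split_inter_size * tp_rank : split_inter_size * (tp_rank + 1), :]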

lightllm/common/basemodel/triton_kernel/destindex_copy_kv.py

Lines changed: 54 additions & 22 deletions
@@ -6,19 +6,24 @@
 
 @triton.jit
 def _fwd_kernel_destindex_copy_kv(
-    K, Dest_loc,
+    K,
+    Dest_loc,
     Out,
-    stride_k_bs, stride_k_h, stride_k_d,
-    stride_o_bs, stride_o_h, stride_o_d,
+    stride_k_bs,
+    stride_k_h,
+    stride_k_d,
+    stride_o_bs,
+    stride_o_h,
+    stride_o_d,
     head_num,
     BLOCK_DMODEL: tl.constexpr,
-    BLOCK_HEAD: tl.constexpr
+    BLOCK_HEAD: tl.constexpr,
 ):
     cur_index = tl.program_id(0)
     offs_h = tl.arange(0, BLOCK_HEAD)
     offs_d = tl.arange(0, BLOCK_DMODEL)
 
-    dest_index = tl.load(Dest_loc + cur_index)
+    dest_index = tl.load(Dest_loc + cur_index).to(tl.int64)
 
     k_ptrs = K + cur_index * stride_k_bs + stride_k_h * offs_h[:, None] + stride_k_d * offs_d[None, :]
     o_ptrs = Out + dest_index * stride_o_bs + stride_o_h * offs_h[:, None] + stride_o_d * offs_d[None, :]
@@ -39,9 +44,15 @@ def destindex_copy_kv(K, DestLoc, Out):
     num_warps = 1
 
     _fwd_kernel_destindex_copy_kv[grid](
-        K, DestLoc, Out,
-        K.stride(0), K.stride(1), K.stride(2),
-        Out.stride(0), Out.stride(1), Out.stride(2),
+        K,
+        DestLoc,
+        Out,
+        K.stride(0),
+        K.stride(1),
+        K.stride(2),
+        Out.stride(0),
+        Out.stride(1),
+        Out.stride(2),
         head_num,
         BLOCK_DMODEL=head_dim,
         BLOCK_HEAD=BLOCK_HEAD,
@@ -53,23 +64,35 @@ def destindex_copy_kv(K, DestLoc, Out):
 
 @triton.jit
 def _fwd_kernel_destindex_copy_quantize_kv(
-    K, Dest_loc, Out, Out_scale,
-    stride_k_bs, stride_k_h, stride_k_d,
-    stride_o_bs, stride_o_h, stride_o_d,
-    stride_os_bs, stride_os_h, stride_os_d,
+    K,
+    Dest_loc,
+    Out,
+    Out_scale,
+    stride_k_bs,
+    stride_k_h,
+    stride_k_d,
+    stride_o_bs,
+    stride_o_h,
+    stride_o_d,
+    stride_os_bs,
+    stride_os_h,
+    stride_os_d,
     head_num,
     BLOCK_DMODEL: tl.constexpr,
-    BLOCK_HEAD: tl.constexpr
+    BLOCK_HEAD: tl.constexpr,
 ):
     cur_index = tl.program_id(0)
     offs_h = tl.arange(0, BLOCK_HEAD)
     offs_d = tl.arange(0, BLOCK_DMODEL)
 
-    dest_index = tl.load(Dest_loc + cur_index)
-    src_data = tl.load(K + cur_index * stride_k_bs + offs_h[:, None] * stride_k_h + stride_k_d * offs_d[None, :],
-                       mask=offs_h[:, None] < head_num, other=0.0)
+    dest_index = tl.load(Dest_loc + cur_index).to(tl.int64)
+    src_data = tl.load(
+        K + cur_index * stride_k_bs + offs_h[:, None] * stride_k_h + stride_k_d * offs_d[None, :],
+        mask=offs_h[:, None] < head_num,
+        other=0.0,
+    )
     abs_data = tl.abs(src_data)
-    data_scale = (tl.max(abs_data, axis=1) / 127.).to(Out_scale.dtype.element_ty)[:, None]
+    data_scale = (tl.max(abs_data, axis=1) / 127.0).to(Out_scale.dtype.element_ty)[:, None]
     q_src_data = (src_data / data_scale).to(tl.int8)
     o_ptrs = Out + dest_index * stride_o_bs + stride_o_h * offs_h[:, None] + stride_o_d * offs_d[None, :]
     os_ptrs = Out_scale + dest_index * stride_os_bs + stride_os_h * offs_h[:, None]
@@ -88,10 +111,19 @@ def destindex_copy_quantize_kv(K, DestLoc, Out, Out_scale):
     num_warps = 1
 
     _fwd_kernel_destindex_copy_quantize_kv[grid](
-        K, DestLoc, Out, Out_scale,
-        K.stride(0), K.stride(1), K.stride(2),
-        Out.stride(0), Out.stride(1), Out.stride(2),
-        Out_scale.stride(0), Out_scale.stride(1), Out_scale.stride(2),
+        K,
+        DestLoc,
+        Out,
+        Out_scale,
+        K.stride(0),
+        K.stride(1),
+        K.stride(2),
+        Out.stride(0),
+        Out.stride(1),
+        Out.stride(2),
+        Out_scale.stride(0),
+        Out_scale.stride(1),
+        Out_scale.stride(2),
         head_num,
         BLOCK_DMODEL=head_dim,
         BLOCK_HEAD=BLOCK_HEAD,
@@ -149,6 +181,6 @@ def test2():
     print("cos ", cos(src.flatten().to(torch.float32), (value_dest * scale_dest).flatten().to(torch.float32)))
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     test1()
     test2()
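
Beyond the formatting-only changes, the functional change in this file is the `.to(tl.int64)` cast on the loaded destination index: for a large KV buffer, `dest_index * stride_o_bs` can exceed the int32 range, so the index is widened before the pointer arithmetic. A minimal usage sketch, assuming a CUDA device and the [token_num, head_num, head_dim] layout implied by the strides:

    import torch
    from lightllm.common.basemodel.triton_kernel.destindex_copy_kv import destindex_copy_kv

    token_num, head_num, head_dim = 4, 8, 128
    K = torch.randn(token_num, head_num, head_dim, dtype=torch.float16, device="cuda")
    DestLoc = torch.tensor([3, 7, 9, 15], dtype=torch.int32, device="cuda")  # target slots in the KV buffer
    Out = torch.zeros(16, head_num, head_dim, dtype=torch.float16, device="cuda")

    destindex_copy_kv(K, DestLoc, Out)          # row i of K is copied into Out[DestLoc[i]]
    assert torch.equal(Out[DestLoc.long()], K)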

lightllm/distributed/communication_op.py

Lines changed: 6 additions & 9 deletions
@@ -27,12 +27,11 @@
 from lightllm.utils.device_utils import has_nvlink
 from lightllm.utils.envs_utils import get_env_start_args, get_deepep_num_max_dispatch_tokens_per_rank
 from lightllm.utils.dist_utils import (
-    get_current_device_id,
-    get_node_world_size,
     get_global_world_size,
     get_dp_world_size,
     get_global_rank,
     get_current_rank_in_dp,
+    create_new_group_for_current_dp,
 )
 from lightllm.utils.device_utils import get_device_sm_count
 from lightllm.utils.sgl_utils import HAS_SGL_KERNEL
@@ -63,17 +62,15 @@ def __init__(self):
         self.custom_reduce = None
         self.custom_gather = None
         self.dp_world_size = get_dp_world_size()
-        ranks = list([get_global_rank() - get_current_rank_in_dp() + i for i in range(self.dp_world_size)])
-        self.device_group = dist.new_group(ranks, backend="nccl")
+        self.device_group = create_new_group_for_current_dp("nccl")
 
     def init_custom_reduce(self) -> None:
         if not HAS_SGL_KERNEL or not has_nvlink() or self.dp_world_size not in [2, 4, 6, 8]:
             return
         args = get_env_start_args()
         if args.disable_custom_allreduce:
             return
-        ranks = list([get_global_rank() - get_current_rank_in_dp() + i for i in range(self.dp_world_size)])
-        cpu_group = dist.new_group(ranks, backend="gloo")
+        cpu_group = create_new_group_for_current_dp("gloo")
         self.custom_reduce = CustomAllreduce(cpu_group, torch.cuda.current_device())
         logger.info("Enable Custom ALLReduce. You can disable it by settting --disable_custom_allreduce.")
 
@@ -82,10 +79,10 @@ def init_custom_gather(self) -> None:
             return
 
         args = get_env_start_args()
-        if args.disable_custom_allgather:
+        if not args.enable_custom_allgather:
             return
-        ranks = list([get_global_rank() - get_current_rank_in_dp() + i for i in range(self.dp_world_size)])
-        cpu_group = dist.new_group(ranks, backend="gloo")
+
+        cpu_group = create_new_group_for_current_dp("gloo")
         self.custom_gather = CustomAllgather(cpu_group, torch.cuda.current_device())
         logger.info("Enable Custom ALLGather. You can disable it by settting --disable_custom_allgather")
 
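
The inline rank-list construction removed in three places suggests what the new create_new_group_for_current_dp helper does. A sketch of roughly equivalent logic, assuming it lives in lightllm.utils.dist_utils next to the other rank helpers; the real implementation may cache the group or differ in details:

    import torch.distributed as dist
    from lightllm.utils.dist_utils import get_global_rank, get_current_rank_in_dp, get_dp_world_size

    def create_new_group_for_current_dp(backend: str):
        # Rebuild the contiguous global-rank list covering this data-parallel replica,
        # as the removed inline code did, then create a process group on that backend.
        first_rank = get_global_rank() - get_current_rank_in_dp()
        ranks = [first_rank + i for i in range(get_dp_world_size())]
        return dist.new_group(ranks, backend=backend)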

lightllm/distributed/custom_all_gather.py

Lines changed: 5 additions & 2 deletions
@@ -31,8 +31,11 @@
 from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
 
 
-if light_ops is not None:
-    light_ops.meta_size()
+try:
+    if light_ops is not None:
+        light_ops.meta_size()
+except:
+    pass
 
 logger = init_logger(__name__)
 

lightllm/models/deepseek2/triton_kernel/context_flashattention_nopad.py

Lines changed: 6 additions & 8 deletions
@@ -4,9 +4,7 @@
 import triton.language as tl
 import math
 import torch.nn.functional as F
-
-TESLA = "Tesla" in torch.cuda.get_device_name(0)
-CUDA_CAPABILITY = torch.cuda.get_device_capability()
+from lightllm.utils.device_utils import is_tesla
 
 
 @triton.jit
@@ -165,7 +163,7 @@ def context_attention_fwd(
     softmax_scale,
 ):
 
-    BLOCK = 128 if not TESLA else 64
+    BLOCK = 128 if not is_tesla() else 64
     q_nope_dim = q_nope.shape[-1]
     q_rope_dim = q_rope.shape[-1]
     assert q_nope_dim == kv_nope.shape[-1]
@@ -174,9 +172,9 @@ def context_attention_fwd(
     assert q_rope_dim in {16, 32, 64, 128, 256}
 
     if q_nope_dim >= 512:
-        BLOCK = 32 if TESLA or CUDA_CAPABILITY[0] >= 9 else 64
+        BLOCK = 32 if is_tesla() or torch.cuda.get_device_capability()[0] >= 9 else 64
     else:
-        BLOCK = 128 if not TESLA else 64
+        BLOCK = 128 if not is_tesla() else 64
 
     if q_nope.dtype == torch.float32:
         BLOCK = BLOCK // 4
@@ -370,9 +368,9 @@ def context_attention_fwd_no_prompt_cache(
     assert q_rope_dim in {16, 32, 64, 128, 256}
 
     if q_nope_dim >= 512:
-        BLOCK = 32 if TESLA or CUDA_CAPABILITY[0] >= 9 else 64
+        BLOCK = 32 if is_tesla() or torch.cuda.get_device_capability()[0] >= 9 else 64
     else:
-        BLOCK = 128 if not TESLA else 64
+        BLOCK = 128 if not is_tesla() else 64
 
     if q_nope.dtype == torch.float32:
         BLOCK = BLOCK // 4
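
These kernels previously probed the GPU at import time through module-level TESLA and CUDA_CAPABILITY constants; they now call lightllm.utils.device_utils.is_tesla() instead. A plausible sketch of such a helper, mirroring the constant it replaces; the actual helper may cache differently or pick the device another way:

    from functools import lru_cache
    import torch

    @lru_cache(maxsize=None)
    def is_tesla() -> bool:
        # Same check the removed module-level constant performed, evaluated lazily so
        # importing the kernel module no longer needs an initialized CUDA context.
        return "Tesla" in torch.cuda.get_device_name(0)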

lightllm/models/deepseek2/triton_kernel/context_flashattention_nopad_fp8.py

Lines changed: 4 additions & 6 deletions
@@ -4,9 +4,7 @@
 import triton.language as tl
 import math
 import torch.nn.functional as F
-
-TESLA = "Tesla" in torch.cuda.get_device_name(0)
-CUDA_CAPABILITY = torch.cuda.get_device_capability()
+from lightllm.utils.device_utils import is_tesla
 
 
 @triton.jit
@@ -176,7 +174,7 @@ def context_attention_fwd_fp8(
     softmax_scale,
 ):
 
-    BLOCK = 128 if not TESLA else 64
+    BLOCK = 128 if not is_tesla() else 64
     q_nope_dim = q_nope.shape[-1]
     q_rope_dim = q_rope.shape[-1]
     assert q_nope_dim == kv_nope.shape[-1]
@@ -185,9 +183,9 @@ def context_attention_fwd_fp8(
     assert q_rope_dim in {16, 32, 64, 128, 256}
 
     if q_nope_dim >= 512:
-        BLOCK = 32 if TESLA or CUDA_CAPABILITY[0] >= 9 else 64
+        BLOCK = 32 if is_tesla() or torch.cuda.get_device_capability()[0] >= 9 else 64
     else:
-        BLOCK = 128 if not TESLA else 64
+        BLOCK = 128 if not is_tesla() else 64
 
     if q_nope.dtype == torch.float32:
         BLOCK = BLOCK // 4

lightllm/models/deepseek2/triton_kernel/context_flashattention_nopad_with_v.py

Lines changed: 4 additions & 6 deletions
@@ -4,9 +4,7 @@
 import triton.language as tl
 import math
 import torch.nn.functional as F
-
-TESLA = "Tesla" in torch.cuda.get_device_name(0)
-CUDA_CAPABILITY = torch.cuda.get_device_capability()
+from lightllm.utils.device_utils import is_tesla
 
 
 @triton.jit
@@ -148,7 +146,7 @@ def context_attention_fwd_with_v(
     softmax_scale,
 ):
 
-    BLOCK = 128 if not TESLA else 64
+    BLOCK = 128 if not is_tesla() else 64
     q_nope_dim = q_nope.shape[-1]
     q_rope_dim = q_rope.shape[-1]
     assert q_nope_dim == k_nope.shape[-1]
@@ -158,9 +156,9 @@ def context_attention_fwd_with_v(
     assert q_nope_dim == v.shape[-1]
 
     if q_nope_dim >= 512:
-        BLOCK = 64 if not TESLA else 32
+        BLOCK = 64 if not is_tesla() else 32
     else:
-        BLOCK = 128 if not TESLA else 64
+        BLOCK = 128 if not is_tesla() else 64
 
     if q_nope.dtype == torch.float32:
         BLOCK = BLOCK // 4

lightllm/models/deepseek2/triton_kernel/sample_kv.py

Lines changed: 4 additions & 5 deletions
@@ -3,8 +3,7 @@
 import triton
 import triton.language as tl
 
-TESLA = "Tesla" in torch.cuda.get_device_name(0)
-CUDA_CAPABILITY = torch.cuda.get_device_capability()
+from lightllm.utils.device_utils import is_tesla
 
 
 @triton.jit
@@ -77,14 +76,14 @@ def sample_kv(
     kv_scale=None,
     k_scale=None,
 ):
-    BLOCK = 128 if not TESLA else 64
+    BLOCK = 128 if not is_tesla() else 64
 
     nope_dim = kv_nope.shape[-1]
     rope_dim = kv_rope.shape[-1]
     if nope_dim >= 512:
-        BLOCK = 64 if not TESLA else 32
+        BLOCK = 64 if not is_tesla() else 32
     else:
-        BLOCK = 128 if not TESLA else 64
+        BLOCK = 128 if not is_tesla() else 64
 
     batch = b_seq_len.shape[0]
 
lightllm/models/internvl/model.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ def __init__(self, tokenizer, model_cfg, **kwargs):
4848
def init_imageitem_extral_params(
4949
self, img: ImageItem, multi_params: MultimodalParams, sampling_params: SamplingParams
5050
):
51-
if sampling_params.image_max_patch_num >= 0:
51+
if sampling_params.image_max_patch_num > 0:
5252
img.extra_params["image_patch_max_num"] = sampling_params.image_max_patch_num
5353
return
5454
elif os.getenv("MAX_PATCH_NUM"):
