
Commit f4233b6: fix pre layer infer
Parent: 789149f

5 files changed: +79 additions, -107 deletions


lightllm/common/basemodel/infer_struct.py

Lines changed: 0 additions & 21 deletions
@@ -122,27 +122,6 @@ def copy_for_cuda_graph(self, new_infer_state: "InferStateInfo"):
                     attr_.copy_(attr_value, non_blocking=True)
         return
 
-    def mark_multimodal_objs_for_prefill(self, input_ids: torch.Tensor):
-        """
-        Helper that marks, during chunked prefill, which multimodal objects have tokens that actually need to take part in the computation.
-        Because the prompt is split into chunks, not every multimodal object's tokens are involved.
-        """
-        multi_objs = []
-        for _, p in enumerate(self.multimodal_params):
-            for obj in p["images"] + p["audios"]:
-                multi_objs.append(obj)
-
-        if multi_objs:
-            obj_start_ids = torch.tensor([e["token_id"] for e in multi_objs], dtype=torch.int64, device="cuda")
-            obj_token_lens = torch.tensor([e["token_num"] for e in multi_objs], dtype=torch.int64, device="cuda")
-            marks = mark_multimodal_obj(
-                obj_start_token_ids=obj_start_ids, obj_token_lens=obj_token_lens, input_ids=input_ids
-            )
-            marks_array = marks.detach().cpu().numpy()
-            for mark, obj in zip(marks_array, multi_objs):
-                obj["_prefill_"] = mark > 0
-        return
-
     def prefill_dp_balance(self, input_ids: torch.Tensor):
         """
         During prefill in dp mode, re-adjust and redistribute the input data to reduce the imbalance in the amount of data handled by each dp rank, which otherwise causes
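The deleted helper wrapped the `mark_multimodal_obj` Triton kernel to decide which images/audios actually intersect the current prefill chunk; with the embed-cache path introduced in this commit that pre-filtering is no longer needed. For readers following the change, here is a minimal CPU-side sketch of the check it performed, assuming (as the kernel call suggests, though the diff does not spell it out) that each object's placeholder tokens occupy the contiguous id range `[token_id, token_id + token_num)`:

```python
import torch

def mark_objs_for_prefill_reference(multimodal_params, input_ids: torch.Tensor) -> None:
    """Rough CPU equivalent of the removed helper: set obj["_prefill_"] to True
    when any of the object's placeholder token ids appear in this chunk's input_ids."""
    objs = [obj for p in multimodal_params for obj in p["images"] + p["audios"]]
    if not objs:
        return
    starts = torch.tensor([o["token_id"] for o in objs], dtype=torch.int64)
    lens = torch.tensor([o["token_num"] for o in objs], dtype=torch.int64)
    ids = input_ids.to("cpu", torch.int64)
    # object i is "hit" if some input id falls inside [starts[i], starts[i] + lens[i])
    hit = ((ids[None, :] >= starts[:, None]) & (ids[None, :] < (starts + lens)[:, None])).any(dim=1)
    for obj, mark in zip(objs, hit.tolist()):
        obj["_prefill_"] = mark
```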

lightllm/common/basemodel/triton_kernel/multimodal_emb.py

Lines changed: 36 additions & 26 deletions
@@ -7,20 +7,22 @@
 def _fwd_kernel(
     Prompt_ids,
     Text_weight_embs,
-    Img_embs,
+    Embed_cache,
     Out,
     Img_token_lens,
     Img_start_token_ids,
-    Img_start_locs,
+    Img_start_locs_in_cache,
     stride_text_emb_s,
     stride_text_emb_d,  # text_stride
-    stride_img_emb_s,
-    stride_img_emb_d,  # img_stride
+    stride_emb_cache_s,
+    stride_emb_cache_l,
+    stride_emb_cache_d,  # img_stride
     stride_out_s,
     stride_out_d,
     tp_text_start_token_id,
     tp_text_end_token_id,
     hidden_size,
+    tp_world_size,
     BLOCK_HIDDEN_DIM: tl.constexpr,
 ):
 
@@ -44,7 +46,7 @@ def _fwd_kernel(
         tl.store(Out + stride_out_s * seq_index + stride_out_d * off_d, load_emb, mask=off_d < hidden_size)
 
     img_start_token_id = tl.load(Img_start_token_ids + img_handle_id - 1, mask=img_handle_id >= 1, other=0)
-    img_start_loc = tl.load(Img_start_locs + img_handle_id - 1, mask=img_handle_id >= 1, other=0)
+    img_start_loc = tl.load(Img_start_locs_in_cache + img_handle_id - 1, mask=img_handle_id >= 1, other=0)
     img_token_len = tl.load(Img_token_lens + img_handle_id - 1, mask=img_handle_id >= 1, other=0)
     # load store img emb
     for _ in range(
@@ -57,11 +59,16 @@ def _fwd_kernel(
         1,
     ):
         load_emb = tl.load(
-            Img_embs + stride_img_emb_s * (img_start_loc + token_id - img_start_token_id) + off_d * stride_img_emb_d,
+            Embed_cache
+            + stride_emb_cache_s.to(tl.int64) * (img_start_loc + token_id - img_start_token_id)
+            + stride_emb_cache_l * 0
+            + stride_emb_cache_d * off_d,
             mask=off_d < hidden_size,
             other=0,
         )
-        tl.store(Out + stride_out_s * seq_index + stride_out_d * off_d, load_emb, mask=off_d < hidden_size)
+        tl.store(
+            Out + stride_out_s * seq_index + stride_out_d * off_d, load_emb / tp_world_size, mask=off_d < hidden_size
+        )
     return
 
 
@@ -70,35 +77,38 @@ def multimodal_emb(
     out: torch.Tensor,
     prompt_ids: torch.Tensor,
     text_weight_embs: torch.Tensor,
-    img_embs: torch.Tensor,
+    embed_cache: torch.Tensor,
     img_token_lens: torch.Tensor,
     img_start_token_ids: torch.Tensor,
-    img_start_locs: torch.Tensor,
-    tp_text_start_token_id,
-    tp_text_end_token_id,
+    img_start_locs_in_cache: torch.Tensor,
+    tp_text_start_token_id: int,
+    tp_text_end_token_id: int,
+    tp_world_size: int,
 ):
     total_len = prompt_ids.shape[0]
     BLOCK = triton.next_power_of_2(out.shape[1])
     # print(len(img_token_lens))
     grid = (total_len, len(img_token_lens) + 1)
     num_warps = 1
     _fwd_kernel[grid](
-        prompt_ids,
-        text_weight_embs,
-        img_embs,
-        out,
-        img_token_lens,
-        img_start_token_ids,
-        img_start_locs,
-        text_weight_embs.stride(0),
-        text_weight_embs.stride(1),
-        img_embs.stride(0),
-        img_embs.stride(1),
-        out.stride(0),
-        out.stride(1),
-        tp_text_start_token_id,
-        tp_text_end_token_id,
+        Prompt_ids=prompt_ids,
+        Text_weight_embs=text_weight_embs,
+        Embed_cache=embed_cache,
+        Out=out,
+        Img_token_lens=img_token_lens,
+        Img_start_token_ids=img_start_token_ids,
+        Img_start_locs_in_cache=img_start_locs_in_cache,
+        stride_text_emb_s=text_weight_embs.stride(0),
+        stride_text_emb_d=text_weight_embs.stride(1),
+        stride_emb_cache_s=embed_cache.stride(0),
+        stride_emb_cache_l=embed_cache.stride(1),
+        stride_emb_cache_d=embed_cache.stride(2),
+        stride_out_s=out.stride(0),
+        stride_out_d=out.stride(1),
+        tp_text_start_token_id=tp_text_start_token_id,
+        tp_text_end_token_id=tp_text_end_token_id,
         hidden_size=out.shape[1],
+        tp_world_size=float(tp_world_size),
        BLOCK_HIDDEN_DIM=BLOCK,
        num_warps=num_warps,
        num_stages=1,

lightllm/models/gemma3/layer_infer/pre_layer_infer.py

Lines changed: 22 additions & 28 deletions
@@ -14,16 +14,15 @@ def __init__(self, network_config, mode):
         return
 
     def context_forward(self, input_ids, infer_state, layer_weight):
-        img_weight = []
         img_start_token_ids = []
         img_token_lens = []
-        img_start_loc = 0
-        img_start_locs = []
+        img_start_locs_in_cache = []
         device = layer_weight.wte_weight_.device
         dtype = layer_weight.wte_weight_.dtype
         hidden_size = layer_weight.wte_weight_.shape[1]
         weight_mask = torch.zeros((len(input_ids)), dtype=torch.float32, device=device)
 
+        # TODO
         scale = self.embed_scale
         for idx, input_id in enumerate(input_ids):
             if input_id == self.boi_token_index:
@@ -35,45 +34,40 @@ def context_forward(self, input_ids, infer_state, layer_weight):
             else:
                 weight_mask[idx] = scale
 
-        infer_state.mark_multimodal_objs_for_prefill(input_ids=input_ids)
-
         for batch_id, p in enumerate(infer_state.multimodal_params):
             for img in p["images"]:
                 # skip the same image
-                if img["token_id"] in img_start_token_ids or img["_prefill_"] is False:
+                if img["token_id"] in img_start_token_ids:
                     continue
-                # pull the img_embeds by uid from shm
-                data = read_shm(get_shm_name_embed(img["uuid"]))
-                img_weight.append(bytes2tensor(data).view(dtype).view(img["token_num"], -1).cuda(non_blocking=True))
                 img_start_token_ids.append(img["token_id"])
                 img_token_lens.append(img["token_num"])
-                img_start_locs.append(img_start_loc)
-                img_start_loc += img["token_num"]
+                img_start_locs_in_cache.append(img["start_index_in_embed_cache"])
         out = torch.zeros((len(input_ids), hidden_size), dtype=dtype, device=device)
-        if len(img_weight) > 0:
-            img_weight = torch.cat(img_weight, dim=0).to(device=device, dtype=dtype)
-        else:
-            img_weight = torch.empty((0, hidden_size), device=device, dtype=dtype)
-        assert img_weight.shape[1] == hidden_size, (
+
+        from lightllm.server.router.model_infer.infer_batch import g_infer_context
+
+        cpu_embed_cache_tensor = g_infer_context.cpu_embed_cache_client.cpu_embed_cache_tensor
+
+        assert cpu_embed_cache_tensor.shape[2] == hidden_size, (
             f"Dimension mismatch: text weight dimension is {hidden_size}, "
-            f"but image weight dimension is {img_weight.shape[1]}"
+            f"but image embed dimension is {cpu_embed_cache_tensor.shape[2]}"
         )
         # each tp will fill the img embeds, should divide by world_size
-        img_weight = img_weight / self.tp_world_size_
         img_start_token_ids = torch.Tensor(img_start_token_ids).to(device=device, dtype=torch.long)
         img_token_lens = torch.Tensor(img_token_lens).to(device=device, dtype=torch.long)
-        img_start_locs = torch.Tensor(img_start_locs).to(device=device, dtype=torch.long)
+        img_start_locs_in_cache = torch.Tensor(img_start_locs_in_cache).to(device=device, dtype=torch.long)
 
         multimodal_emb(
-            out,
-            input_ids,
-            layer_weight.wte_weight_,
-            img_weight,
-            img_token_lens,
-            img_start_token_ids,
-            img_start_locs,
-            self.vob_start_id_,
-            self.vob_end_id_,
+            out=out,
+            prompt_ids=input_ids,
+            text_weight_embs=layer_weight.wte_weight_,
+            embed_cache=cpu_embed_cache_tensor,
+            img_token_lens=img_token_lens,
+            img_start_token_ids=img_start_token_ids,
+            img_start_locs_in_cache=img_start_locs_in_cache,
+            tp_text_start_token_id=self.vob_start_id_,
+            tp_text_end_token_id=self.vob_end_id_,
+            tp_world_size=self.tp_world_size_,
         )
         input_dtype = out.dtype
         if self.tp_world_size_ > 1:
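The "divide by world_size" comment still applies, but the scaling now happens inside the kernel (`load_emb / tp_world_size`) instead of on a materialized `img_weight` tensor. As a reference for what each rank contributes per image token, a small plain-torch sketch (not the actual kernel; the `(slots, 1, hidden)` cache layout is inferred from the stride arguments above):

```python
import torch

def image_rows_per_rank(
    embed_cache: torch.Tensor,   # (cache_slots, 1, hidden_size)
    token_ids: torch.Tensor,     # placeholder ids of one image's tokens
    img_start_token_id: int,
    img_start_loc_in_cache: int,
    tp_world_size: int,
) -> torch.Tensor:
    """Each TP rank writes the same cached rows scaled by 1/tp_world_size,
    so the later all_reduce(SUM) across ranks reproduces each row exactly once."""
    offsets = token_ids - img_start_token_id
    rows = embed_cache[img_start_loc_in_cache + offsets, 0, :]
    return rows / tp_world_size
```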

lightllm/models/qwen3_vl/layer_infer/pre_layer_infer.py

Lines changed: 0 additions & 2 deletions
@@ -34,8 +34,6 @@ def context_forward(self, input_ids, infer_state: Qwen3VLInferStateInfo, layer_w
         dtype = layer_weight.wte_weight_.dtype
         hidden_size = layer_weight.wte_weight_.shape[1]
 
-        infer_state.mark_multimodal_objs_for_prefill(input_ids=input_ids)
-
         for batch_id, p in enumerate(infer_state.multimodal_params):
             for img in p["images"] + p["audios"]:
                 # skip the same image

lightllm/models/qwen_vl/layer_infer/pre_layer_infer.py

Lines changed: 21 additions & 30 deletions
@@ -32,56 +32,47 @@ def __init__(self, network_config, mode):
         return
 
     def context_forward(self, input_ids, infer_state: LlamaInferStateInfo, layer_weight: LlamaPreAndPostLayerWeight):
-
-        img_weight = []
         img_start_token_ids = []
         img_token_lens = []
-        img_start_loc = 0
-        img_start_locs = []
-
+        img_start_locs_in_cache = []
         device = layer_weight.wte_weight_.device
         dtype = layer_weight.wte_weight_.dtype
         hidden_size = layer_weight.wte_weight_.shape[1]
 
-        infer_state.mark_multimodal_objs_for_prefill(input_ids=input_ids)
-
         for batch_id, p in enumerate(infer_state.multimodal_params):
             for img in p["images"] + p["audios"]:
                 # skip the same image
-                if img["token_id"] in img_start_token_ids or img["_prefill_"] is False:
+                if img["token_id"] in img_start_token_ids:
                     continue
-                # pull the img_embeds by uid from shm
-                data = read_shm(get_shm_name_embed(img["uuid"]))
-                img_weight.append(bytes2tensor(data).view(dtype).view(img["token_num"], -1).cuda(non_blocking=True))
                 img_start_token_ids.append(img["token_id"])
                 img_token_lens.append(img["token_num"])
-                img_start_locs.append(img_start_loc)
-                img_start_loc += img["token_num"]
+                img_start_locs_in_cache.append(img["start_index_in_embed_cache"])
         out = torch.zeros((len(input_ids), hidden_size), dtype=dtype, device=device)
-        if len(img_weight) > 0:
-            img_weight = torch.cat(img_weight, dim=0).to(device=device, dtype=dtype)
-        else:
-            img_weight = torch.empty((0, hidden_size), device=device, dtype=dtype)
-        assert img_weight.shape[1] == hidden_size, (
+
+        from lightllm.server.router.model_infer.infer_batch import g_infer_context
+
+        cpu_embed_cache_tensor = g_infer_context.cpu_embed_cache_client.cpu_embed_cache_tensor
+
+        assert cpu_embed_cache_tensor.shape[2] == hidden_size, (
            f"Dimension mismatch: text weight dimension is {hidden_size}, "
-            f"but image weight dimension is {img_weight.shape[1]}"
+            f"but image embed dimension is {cpu_embed_cache_tensor.shape[2]}"
        )
         # each tp will fill the img embeds, should divide by world_size
-        img_weight = img_weight / self.tp_world_size_
         img_start_token_ids = torch.Tensor(img_start_token_ids).to(device=device, dtype=torch.long)
         img_token_lens = torch.Tensor(img_token_lens).to(device=device, dtype=torch.long)
-        img_start_locs = torch.Tensor(img_start_locs).to(device=device, dtype=torch.long)
+        img_start_locs_in_cache = torch.Tensor(img_start_locs_in_cache).to(device=device, dtype=torch.long)
 
         multimodal_emb(
-            out,
-            input_ids,
-            layer_weight.wte_weight_,
-            img_weight,
-            img_token_lens,
-            img_start_token_ids,
-            img_start_locs,
-            self.vob_start_id_,
-            self.vob_end_id_,
+            out=out,
+            prompt_ids=input_ids,
+            text_weight_embs=layer_weight.wte_weight_,
+            embed_cache=cpu_embed_cache_tensor,
+            img_token_lens=img_token_lens,
+            img_start_token_ids=img_start_token_ids,
+            img_start_locs_in_cache=img_start_locs_in_cache,
+            tp_text_start_token_id=self.vob_start_id_,
+            tp_text_end_token_id=self.vob_end_id_,
+            tp_world_size=self.tp_world_size_,
         )
         if self.tp_world_size_ > 1:
             all_reduce(out, group=infer_state.dist_group, op=dist.ReduceOp.SUM, async_op=False)
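Both rewritten pre-layers now run essentially the same collection loop: deduplicate objects by `token_id` and record where each one lives in the shared embed cache, with no shm read of the embeddings themselves. A condensed sketch of that shared pattern (gemma3 iterates only p["images"] while qwen_vl also includes p["audios"]; the helper name is ours, not part of the repo):

```python
import torch

def collect_image_index_tensors(multimodal_params, device: str):
    """Build the three long tensors multimodal_emb expects, one entry per unique
    multimodal object, addressed by its slot in the shared embed cache."""
    start_ids, token_lens, cache_locs = [], [], []
    for p in multimodal_params:
        for obj in p["images"] + p["audios"]:
            if obj["token_id"] in start_ids:  # skip the same image/audio seen again
                continue
            start_ids.append(obj["token_id"])
            token_lens.append(obj["token_num"])
            cache_locs.append(obj["start_index_in_embed_cache"])
    to_long = lambda xs: torch.tensor(xs, dtype=torch.long, device=device)
    return to_long(start_ids), to_long(token_lens), to_long(cache_locs)
```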
