
Commit e000ae8

fix set_items_embed (#1151)

Authored by hiworldwzj
Co-authored-by: wangzaijun <[email protected]>
Co-authored-by: shihaobai <[email protected]>

1 parent fa0cb52

4 files changed: +3 -5 lines


lightllm/models/qwen3_vl/layer_infer/pre_layer_infer.py

Lines changed: 1 addition & 0 deletions
@@ -50,6 +50,7 @@ def context_forward(self, input_ids, infer_state: Qwen3VLInferStateInfo, layer_w
         infer_state.img_start_locs_in_cache = torch.tensor(
             img_start_locs_in_cache, dtype=torch.long, device="cpu", pin_memory=True
         ).cuda(non_blocking=True)
+        infer_state.input_ids = input_ids
 
         multimodal_emb(
             out=out,
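
A side note on the transfer pattern visible in the context lines above: pairing pin_memory=True with .cuda(non_blocking=True) is what makes the host-to-device copy asynchronous; a copy from ordinary pageable memory would fall back to a blocking transfer. A minimal standalone sketch of the same pattern, with illustrative values rather than repository code (requires a CUDA device):

import torch

# Hypothetical image start offsets; any small host-side list works.
img_start_locs = [0, 128, 256]

# Staging the tensor in pinned (page-locked) host memory lets the copy below
# be enqueued asynchronously on the current CUDA stream.
cpu_t = torch.tensor(img_start_locs, dtype=torch.long, device="cpu", pin_memory=True)
gpu_t = cpu_t.cuda(non_blocking=True)

# Kernels queued later on the same stream are ordered after the copy, so they
# see the transferred values without an explicit host-side synchronize.
print(gpu_t + 1)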

lightllm/models/qwen3_vl/triton_kernel/deepstack_multimodal_emb.py

Lines changed: 0 additions & 3 deletions
@@ -102,9 +102,6 @@ def apply_deepstack_features(
     apply deepstack features for all images in qwen3-vl/qwen3-vl-moe
     """
 
-    if not infer_state.deepstack_features:
-        return
-
     deepstack_num_layers = infer_state.cpu_embed_cache_tensor.shape[1] - 1
 
     if layer_num >= deepstack_num_layers:

lightllm/models/whisper/whisper_audio.py

Lines changed: 1 addition & 1 deletion
@@ -239,5 +239,5 @@ def encode(self, audio_items: List[AudioItem], cpu_embed_cache_client: CpuEmbedC
             ids_to_set.append(uid)
 
         if ids_to_set:
-            torch.cuda.current_stream().synchronize()
             self.cache_client.root.set_items_embed(ids=ids_to_set)
+            torch.cuda.current_stream().synchronize()

lightllm/server/visualserver/model_infer/model_rpc.py

Lines changed: 1 addition & 1 deletion
@@ -120,8 +120,8 @@ def exposed_encode(self, images: List[ImageItem]):
             )
             ids_to_set.append(uid)
         if ids_to_set:
-            torch.cuda.current_stream().synchronize()
             self.cache_client.root.set_items_embed(ids_to_set)
+            torch.cuda.current_stream().synchronize()
         return
 
 
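
The last two hunks apply the same one-line reorder to the audio and image encoders: set_items_embed now runs before the stream synchronize instead of after it, making the synchronize the final step before the method returns. For reference, torch.cuda.current_stream().synchronize() blocks the host until everything already queued on the current CUDA stream has finished. A small self-contained sketch of that guarantee, with illustrative tensor names rather than repository code (requires a CUDA device):

import torch

# A stand-in for an embed produced on the GPU.
embed = torch.randn(4, 8, device="cuda")

# Copy it back to pinned host memory asynchronously; control returns to the
# host immediately, while the copy may still be in flight on the stream.
host_buf = torch.empty(embed.shape, dtype=embed.dtype, device="cpu", pin_memory=True)
host_buf.copy_(embed, non_blocking=True)

# Block the host until every op queued on the current stream has completed;
# only after this point is host_buf guaranteed to hold the finished copy.
torch.cuda.current_stream().synchronize()
assert torch.equal(host_buf, embed.cpu())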
