 
 from lightllm.models.llama.layer_weights.pre_and_post_layer_weight import LlamaPreAndPostLayerWeight
 from lightllm.models.llama.infer_struct import LlamaInferStateInfo
+from lightllm.models.qwen3_vl.infer_struct import Qwen3VLInferStateInfo
 
 from lightllm.server.embed_cache.utils import (
     bytes2tensor,
@@ -20,13 +21,14 @@ def __init__(self, network_config, mode):
         super().__init__(network_config, mode)
         return
 
-    def context_forward(self, input_ids, infer_state: LlamaInferStateInfo, layer_weight: LlamaPreAndPostLayerWeight):
-
+    def context_forward(self, input_ids, infer_state: Qwen3VLInferStateInfo, layer_weight: LlamaPreAndPostLayerWeight):
         img_weight = []
-        img_start_token_ids = []
-        img_token_lens = []
         img_start_loc = 0
-        img_start_locs = []
+
+        infer_state.input_ids = input_ids
+        infer_state.img_start_token_ids = []
+        infer_state.img_token_lens = []
+        infer_state.img_start_locs = []
 
         device = layer_weight.wte_weight_.device
         dtype = layer_weight.wte_weight_.dtype
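
For orientation, a minimal sketch of the infer-state fields this hunk starts populating. The field list is inferred from the assignments in this diff; the real Qwen3VLInferStateInfo in lightllm/models/qwen3_vl/infer_struct.py may define more (or different) members.

# Hypothetical sketch, inferred from the assignments above; not the actual class.
from lightllm.models.llama.infer_struct import LlamaInferStateInfo

class Qwen3VLInferStateInfo(LlamaInferStateInfo):
    def __init__(self):
        super().__init__()
        self.input_ids = None            # raw prompt ids, set in context_forward
        self.img_start_token_ids = []    # first reserved token id of each image
        self.img_token_lens = []         # token count of each image
        self.img_start_locs = []         # row offset of each image in img_weight
        self.deepstack_features = []     # per-image list of per-layer embeds
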
@@ -37,12 +39,9 @@ def context_forward(self, input_ids, infer_state: LlamaInferStateInfo, layer_wei
         for batch_id, p in enumerate(infer_state.multimodal_params):
             for img in p["images"] + p["audios"]:
                 # skip the same image
-                if img["token_id"] in img_start_token_ids or img["_prefill_"] is False:
+                if img["token_id"] in infer_state.img_start_token_ids or img["_prefill_"] is False:
                     continue
-                pos = (input_ids == img["token_id"]).nonzero(as_tuple=True)
-                if pos[0].numel() == 0:
-                    continue
-                # pull the img_embeds by uid from shm
+
                 all_img_embed_df = bytes2tensor(read_shm(get_shm_name_embed(img["uuid"])))
                 per_image_deepstack = []
 
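
The embeddings arrive as raw bytes in shared memory, keyed by the image uuid. As a rough stand-in for the helpers imported from lightllm.server.embed_cache.utils (the buffer layout and dtype here are assumptions, not the library's actual implementation):

# Simplified stand-ins for read_shm / bytes2tensor; layout and dtype assumed.
from multiprocessing import shared_memory
import torch

def read_shm_sketch(name: str) -> bytes:
    shm = shared_memory.SharedMemory(name=name)
    try:
        return bytes(shm.buf)  # copy the buffer out before closing
    finally:
        shm.close()

def bytes2tensor_sketch(data: bytes, hidden: int) -> torch.Tensor:
    # assumes the producer wrote a flat (n_tokens * hidden) float16 buffer
    flat = torch.frombuffer(bytearray(data), dtype=torch.float16)
    return flat.view(-1, hidden)
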
@@ -55,12 +54,9 @@ def context_forward(self, input_ids, infer_state: LlamaInferStateInfo, layer_wei
                     per_image_deepstack.append(all_img_embed_df[start:end])
 
                 infer_state.deepstack_features.append(per_image_deepstack)
-                img_insert_locs = int(pos[0][0])
-                infer_state.img_first_token_locs.append(img_insert_locs)
-                infer_state.img_last_token_locs.append(img_insert_locs + img["token_num"])
-                img_start_token_ids.append(img["token_id"])
-                img_token_lens.append(img["token_num"])
-                img_start_locs.append(img_start_loc)
+                infer_state.img_start_token_ids.append(img["token_id"])
+                infer_state.img_token_lens.append(img["token_num"])
+                infer_state.img_start_locs.append(img_start_loc)
                 img_start_loc += img["token_num"]
         out = torch.zeros((len(input_ids), hidden_size), dtype=dtype, device=device)
 
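
The slicing elided between these hunks partitions the flat shm tensor into the main image embedding plus one block per deepstack layer. A minimal sketch of that partitioning, assuming the buffer simply stacks token_num-row blocks (main embeds first, deepstack layers after):

import torch

def split_embeds(all_img_embed_df: torch.Tensor, token_num: int):
    # Assumed layout: [main embeds | deepstack layer 0 | layer 1 | ...],
    # each block token_num rows; inferred from the slicing in this hunk.
    num_blocks = all_img_embed_df.shape[0] // token_num
    img_embed = all_img_embed_df[:token_num]
    per_image_deepstack = [
        all_img_embed_df[i * token_num : (i + 1) * token_num]
        for i in range(1, num_blocks)
    ]
    return img_embed, per_image_deepstack
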
@@ -74,9 +70,9 @@ def context_forward(self, input_ids, infer_state: LlamaInferStateInfo, layer_wei
             )
             # each tp will fill the img embeds, should divide by world_size
             img_weight = img_weight / self.tp_world_size_
-            img_start_token_ids = torch.Tensor(img_start_token_ids).to(device=device, dtype=torch.long)
-            img_token_lens = torch.Tensor(img_token_lens).to(device=device, dtype=torch.long)
-            img_start_locs = torch.Tensor(img_start_locs).to(device=device, dtype=torch.long)
+            img_start_token_ids = torch.Tensor(infer_state.img_start_token_ids).to(device=device, dtype=torch.long)
+            img_token_lens = torch.Tensor(infer_state.img_token_lens).to(device=device, dtype=torch.long)
+            img_start_locs = torch.Tensor(infer_state.img_start_locs).to(device=device, dtype=torch.long)
 
         multimodal_emb(
             out,
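
multimodal_emb then fills out from both sources in one pass: text token rows come from the wte table, image token rows from img_weight. A pure-PyTorch sketch of that fill, under the assumption that each image's tokens carry a contiguous range of reserved ids starting at its token_id; lightllm's real fused kernel may differ in its exact contract.

import torch

def multimodal_emb_reference(
    out: torch.Tensor,                  # (seq_len, hidden), zero-initialized
    input_ids: torch.Tensor,            # (seq_len,)
    wte: torch.Tensor,                  # (vocab, hidden) text embedding table
    img_weight: torch.Tensor,           # (total_img_tokens, hidden)
    img_start_token_ids: torch.Tensor,  # (num_imgs,) first id of each image
    img_token_lens: torch.Tensor,       # (num_imgs,) token count per image
    img_start_locs: torch.Tensor,       # (num_imgs,) row offset in img_weight
) -> None:
    # Text tokens: plain table lookup for ids inside the text vocab.
    text_mask = input_ids < wte.shape[0]
    out[text_mask] = wte[input_ids[text_mask]]
    # Image tokens: assumed reserved ids, contiguous per image, so the
    # position inside the image is (id - start_id).
    for sid, tlen, sloc in zip(
        img_start_token_ids.tolist(), img_token_lens.tolist(), img_start_locs.tolist()
    ):
        m = (input_ids >= sid) & (input_ids < sid + tlen)
        out[m] = img_weight[sloc + (input_ids[m] - sid)]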