Skip to content

Commit 6b80d8a

Browse files
Author: wangzaijun (committed)
Commit message: fix multi_modal
1 parent: 20d927b — commit 6b80d8a

File tree

6 files changed

+18
-4
lines changed

6 files changed

+18
-4
lines changed

lightllm/common/basemodel/basemodel.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -829,6 +829,7 @@ def _check_max_len_infer(self):
829829
is_prefill=True,
830830
b_ready_cache_len=b_ready_cache_len,
831831
b_prefill_start_loc=b_prefill_start_loc,
832+
multimodal_params=[{"images": [], "audios": []}],
832833
)
833834
model_output = self.forward(
834835
model_input,
@@ -905,7 +906,7 @@ def _autotune_warmup(self):
905906
is_prefill=True,
906907
b_ready_cache_len=b_ready_cache_len,
907908
b_prefill_start_loc=b_prefill_start_loc,
908-
multimodal_params=[],
909+
multimodal_params=[{"images": [], "audios": []}],
909910
**self._gen_special_model_input(total_token_num),
910911
)
911912
model_output = self.forward(
@@ -968,7 +969,7 @@ def _init_padded_req(self):
968969
b_ready_cache_len=b_ready_cache_len,
969970
b_prefill_start_loc=b_prefill_start_loc,
970971
is_prefill=True,
971-
multimodal_params=[],
972+
multimodal_params=[{"images": [], "audios": []} for _ in range(batch_size)],
972973
**self._gen_special_model_input(total_token_num),
973974
)
974975

lightllm/common/basemodel/batch_objs.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,9 @@ def to_cuda(self):
7373
else:
7474
self.b_shared_seq_len = self.b_shared_seq_len.cuda(non_blocking=True)
7575

76+
def __post_init__(self):
77+
assert len(self.multimodal_params) == self.batch_size
78+
7679

7780
@dataclass
7881
class ModelOutput:

lightllm/common/basemodel/cuda_graph.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,7 @@ def warmup(self, model):
216216
b_seq_len=b_seq_len,
217217
b_mtp_index=b_mtp_index,
218218
is_prefill=False,
219+
multimodal_params=[{"images": [], "audios": []} for _ in range(batch_size)],
219220
**model._gen_special_model_input(batch_size),
220221
)
221222
model_output: ModelOutput = model.forward(model_input)
@@ -274,6 +275,7 @@ def warmup_overlap(self, model):
274275
mem_indexes=mem_indexes,
275276
b_req_idx=b_req_idx,
276277
b_seq_len=b_seq_len,
278+
multimodal_params=[{"images": [], "audios": []} for _ in range(batch_size)],
277279
**model._gen_special_model_input(batch_size),
278280
)
279281
decode_batches.append(micro_batch)

lightllm/common/basemodel/prefill_cuda_graph.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,7 @@ def warmup(self, model):
182182
is_prefill=True,
183183
b_prefill_has_output_cpu=[False],
184184
prefix_total_token_num=0,
185+
multimodal_params=[{"images": [], "audios": []}],
185186
**model._gen_special_model_input(token_num=total_token_num),
186187
)
187188
model_output: ModelOutput = model.forward(model_input)
@@ -242,6 +243,7 @@ def warmup_overlap(self, model):
242243
is_prefill=True,
243244
b_prefill_has_output_cpu=[False],
244245
prefix_total_token_num=0,
246+
multimodal_params=[{"images": [], "audios": []}],
245247
**model._gen_special_model_input(token_num=total_token_num),
246248
)
247249

test/benchmark/static_inference/model_infer.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ def overlap_prefill(
9090
b_seq_len=_0_b_seq_len,
9191
is_prefill=True,
9292
b_ready_cache_len=_o_b_ready_cache_len,
93-
multimodal_params={},
93+
multimodal_params=[{"images": [], "audios": []} for _ in range(_0_batch_size)],
9494
mem_indexes_cpu=_0_mem_indexes,
9595
)
9696

@@ -114,7 +114,7 @@ def overlap_prefill(
114114
b_seq_len=_1_b_seq_len,
115115
is_prefill=True,
116116
b_ready_cache_len=_1_b_ready_cache_len,
117-
multimodal_params={},
117+
multimodal_params=[{"images": [], "audios": []} for _ in range(_1_batch_size)],
118118
mem_indexes_cpu=_1_mem_indexes,
119119
)
120120

@@ -144,6 +144,7 @@ def overlap_decode(
144144
b_mtp_index=_0_b_mtp_index,
145145
b_seq_len=_0_b_seq_len,
146146
mem_indexes_cpu=_0_mem_indexes,
147+
multimodal_params=[{"images": [], "audios": []} for _ in range(_0_batch_size)],
147148
)
148149

149150
_1_batch_size = batch_size - batch_size // 2
@@ -164,6 +165,7 @@ def overlap_decode(
164165
b_mtp_index=_1_b_mtp_index,
165166
b_seq_len=_1_b_seq_len,
166167
mem_indexes_cpu=_1_mem_indexes,
168+
multimodal_params=[{"images": [], "audios": []} for _ in range(_1_batch_size)],
167169
)
168170

169171
output, output1 = model_part.microbatch_overlap_decode(micro_batch1, micro_batch2)
@@ -202,6 +204,7 @@ def prefill(
202204
b_ready_cache_len=b_ready_cache_len, # b_ready_cache_len
203205
b_prefill_start_loc=b_prefill_start_loc,
204206
prefix_total_token_num=0, # the default kvcache len is zero.
207+
multimodal_params=[{"images": [], "audios": []} for _ in range(batch_size)],
205208
)
206209

207210
model_output = model_part.forward(model_input)
@@ -223,6 +226,7 @@ def decode(
223226
b_mtp_index=b_mtp_index,
224227
mem_indexes_cpu=mem_indexes,
225228
is_prefill=False,
229+
multimodal_params=[{"images": [], "audios": []} for _ in range(batch_size)],
226230
)
227231
model_output = model_part.forward(model_input)
228232
return model_output.logits

test/benchmark/static_inference/model_infer_mtp.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ def run_forward_once(args, input_len, output_len, batch_size, main_model, draft_
136136
b_seq_len=b_seq_len,
137137
is_prefill=True,
138138
b_ready_cache_len=b_ready_cache_len,
139+
multimodal_params=[{"images": [], "audios": []} for _ in range(batch_size)],
139140
)
140141

141142
model_output: ModelOutput = main_model.forward(model_input)
@@ -202,6 +203,7 @@ def run_forward_once(args, input_len, output_len, batch_size, main_model, draft_
202203
b_req_idx=nopad_b_seq_idx,
203204
b_seq_len=nopad_b_seq_len,
204205
is_prefill=False,
206+
multimodal_params=[{"images": [], "audios": []} for _ in range(batch_size * (len(draft_models) + 1))],
205207
)
206208

207209
# Main decode

0 commit comments

Comments (0)