@@ -45,7 +45,7 @@ def run_aria(questions: list[str], modality: str) -> ModelRequestData:
         max_model_len=4096,
         max_num_seqs=2,
         dtype="bfloat16",
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     prompts = [(f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}"
@@ -71,7 +71,7 @@ def run_aya_vision(questions: list[str], modality: str) -> ModelRequestData:
         max_model_len=2048,
         max_num_seqs=2,
         mm_processor_kwargs={"crop_to_patches": True},
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )
     prompts = [
         f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|><image>{question}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
@@ -92,7 +92,7 @@ def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
     prompts = [f"Question: {question} Answer:" for question in questions]
     engine_args = EngineArgs(
         model="Salesforce/blip2-opt-6.7b",
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     return ModelRequestData(
@@ -110,7 +110,7 @@ def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
         model="facebook/chameleon-7b",
         max_model_len=4096,
         max_num_seqs=2,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     return ModelRequestData(
@@ -129,8 +129,8 @@ def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
         model=model_name,
         max_model_len=4096,
         max_num_seqs=2,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
         hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
+        limit_mm_per_prompt={"image": 1},
     )

     prompts = [
@@ -155,7 +155,7 @@ def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
         max_num_seqs=2,
         trust_remote_code=True,
         dtype="bfloat16",
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     prompts = ["<MORE_DETAILED_CAPTION>" for _ in questions]
@@ -175,7 +175,7 @@ def run_fuyu(questions: list[str], modality: str) -> ModelRequestData:
         model="adept/fuyu-8b",
         max_model_len=2048,
         max_num_seqs=2,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     return ModelRequestData(
@@ -194,7 +194,7 @@ def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
         max_model_len=2048,
         max_num_seqs=2,
         mm_processor_kwargs={"do_pan_and_scan": True},
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     prompts = [("<bos><start_of_turn>user\n"
@@ -219,7 +219,7 @@ def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
         trust_remote_code=True,
         enforce_eager=True,
         hf_overrides={"architectures": ["GLM4VForCausalLM"]},
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     prompts = [
@@ -246,7 +246,7 @@ def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
         model=model_name,
         trust_remote_code=True,
         max_model_len=8192,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -287,7 +287,7 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
                 "longest_edge": 3 * 364
             },
         },
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )
     prompts = [(
         f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:"
@@ -314,7 +314,7 @@ def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
                 "longest_edge": 384
             },
         },
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )
     prompts = [
         (f"<|im_start|>User:<image>{question}<end_of_utterance>\nAssistant:")
@@ -337,7 +337,7 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
         model=model_name,
         trust_remote_code=True,
         max_model_len=4096,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -375,7 +375,7 @@ def run_llava(questions: list[str], modality: str) -> ModelRequestData:
     engine_args = EngineArgs(
         model="llava-hf/llava-1.5-7b-hf",
         max_model_len=4096,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     return ModelRequestData(
@@ -392,7 +392,7 @@ def run_llava_next(questions: list[str], modality: str) -> ModelRequestData:
     engine_args = EngineArgs(
         model="llava-hf/llava-v1.6-mistral-7b-hf",
         max_model_len=8192,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     return ModelRequestData(
@@ -414,7 +414,7 @@ def run_llava_next_video(questions: list[str],
         model="llava-hf/LLaVA-NeXT-Video-7B-hf",
         max_model_len=8192,
         max_num_seqs=2,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     return ModelRequestData(
@@ -442,7 +442,7 @@ def run_llava_onevision(questions: list[str],
     engine_args = EngineArgs(
         model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
         max_model_len=16384,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     return ModelRequestData(
@@ -465,7 +465,7 @@ def run_mantis(questions: list[str], modality: str) -> ModelRequestData:
         model="TIGER-Lab/Mantis-8B-siglip-llama3",
         max_model_len=4096,
         hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )
     stop_token_ids = [128009]

@@ -506,7 +506,7 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name):
         max_model_len=4096,
         max_num_seqs=2,
         trust_remote_code=True,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )
     # NOTE The stop_token_ids are different for various versions of MiniCPM-V
     # 2.0
@@ -561,7 +561,7 @@ def run_mistral3(questions: list[str], modality: str) -> ModelRequestData:
         max_model_len=8192,
         max_num_seqs=2,
         tensor_parallel_size=2,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
@@ -587,7 +587,7 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
         model=model_name,
         max_model_len=8192,
         max_num_seqs=2,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -611,7 +611,7 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
     )


-def run_llama4(questions: list[str], modality: str):
+def run_llama4(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"

     model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
@@ -621,8 +621,8 @@ def run_llama4(questions: list[str], modality: str):
         max_model_len=8192,
         max_num_seqs=4,
         tensor_parallel_size=8,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
         gpu_memory_utilization=0.4,
+        limit_mm_per_prompt={"image": 1},
     )

     tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -657,7 +657,7 @@ def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
         model=model_name,
         trust_remote_code=True,
         dtype="bfloat16",
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     prompts = [
@@ -683,7 +683,7 @@ def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
         trust_remote_code=True,
         max_model_len=4096,
         tensor_parallel_size=4,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -710,7 +710,8 @@ def run_paligemma(questions: list[str], modality: str) -> ModelRequestData:
     prompts = ["caption en" for _ in questions]
     engine_args = EngineArgs(
         model="google/paligemma-3b-mix-224",
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
+        limit_mm_per_prompt={"image": 1},
+    )

     return ModelRequestData(
         engine_args=engine_args,
@@ -726,7 +727,8 @@ def run_paligemma2(questions: list[str], modality: str) -> ModelRequestData:
     prompts = ["caption en" for _ in questions]
     engine_args = EngineArgs(
         model="google/paligemma2-3b-ft-docci-448",
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
+        limit_mm_per_prompt={"image": 1},
+    )

     return ModelRequestData(
         engine_args=engine_args,
@@ -762,7 +764,7 @@ def run_phi3v(questions: list[str], modality: str) -> ModelRequestData:
         max_num_seqs=2,
         # Note - mm_processor_kwargs can also be passed to generate/chat calls
         mm_processor_kwargs={"num_crops": 16},
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     return ModelRequestData(
@@ -793,6 +795,7 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
         max_num_seqs=2,
         enable_lora=True,
         max_lora_rank=320,
+        limit_mm_per_prompt={"image": 1},
     )

     return ModelRequestData(
@@ -813,7 +816,7 @@ def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
         model=model_name,
         max_model_len=6144,
         max_num_seqs=2,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
@@ -834,7 +837,7 @@ def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
         max_model_len=1024,
         max_num_seqs=2,
         hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     prompts = [f"{question}Picture 1: <img></img>\n" for question in questions]
@@ -859,7 +862,7 @@ def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
             "min_pixels": 28 * 28,
             "max_pixels": 1280 * 28 * 28,
         },
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     if modality == "image":
@@ -894,7 +897,7 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
            "max_pixels": 1280 * 28 * 28,
            "fps": 1,
         },
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     if modality == "image":
@@ -925,7 +928,7 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
         model=model_name,
         trust_remote_code=True,
         max_model_len=4096,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )

     tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -1082,7 +1085,15 @@ def main(args):

     req_data = model_example_map[model](questions, modality)

-    engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
+    # Disable other modalities to save memory
+    default_limits = {"image": 0, "video": 0, "audio": 0}
+    req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
+        req_data.engine_args.limit_mm_per_prompt or {})
+
+    engine_args = asdict(req_data.engine_args) | {
+        "seed": args.seed,
+        "disable_mm_preprocessor_cache": args.disable_mm_preprocessor_cache,
+    }
     llm = LLM(**engine_args)

     # To maintain code compatibility in this script, we add LoRA here.
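For reference, a minimal standalone sketch (not part of the diff) of the dict-union merge that the new `main()` logic relies on: the zeroed `default_limits` are combined with each model's own `limit_mm_per_prompt`, so a model's declared limit takes precedence and any modality it never mentions is disabled.

```python
# Illustrative only: mirrors `default_limits | dict(limit_mm_per_prompt or {})`
# using Python 3.9+ dict union, where the right-hand operand wins on conflicts.
default_limits = {"image": 0, "video": 0, "audio": 0}

# A model that declared {"image": 1} keeps its image budget; unused
# video/audio inputs are disabled to save memory.
print(default_limits | dict({"image": 1} or {}))  # {'image': 1, 'video': 0, 'audio': 0}

# A model that declared no limits at all (None) has every modality disabled.
print(default_limits | dict(None or {}))          # {'image': 0, 'video': 0, 'audio': 0}
```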