@@ -555,15 +555,15 @@ def test_gemma_2b(self):
 
     def test_llama2_7b_chat(self):
         with Runner('lmi', 'llama2-7b-chat') as r:
-            prepare.build_vllm_model("llama2-7b-chat")
+            prepare.build_vllm_async_model("llama2-7b-chat")
             r.launch()
             client.run("vllm_chat llama2-7b-chat".split())
 
     @pytest.mark.skipif(not is_applicable_cuda_capability(89),
                         reason="Unsupported CUDA capability")
     def test_qwen2_7b_fp8(self):
         with Runner('lmi', 'qwen2-7b-fp8') as r:
-            prepare.build_vllm_model("qwen2-7b-fp8")
+            prepare.build_vllm_async_model("qwen2-7b-fp8")
             r.launch()
             client.run("vllm qwen2-7b-fp8".split())
 
@@ -576,7 +576,7 @@ def test_llama3_8b_chunked_prefill(self):
 
     def test_falcon_11b_chunked_prefill(self):
         with Runner('lmi', 'falcon-11b-chunked-prefill') as r:
-            prepare.build_vllm_model("falcon-11b-chunked-prefill")
+            prepare.build_vllm_async_model("falcon-11b-chunked-prefill")
             r.launch()
             client.run(
                 "vllm falcon-11b-chunked-prefill --in_tokens 1200".split())
@@ -589,31 +589,31 @@ def test_llama_68m_speculative_medusa(self):
 
     def test_llama_68m_speculative_eagle(self):
         with Runner('lmi', 'llama-68m-speculative-eagle') as r:
-            prepare.build_vllm_model("llama-68m-speculative-eagle")
+            prepare.build_vllm_async_model("llama-68m-speculative-eagle")
             r.launch()
             client.run("vllm llama-68m-speculative-eagle".split())
 
     def test_llama3_1_8b_instruct_tool(self):
         with Runner('lmi', 'llama3-1-8b-instruct-tool') as r:
-            prepare.build_vllm_model("llama3-1-8b-instruct-tool")
+            prepare.build_vllm_async_model("llama3-1-8b-instruct-tool")
             r.launch()
             client.run("vllm_tool llama3-1-8b-instruct-tool".split())
 
     def test_mistral_7b_instruct_v03_tool(self):
         with Runner('lmi', 'mistral-7b-instruct-v03-tool') as r:
-            prepare.build_vllm_model("mistral-7b-instruct-v03-tool")
+            prepare.build_vllm_async_model("mistral-7b-instruct-v03-tool")
             r.launch()
             client.run("vllm_tool mistral-7b-instruct-v03-tool".split())
 
     def test_deepseek_r1_distill_qwen_1_5b(self):
         with Runner('lmi', 'deepseek-r1-distill-qwen-1-5b') as r:
-            prepare.build_vllm_model("deepseek-r1-distill-qwen-1-5b")
+            prepare.build_vllm_async_model("deepseek-r1-distill-qwen-1-5b")
             r.launch()
             client.run("vllm_chat deepseek-r1-distill-qwen-1-5b".split())
 
     def test_tiny_llama_input_length_exceeded(self):
         with Runner('lmi', 'tinyllama-test-input-length-exceeded') as r:
-            prepare.build_vllm_model("tinyllama-input-len-exceeded")
+            prepare.build_vllm_async_model("tinyllama-input-len-exceeded")
             r.launch()
             start = time.perf_counter()
             with pytest.raises(ValueError, match=r".*424.*"):
@@ -1020,16 +1020,16 @@ def test_phi3_v(self):
 
     def test_pixtral_12b(self):
         with Runner('lmi', 'pixtral-12b') as r:
-            prepare.build_vllm_model('pixtral-12b')
+            prepare.build_vllm_async_model('pixtral-12b')
             r.launch()
             client.run("multimodal pixtral-12b".split())
 
-    # MLlama is only supported by vllm backend currently
-    def test_mllama_11b(self):
-        with Runner('lmi', 'llama32-11b-multimodal') as r:
-            prepare.build_vllm_model('llama32-11b-multimodal')
-            r.launch()
-            client.run("multimodal llama32-11b-multimodal".split())
+    # MLlama is not supported in vllm v1, see https://github.com/vllm-project/vllm/issues/27198
+    # def test_mllama_11b(self):
+    #     with Runner('lmi', 'llama32-11b-multimodal') as r:
+    #         prepare.build_vllm_model('llama32-11b-multimodal')
+    #         r.launch()
+    #         client.run("multimodal llama32-11b-multimodal".split())
 
 
 class TestLmiDistPipelineParallel:
0 commit comments