@@ -545,28 +545,14 @@ def test_llama3_8b(self):
545 545
546 546  @pytest.mark.vllm
547 547  @pytest.mark.gpu_4
548     - class TestVllm1:
    548 + class TestVllm:
549 549
550 550      def test_gemma_2b(self):
551 551          with Runner('lmi', 'gemma-2b') as r:
552 552              prepare.build_vllm_model("gemma-2b")
553 553              r.launch()
554 554              client.run("vllm gemma-2b".split())
555 555
556     -     def test_llama2_7b_chat(self):
557     -         with Runner('lmi', 'llama2-7b-chat') as r:
558     -             prepare.build_vllm_async_model("llama2-7b-chat")
559     -             r.launch()
560     -             client.run("vllm_chat llama2-7b-chat".split())
561     -
562     -     @pytest.mark.skipif(not is_applicable_cuda_capability(89),
563     -                         reason="Unsupported CUDA capability")
564     -     def test_qwen2_7b_fp8(self):
565     -         with Runner('lmi', 'qwen2-7b-fp8') as r:
566     -             prepare.build_vllm_async_model("qwen2-7b-fp8")
567     -             r.launch()
568     -             client.run("vllm qwen2-7b-fp8".split())
569     -
570 556      def test_llama3_8b_chunked_prefill(self):
571 557          with Runner('lmi', 'llama3-8b-chunked-prefill') as r:
572 558              prepare.build_vllm_model("llama3-8b-chunked-prefill")
@@ -576,7 +562,7 @@ def test_llama3_8b_chunked_prefill(self):
576 562
577 563      def test_falcon_11b_chunked_prefill(self):
578 564          with Runner('lmi', 'falcon-11b-chunked-prefill') as r:
579     -             prepare.build_vllm_async_model("falcon-11b-chunked-prefill")
    565 +             prepare.build_vllm_model("falcon-11b-chunked-prefill")
580 566              r.launch()
581 567              client.run(
582 568                  "vllm falcon-11b-chunked-prefill --in_tokens 1200".split())
@@ -587,6 +573,31 @@ def test_llama_68m_speculative_medusa(self):
587 573              r.launch()
588 574              client.run("vllm llama-68m-speculative-medusa".split())
589 575
    576 +     def test_vllm_performance(self):
    577 +         with Runner('lmi', 'handler-performance-vllm') as r:
    578 +             prepare.build_handler_performance_model("tiny-llama-vllm")
    579 +             r.launch("CUDA_VISIBLE_DEVICES=0")
    580 +             client.run("handler_performance vllm".split())
    581 +
    582 +
    583 + @pytest.mark.vllm
    584 + @pytest.mark.gpu_4
    585 + class TestVllmAsync:
    586 +
    587 +     def test_llama2_7b_chat(self):
    588 +         with Runner('lmi', 'llama2-7b-chat') as r:
    589 +             prepare.build_vllm_async_model("llama2-7b-chat")
    590 +             r.launch()
    591 +             client.run("vllm_chat llama2-7b-chat".split())
    592 +
    593 +     @pytest.mark.skipif(not is_applicable_cuda_capability(89),
    594 +                         reason="Unsupported CUDA capability")
    595 +     def test_qwen2_7b_fp8(self):
    596 +         with Runner('lmi', 'qwen2-7b-fp8') as r:
    597 +             prepare.build_vllm_async_model("qwen2-7b-fp8")
    598 +             r.launch()
    599 +             client.run("vllm qwen2-7b-fp8".split())
    600 +
590 601      def test_llama_68m_speculative_eagle(self):
591 602          with Runner('lmi', 'llama-68m-speculative-eagle') as r:
592 603              prepare.build_vllm_async_model("llama-68m-speculative-eagle")
@@ -624,12 +635,6 @@ def test_tiny_llama_input_length_exceeded(self):
624 635              client.run(
625 636                  "vllm tinyllama-input-len-exceeded --in_tokens 10".split())
626 637
627     -     def test_vllm_performance(self):
628     -         with Runner('lmi', 'handler-performance-vllm') as r:
629     -             prepare.build_handler_performance_model("tiny-llama-vllm")
630     -             r.launch("CUDA_VISIBLE_DEVICES=0")
631     -             client.run("handler_performance vllm".split())
632     -
633 638
634 639  @pytest.mark.vllm
635 640  @pytest.mark.lora
0 commit comments