6 files changed: +11 −6

```diff
 google/gemma-3-27b-it:
   - accuracy: 52.0
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 50.0
 Qwen/Qwen2-VL-7B-Instruct:
   - accuracy: 48.44
 Qwen/Qwen2.5-VL-7B-Instruct:
```
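Each model key in this reference file maps to a list of accuracy entries, and an entry may additionally be keyed by quantization settings: with the hunk above applied, the unquantized baseline stays at 52.0 while the new FP8 entry (FP8 weights and FP8 KV cache) allows 50.0. A minimal sketch of how such an entry could be resolved is below; the `lookup_accuracy` helper and the `references.yaml` filename are hypothetical, for illustration only.

```python
import yaml

def lookup_accuracy(references: dict,
                    model: str,
                    quant_algo: str | None = None,
                    kv_cache_quant_algo: str | None = None) -> float:
    """Return the reference accuracy whose quantization settings match.

    Entries without quant keys (plain {"accuracy": ...}) match the
    unquantized case, since dict.get() returns None for missing keys.
    """
    for entry in references[model]:
        if (entry.get("quant_algo") == quant_algo
                and entry.get("kv_cache_quant_algo") == kv_cache_quant_algo):
            return entry["accuracy"]
    raise KeyError(f"no entry for {model} with quant_algo={quant_algo}")

# Hypothetical usage against the file shown in the hunk above:
with open("references.yaml") as f:
    refs = yaml.safe_load(f)
print(lookup_accuracy(refs, "google/gemma-3-27b-it"))                # 52.0
print(lookup_accuracy(refs, "google/gemma-3-27b-it", "FP8", "FP8"))  # 50.0
```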
```diff
@@ -220,7 +220,8 @@ def test_auto_dtype(self):
 @skip_post_blackwell
 class TestGemma3_27BInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "google/gemma-3-27b-it"
-    MODEL_PATH = f"{llm_models_root()}/gemma/gemma-3-27b-it/"
+    # Note: only the LLM part is quantized; the vision part is in bfloat16.
+    MODEL_PATH = f"{llm_models_root()}/gemma/gemma-3-27b-it-fp8/"
     MAX_NUM_TOKENS = 12800
 
     sampling_params = SamplingParams(
@@ -232,9 +233,10 @@ class TestGemma3_27BInstruct(LlmapiAccuracyTestHarness):
         enable_block_reuse=False,
         enable_partial_reuse=False,
         free_gpu_memory_fraction=0.4,
+        dtype="fp8",
     )
 
-    def test_auto_dtype(self):
+    def test_fp8_prequantized(self):
         # Gemma3 VLM needs FlashInfer attention backend for custom mask support.
         with LLM(
             self.MODEL_PATH,
```
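Reconstructed outside the harness, the setup this renamed test exercises looks roughly like the following sketch: an FP8-prequantized Gemma3 checkpoint, an FP8 KV cache, and the FlashInfer attention backend that Gemma3's custom attention mask requires. The kwargs mirror the diff, but the checkpoint path and the exact `attn_backend` spelling are assumptions rather than verified API usage.

```python
# Sketch only: approximates the test's LLM construction from the hunks above.
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig

kv_cache_config = KvCacheConfig(
    enable_block_reuse=False,      # cache reuse disabled, as in the test fixture
    enable_partial_reuse=False,
    free_gpu_memory_fraction=0.4,
    dtype="fp8",                   # the new setting: store the KV cache in FP8
)

llm = LLM(
    "/path/to/gemma/gemma-3-27b-it-fp8/",  # assumed local FP8 checkpoint path
    attn_backend="FLASHINFER",             # custom-mask support for Gemma3 VLM
    kv_cache_config=kv_cache_config,
    max_num_tokens=12800,                  # matches MAX_NUM_TOKENS above
)
```

Renaming the test to `test_fp8_prequantized` also keeps it consistent with the existing text-only `TestGemma3_27BInstruct::test_fp8_prequantized` in test_llm_api_pytorch.py, which the l0_h100 list below already runs.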
```diff
@@ -675,7 +675,7 @@ accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype
 accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype
 accuracy/test_llm_api_pytorch_multimodal.py::TestNemotron_Nano_12B_V2_VL::test_auto_dtype
 accuracy/test_llm_api_pytorch_multimodal.py::TestPhi4MMFusedVisionLora::test_auto_dtype
-accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_auto_dtype
+accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_fp8_prequantized
 accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3VL_MOE::test_auto_dtype
 
 test_e2e.py::test_llama_e2e[use_cpp_session-remove_input_padding-]
```
```diff
@@ -236,7 +236,7 @@ accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_5_VL_7B::test_auto_dtype
 accuracy/test_llm_api_pytorch_multimodal.py::TestLlava_V1_6_Mistral_7B::test_auto_dtype
 accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype
 accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype
-accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_auto_dtype
+accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_fp8_prequantized
 
 disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0]
```
```diff
@@ -241,7 +241,7 @@ accuracy/test_llm_api_pytorch.py::TestStarcoder2_15B::test_auto_dtype
 accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_VL_7B::test_auto_dtype
 accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_5_VL_7B::test_auto_dtype
 accuracy/test_llm_api_pytorch_multimodal.py::TestLlava_V1_6_Mistral_7B::test_auto_dtype
-accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_auto_dtype
+accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_fp8_prequantized
 accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[True]
 accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[False]
```
```diff
@@ -71,6 +71,7 @@ l0_h100:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=none-kv_cache_reuse=False-fp8kv=False-overlap_scheduler=True]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=fp8-kv_cache_reuse=True-fp8kv=True-overlap_scheduler=True]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_dummy_load_format
+  - accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_fp8_prequantized
   - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_dummy_load_format
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8[latency-torch_compile=False]
@@ -264,7 +265,6 @@ l0_h100:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=vanilla-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=none-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=0]
-  - accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_auto_dtype
   - accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_prequantized
   - accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_fp8_prequantized
   - accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_auto_dtype TIMEOUT (90)
```