6161 - pytest -v -s -m 'not cpu_test' multimodal
6262 - pytest -v -s utils_
6363
64- - label : Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
64+ - label : Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 4 mins
6565 timeout_in_minutes : 10
6666 mirror_hardwares : [amdexperimental, amdproduction]
6767 agent_pool : mi325_1
@@ -73,13 +73,15 @@ steps:
7373 - tests/multimodal
7474 - tests/standalone_tests/lazy_imports.py
7575 - tests/transformers_utils
76+ - tests/config
7677 no_gpu : true
7778 commands :
7879 - python3 standalone_tests/lazy_imports.py
7980 - pytest -v -s test_inputs.py
8081 - pytest -v -s test_outputs.py
8182 - pytest -v -s -m 'cpu_test' multimodal
8283 - pytest -v -s transformers_utils
84+ - pytest -v -s config
8385
8486- label : Python-only Installation Test # 10min
8587 timeout_in_minutes : 20
@@ -187,7 +189,7 @@ steps:
187189 - tests/distributed/test_utils
188190 - tests/distributed/test_pynccl
189191 - tests/distributed/test_events
190- - tests/compile/test_basic_correctness
192+ - tests/compile/fullgraph/test_basic_correctness.py
191193 - examples/offline_inference/rlhf.py
192194 - examples/offline_inference/rlhf_colocate.py
193195 - tests/examples/offline_inference/data_parallel.py
@@ -215,7 +217,7 @@ steps:
215217 - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
216218 - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
217219 - pytest -v -s distributed/test_utils.py
218- - pytest -v -s compile/test_basic_correctness.py
220+ - pytest -v -s compile/fullgraph/test_basic_correctness.py
219221 - pytest -v -s distributed/test_pynccl.py
220222 - pytest -v -s distributed/test_events.py
221223 - pytest -v -s distributed/test_symm_mem_allreduce.py
@@ -390,6 +392,15 @@ steps:
390392 commands :
391393 - pytest -v -s v1/attention
392394
395+ - label : V1 Test attention (B200) # 10min
396+ timeout_in_minutes : 30
397+ gpu : b200
398+ source_file_dependencies :
399+ - vllm/v1/attention
400+ - tests/v1/attention
401+ commands :
402+ - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
403+
393404- label : V1 Test others (CPU) # 5 mins
394405 mirror_hardwares : [amdexperimental, amdproduction]
395406 agent_pool : mi325_1
@@ -493,17 +504,12 @@ steps:
493504 - vllm/
494505 - tests/compile
495506 commands :
496- - pytest -v -s compile/test_pass_manager.py
497- - pytest -v -s compile/test_fusion.py
498- - pytest -v -s compile/test_fusion_attn.py
499- - pytest -v -s compile/test_functionalization.py
500- - pytest -v -s compile/test_silu_mul_quant_fusion.py
501- # - pytest -v -s compile/test_sequence_parallelism.py
502- # - pytest -v -s compile/test_async_tp.py
503- - pytest -v -s compile/test_fusion_all_reduce.py
504- - pytest -v -s compile/test_decorator.py
505- - pytest -v -s compile/test_noop_elimination.py
506- - pytest -v -s compile/test_aot_compile.py
507+ # Run unit tests defined directly under compile/,
508+ # not including subdirectories, which are usually heavier
509+ # tests covered elsewhere.
510+ # Use `find` to launch multiple instances of pytest so that
511+ # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
512+ - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"
507513
508514- label : PyTorch Fullgraph Smoke Test # 15min
509515 timeout_in_minutes : 30
@@ -515,9 +521,11 @@ steps:
515521 - vllm/
516522 - tests/compile
517523 commands :
518- - pytest -v -s compile/test_basic_correctness.py
519- - pytest -v -s compile/test_multimodal_compile.py
520- - pytest -v -s compile/piecewise/
524+ # Run smoke tests under fullgraph directory, except test_full_graph.py
525+ # as it is a heavy test that is covered in other steps.
526+ # Use `find` to launch multiple instances of pytest so that
527+ # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
528+ - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;"
521529
522530- label : PyTorch Fullgraph Test # 27min
523531 timeout_in_minutes : 40
@@ -529,10 +537,10 @@ steps:
529537 - vllm/
530538 - tests/compile
531539 commands :
532- - pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
540+ - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
533541 # Limit to no custom ops to reduce running time
534542 # Wrap with quotes to escape yaml and avoid starting -k string with a -
535- - " pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and - quant_fp8'"
543+ - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
536544
537545- label : Cudagraph test
538546 timeout_in_minutes : 20
@@ -697,7 +705,7 @@ steps:
697705 - vllm/model_executor/models/whisper.py
698706 commands : # LMEval
699707 # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
700- - pytest -s entrypoints/openai/correctness/ --ignore entrypoints/openai/correctness/test_transcription_api_correctness.py
708+ - pytest -s entrypoints/openai/correctness/
701709
702710- label : OpenAI-Compatible Tool Use # 23 min
703711 timeout_in_minutes : 35
@@ -998,12 +1006,12 @@ steps:
9981006 optional : true
9991007 commands :
10001008 - pip install --upgrade git+https://github.com/huggingface/transformers
1001- - pytest -v -s tests/models/test_initialization.py
1009+ - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)'
10021010 - pytest -v -s tests/models/test_transformers.py
1003- - pytest -v -s tests/models/multimodal/processing/
1004- - pytest -v -s tests/models/multimodal/test_mapping.py
1011+ # - pytest -v -s tests/models/multimodal/processing/
1012+ - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
10051013 - python3 examples/offline_inference/basic/chat.py
1006- - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
1014+ # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
10071015 # Whisper needs spawn method to avoid deadlock
10081016 - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
10091017
@@ -1048,7 +1056,7 @@ steps:
10481056 - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
10491057 - pytest -v -s tests/kernels/moe/test_flashinfer.py
10501058
1051- - label : Blackwell Fusion Tests # 30 min
1059+ - label : Blackwell Fusion and Compile Tests # 30 min
10521060 timeout_in_minutes : 40
10531061 working_dir : " /vllm-workspace/"
10541062 gpu : b200
@@ -1066,10 +1074,12 @@ steps:
10661074 - pytest -v -s tests/compile/test_fusion_attn.py
10671075 - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
10681076 # this runner has 2 GPUs available even though num_gpus=2 is not set
1069- - pytest -v -s tests/compile/test_fusion_all_reduce.py
1077+ - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
10701078 # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
10711079 # Wrap with quotes to escape yaml
1072- - " pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
1080+ - " pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
1081+ # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
1082+ - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
10731083
10741084- label : Blackwell Fusion E2E Tests # 30 min
10751085 timeout_in_minutes : 40
@@ -1086,20 +1096,18 @@ steps:
10861096 - vllm/model_executor/layers/layernorm.py
10871097 - vllm/model_executor/layers/activation.py
10881098 - vllm/model_executor/layers/quantization/input_quant_fp8.py
1089- - tests/compile/test_fusions_e2e.py
1090- - tests/compile/test_full_graph.py
1099+ - tests/compile/distributed/test_fusions_e2e.py
1100+ - tests/compile/fullgraph/test_full_graph.py
10911101 commands :
10921102 - nvidia-smi
10931103 # Run all e2e fusion tests
10941104 - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
1095- # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
1096- - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
10971105
10981106- label : ROCm GPT-OSS Eval
10991107 timeout_in_minutes : 60
11001108 working_dir : " /vllm-workspace/"
11011109 agent_pool : mi325_1
1102- mirror_hardwares : [amdproduction]
1110+ mirror_hardwares : [amdexperimental, amdproduction]
11031111 optional : true # run on nightlies
11041112 source_file_dependencies :
11051113 - tests/evals/gpt_oss
@@ -1198,7 +1206,7 @@ steps:
11981206 - vllm/worker/worker_base.py
11991207 - vllm/v1/engine/
12001208 - vllm/v1/worker/
1201- - tests/compile/test_basic_correctness.py
1209+ - tests/compile/fullgraph/test_basic_correctness.py
12021210 - tests/compile/test_wrapper.py
12031211 - tests/distributed/
12041212 - tests/entrypoints/llm/test_collective_rpc.py
@@ -1211,7 +1219,7 @@ steps:
12111219 - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
12121220 - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
12131221 - pytest -v -s entrypoints/llm/test_collective_rpc.py
1214- - pytest -v -s ./compile/test_basic_correctness.py
1222+ - pytest -v -s ./compile/fullgraph/test_basic_correctness.py
12151223 - pytest -v -s ./compile/test_wrapper.py
12161224 - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
12171225 - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
@@ -1326,21 +1334,20 @@ steps:
13261334 - vllm/
13271335 - tests/weight_loading
13281336 commands :
1329- - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
1337+ - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt
13301338
13311339- label : Weight Loading Multiple GPU Test - Large Models # optional
13321340 mirror_hardwares : [amdexperimental]
13331341 agent_pool : mi325_2
13341342 # grade: Blocking
13351343 working_dir : " /vllm-workspace/tests"
13361344 num_gpus : 2
1337- gpu : a100
13381345 optional : true
13391346 source_file_dependencies :
13401347 - vllm/
13411348 - tests/weight_loading
13421349 commands :
1343- - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
1350+ - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt
13441351
13451352- label : NixlConnector PD accuracy tests (Distributed) # 30min
13461353 mirror_hardwares : [amdexperimental]
@@ -1417,10 +1424,12 @@ steps:
14171424 working_dir : " /vllm-workspace/"
14181425 num_gpus : 2
14191426 commands :
1420- - pytest -v -s tests/compile/test_async_tp.py
1421- - pytest -v -s tests/compile/test_sequence_parallelism.py
1422- - pytest -v -s tests/compile/test_fusion_all_reduce.py
1423- - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
1427+ - pytest -v -s tests/compile/distributed/test_async_tp.py
1428+ - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
1429+ - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
1430+ # - pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
1431+ - " pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
1432+ - pytest -v -s tests/compile/distributed/test_sequence_parallel.py
14241433 - pytest -v -s tests/distributed/test_context_parallel.py
14251434 - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
14261435 - pytest -v -s tests/v1/distributed/test_dbo.py
0 commit comments