2 files changed (+13, -5 lines)
@@ -116,6 +116,9 @@ def test_models_with_fp8_kv_cache(
         pytest.skip(
             "#7378: CUDA illegal memory access (undiagnosed) facebook/opt-125m"
         )
+    if ((model, kv_cache_dtype, chunked_prefill_token_size) == (
+            "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V", "fp8_e4m3", 4)):
+        pytest.skip("flakey test, see: #7874 #8051")
 
     max_num_seqs = chunked_prefill_token_size
     max_num_batched_tokens = chunked_prefill_token_size
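The skip above is applied at runtime inside the test body rather than via a skip mark, so only the one known-flaky parameter combination is skipped while every other parametrization still runs. A minimal, self-contained sketch of the same pattern follows; the fixture wiring, test name, and second token size are illustrative, not taken from this change.

import pytest

@pytest.mark.parametrize("model", ["nm-testing/Qwen2-1.5B-Instruct-FP8-K-V"])
@pytest.mark.parametrize("kv_cache_dtype", ["fp8_e4m3"])
@pytest.mark.parametrize("chunked_prefill_token_size", [4, 16])
def test_skip_flaky_combination(model, kv_cache_dtype,
                                chunked_prefill_token_size):
    # Skip only the combination tracked as flaky in #7874 / #8051;
    # all other parameter combinations continue to run.
    if ((model, kv_cache_dtype, chunked_prefill_token_size) == (
            "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V", "fp8_e4m3", 4)):
        pytest.skip("flakey test, see: #7874 #8051")
    # The real test would build an engine with these settings and compare
    # outputs; a trivial assertion stands in for that here.
    assert chunked_prefill_token_size > 0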
@@ -1027,16 +1027,21 @@ def _schedule_chunked_prefill(self) -> SchedulerOutputs:
 
         # Update waiting requests.
         self.waiting.extendleft(running_scheduled.preempted)
+
         # Update new running requests.
-        self.running.extend([s.seq_group for s in prefills.seq_groups])
-        self.running.extend(
-            [s.seq_group for s in running_scheduled.decode_seq_groups])
-        self.running.extend(
-            [s.seq_group for s in running_scheduled.prefill_seq_groups])
+        # By default, vLLM scheduler prioritizes prefills.
+        # Once chunked prefill is enabled,
+        # the policy is changed to prioritize decode requests.
         self.running.extend(
             [s.seq_group for s in swapped_in.decode_seq_groups])
         self.running.extend(
             [s.seq_group for s in swapped_in.prefill_seq_groups])
+        self.running.extend(
+            [s.seq_group for s in running_scheduled.decode_seq_groups])
+        self.running.extend(
+            [s.seq_group for s in running_scheduled.prefill_seq_groups])
+        self.running.extend([s.seq_group for s in prefills.seq_groups])
+
         # Update swapped requests.
         self.swapped.extend(running_scheduled.swapped_out)
         return SchedulerOutputs(
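The reordering matters because queue order is scheduling order: entries appended to self.running earlier are considered earlier on the next scheduling step, which is how the added comments' "prioritize decode requests" policy is realized. A minimal sketch with toy data (not vLLM code), assuming the scheduler walks self.running front to back:

# Toy stand-ins for the sequence groups produced by one scheduling step.
swapped_in_decodes = ["swapped_decode_A"]
swapped_in_prefills = ["swapped_prefill_B"]
running_decodes = ["decode_C", "decode_D"]
running_prefills = ["chunked_prefill_E"]
new_prefills = ["new_prefill_F"]

running: list[str] = []

# Order used by the patched code: decode-style work first, newly admitted
# prefills last, so decodes take priority once chunked prefill is enabled.
running.extend(swapped_in_decodes)
running.extend(swapped_in_prefills)
running.extend(running_decodes)
running.extend(running_prefills)
running.extend(new_prefills)

print(running)
# ['swapped_decode_A', 'swapped_prefill_B', 'decode_C', 'decode_D',
#  'chunked_prefill_E', 'new_prefill_F']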