@@ -7,6 +7,7 @@
 """
 import pytest
 
+from vllm import SamplingParams
 from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,
                                  ENABLE_ARTIFICIAL_PREEMPT)
 
@@ -136,3 +137,87 @@ def test_swap(
             assert hf_output_ids[j] == vllm_output_ids[j], (
                 f"Test{i} output{j}:\nHF: {hf_output_ids}\n"
                 f"vLLM: {vllm_output_ids}")
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("max_tokens", [96])
+@pytest.mark.parametrize("beam_width", [4])
+def test_swap_infeasible(
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    beam_width: int,
+) -> None:
+    """Verify that an infeasible swap request will be ignored."""
+    BLOCK_SIZE = 16
+    prefill_blocks = 2
+    decode_blocks = max_tokens // BLOCK_SIZE
+    example_prompts = example_prompts[:1]
+
+    vllm_model = vllm_runner(
+        model,
+        dtype=dtype,
+        swap_space=10,
+        block_size=BLOCK_SIZE,
+        # Since beam search keeps more than one sequence alive, prefill +
+        # decode blocks are not enough to finish.
+        num_gpu_blocks_override=prefill_blocks + decode_blocks,
+        max_model_len=(prefill_blocks + decode_blocks) * BLOCK_SIZE,
+    )
+    sampling_params = SamplingParams(n=beam_width,
+                                     use_beam_search=True,
+                                     temperature=0.0,
+                                     max_tokens=max_tokens,
+                                     ignore_eos=True)
+    req_outputs = vllm_model.model.generate(
+        example_prompts,
+        sampling_params=sampling_params,
+    )
+    assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
+            ARTIFICIAL_PREEMPTION_MAX_CNT)
+    del vllm_model
+    # Verify that the request is ignored and does not hang.
+    assert req_outputs[0].outputs[0].finish_reason == "length"
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("max_tokens", [96])
+def test_preemption_infeasible(
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+) -> None:
+    """Verify that an infeasible preemption request will be ignored."""
+    BLOCK_SIZE = 16
+    prefill_blocks = 2
+    decode_blocks = max_tokens // BLOCK_SIZE
+    vllm_model = vllm_runner(
+        model,
+        dtype=dtype,
+        block_size=BLOCK_SIZE,
+        # There are not enough GPU blocks to complete a single sequence.
+        # Preemption should happen, and the sequence should be ignored
+        # instead of hanging forever.
+        num_gpu_blocks_override=prefill_blocks + decode_blocks // 2,
+        max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE),
+    )
+    sampling_params = SamplingParams(max_tokens=max_tokens, ignore_eos=True)
+    req_outputs = vllm_model.model.generate(
+        example_prompts,
+        sampling_params=sampling_params,
+    )
+
+    assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
+            ARTIFICIAL_PREEMPTION_MAX_CNT)
+    del vllm_model
+    # Verify that the requests are ignored and do not hang.
+    for req_output in req_outputs:
+        outputs = req_output.outputs
+        assert len(outputs) == 1
+        assert outputs[0].finish_reason == "length"
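
For readers who want to reproduce the infeasible-swap scenario outside the pytest fixtures, the sketch below drives the plain `LLM` entry point with the same block arithmetic as `test_swap_infeasible`. It is a minimal sketch and not part of this diff: it assumes a vLLM build contemporary with this change (where `SamplingParams` still accepts `use_beam_search`), and the model name and prompt are placeholders.

```python
from vllm import LLM, SamplingParams

# Mirror the test's block budget: 2 prefill blocks plus 96 // 16 = 6 decode
# blocks, which cannot cover all beams, so a swap becomes infeasible.
BLOCK_SIZE = 16
prefill_blocks = 2
decode_blocks = 96 // BLOCK_SIZE

llm = LLM(
    model="facebook/opt-125m",  # placeholder model, not the one used in CI
    swap_space=10,              # GiB of CPU swap space
    block_size=BLOCK_SIZE,
    num_gpu_blocks_override=prefill_blocks + decode_blocks,
    max_model_len=(prefill_blocks + decode_blocks) * BLOCK_SIZE,
)

sampling_params = SamplingParams(n=4,
                                 use_beam_search=True,
                                 temperature=0.0,
                                 max_tokens=96,
                                 ignore_eos=True)

outputs = llm.generate(["placeholder prompt"], sampling_params)
# With this change, the request finishes with finish_reason == "length"
# instead of hanging on an infeasible swap.
print(outputs[0].outputs[0].finish_reason)
```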