@@ -1,7 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-import tempfile
-from time import time
-
 import pytest
 
 from vllm import LLM, envs
@@ -15,60 +12,6 @@
 )
 
 
-@pytest.mark.parametrize("model_name", ["D4nt3/Qwen2.5-two-layers"])
-@pytest.mark.skipif(not current_platform.is_tpu(),
-                    reason="This test needs a TPU")
-def test_sampler_compilation(model_name: str, monkeypatch):
-    """
-    Check that no recompilation happens despite changing sampling parameters.
-    We can't read XLA metrics from the engine process, hence we measure time.
-    """
-    with tempfile.TemporaryDirectory() as temp_dir:
-        monkeypatch.setenv("VLLM_XLA_CACHE_PATH", temp_dir)
-        # Compiling model init may still take some time, enforce_eager to skip.
-        llm = LLM(model_name,
-                  enforce_eager=True,
-                  max_num_seqs=16,
-                  max_model_len=1024,
-                  gpu_memory_utilization=0.5)
-        prompts = [
-            "A robot may not injure a human being",
-            "It is only with the heart that one can see rightly;",
-        ]
-        # First inference should be slow
-        sampling_params = SamplingParams(
-            temperature=0.7,
-            # top_p=0.6, # TODO too slow!
-            top_k=10,
-            min_p=0.2,
-            max_tokens=16)
-        s = time()
-        _ = llm.generate(prompts, sampling_params)
-        run1 = time() - s
-
-        # Second request with different params, but for which we
-        # compiled for in previous eager iteration.
-        sampling_params = SamplingParams(temperature=0.1,
-                                         top_k=12,
-                                         min_p=0.8,
-                                         max_tokens=24)
-        s = time()
-        _ = llm.generate(prompts, sampling_params)
-        run2 = time() - s
-        # Much faster after compiling
-        assert run1 * 0.1 > run2
-        print("TIMES", run1, run2)
-
-        # Third request with min_p set to "None". It will not trigger
-        # recompilation as a default 0 value will be used.
-        sampling_params = SamplingParams(max_tokens=24, temperature=0.0)
-        s = time()
-        _ = llm.generate(prompts, sampling_params)
-        run3 = time() - s
-        assert run1 * 0.1 > run3
-        print("TIMES", run1, run3)
-
-
 @pytest.mark.parametrize("model_name", ["Qwen/Qwen2.5-1.5B-Instruct"])
 @pytest.mark.skipif(not current_platform.is_tpu(),
                     reason="This test needs a TPU")
@@ -77,13 +20,11 @@ def test_sampler_different(model_name: str):
     Test significantly different sampling params to assert the model produces
     different results.
     """
-    llm = LLM(
-        model_name,
-        enforce_eager=True,
-        max_num_seqs=1,
-        max_model_len=64,
-        # TODO: setting to 0.5 or it will go OOM
-        gpu_memory_utilization=0.5)
+    llm = LLM(model_name,
+              enforce_eager=False,
+              max_num_seqs=1,
+              max_model_len=512,
+              max_num_batched_tokens=512)
     prompts = [
         "Write a short story about a robot that dreams for the first time."
     ]
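
For reference, a minimal standalone sketch of the API surface the updated test exercises. This is not part of the commit: the top-level `from vllm import LLM, SamplingParams` import, the `RequestOutput.outputs[0].text` accessors, and the specific sampling values are assumptions chosen for illustration, mirroring the constructor arguments the diff introduces.

# Standalone sketch (illustrative, not from this commit): build the LLM the
# same way the updated test does, then generate with two very different
# SamplingParams and check that the completions diverge.
from vllm import LLM, SamplingParams

llm = LLM("Qwen/Qwen2.5-1.5B-Instruct",
          enforce_eager=False,
          max_num_seqs=1,
          max_model_len=512,
          max_num_batched_tokens=512)

prompts = ["Write a short story about a robot that dreams for the first time."]

greedy = SamplingParams(temperature=0.0, max_tokens=64)  # deterministic
creative = SamplingParams(temperature=0.9, top_k=10, min_p=0.2,
                          max_tokens=64)  # heavily sampled

text_greedy = llm.generate(prompts, greedy)[0].outputs[0].text
text_creative = llm.generate(prompts, creative)[0].outputs[0].text

# With parameters this far apart the outputs are expected to differ
# (an illustrative check, not a guarantee for every seed).
assert text_greedy != text_creative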