@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
+from torch_xla._internal import tpu

import vllm
from vllm.lora.request import LoRARequest
@@ -27,25 +28,31 @@ def use_v1_only(monkeypatch: pytest.MonkeyPatch):
    yield


-def setup_vllm(num_loras: int) -> vllm.LLM:
+def setup_vllm(num_loras: int, tp: int) -> vllm.LLM:
    return vllm.LLM(model="Qwen/Qwen2.5-3B-Instruct",
                    num_scheduler_steps=1,
                    max_model_len=256,
                    max_seq_len_to_capture=256,
                    max_num_seqs=8,
+                   tensor_parallel_size=tp,
                    enable_lora=True,
                    max_loras=num_loras,
                    max_lora_rank=8)


-def test_single_lora():
+TPU_TENSOR_PARALLEL_SIZES = [1, tpu.num_available_chips()
+                             ] if tpu.num_available_chips() > 1 else [1]
+
+
+@pytest.mark.parametrize("tp", TPU_TENSOR_PARALLEL_SIZES)
+def test_single_lora(tp: int):
    """
    This test ensures we can run a single LoRA adapter on the TPU backend.
    We run "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_1_adapter", which
    will force Qwen2.5-3B-Instruct to claim 1+1=1.
    """

-    llm = setup_vllm(1)
+    llm = setup_vllm(1, tp)

    prompt = "What is 1+1? \n"

@@ -63,7 +70,8 @@ def test_single_lora():
    assert int(answer) == 1


-def test_lora_hotswapping():
+@pytest.mark.parametrize("tp", TPU_TENSOR_PARALLEL_SIZES)
+def test_lora_hotswapping(tp: int):
    """
    This test ensures we can run multiple LoRA adapters on the TPU backend, even
    if we only have space to store one.
@@ -79,7 +87,7 @@ def test_lora_hotswapping():
        for i in range(1, 5)
    ]

-    llm = setup_vllm(1)
+    llm = setup_vllm(1, tp)

    prompt = "What is 1+1? \n"

@@ -94,7 +102,8 @@ def test_lora_hotswapping():
        assert int(answer) == i + 1


-def test_multi_lora():
+@pytest.mark.parametrize("tp", TPU_TENSOR_PARALLEL_SIZES)
+def test_multi_lora(tp: int):
    """
    This test ensures we can run multiple LoRA adapters on the TPU backend when
    we have enough space to store all of them.
@@ -109,7 +118,7 @@ def test_multi_lora():
        for i in range(1, 5)
    ]

-    llm = setup_vllm(4)
+    llm = setup_vllm(4, tp)

    prompt = "What is 1+1? \n"
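For reference, the chip-count gating added above can be checked without TPU hardware. The sketch below is a minimal, hypothetical reconstruction of the `TPU_TENSOR_PARALLEL_SIZES` selection logic with the chip count passed in explicitly, so it runs without `torch_xla`; it is not part of the commit.

```python
# Sketch only (not part of the commit): mirrors the TPU_TENSOR_PARALLEL_SIZES
# selection logic from the diff, with the chip count stubbed so that no TPU
# or torch_xla install is needed.
def tensor_parallel_sizes(num_chips: int) -> list[int]:
    # Multi-chip hosts exercise both TP=1 and full-chip tensor parallelism;
    # single-chip hosts fall back to TP=1 only.
    return [1, num_chips] if num_chips > 1 else [1]

assert tensor_parallel_sizes(1) == [1]
assert tensor_parallel_sizes(4) == [1, 4]
```

With this parametrization, pytest collects one case per size, e.g. `test_single_lora[1]` and `test_single_lora[4]` on a four-chip host (exact IDs depend on pytest's default ID generation).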