5
5
6
6
steps :
7
7
- label : Regression Test
8
+ mirror_hardwares : [amd]
8
9
command : pytest -v -s test_regression.py
9
10
working_dir : "/vllm-workspace/tests" # optional
10
11
11
12
- label : AsyncEngine Test
13
+ # mirror_hardwares: [amd]
12
14
command : pytest -v -s async_engine
13
15
14
16
- label : Basic Correctness Test
17
+ mirror_hardwares : [amd]
15
18
commands :
16
19
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
17
20
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
@@ -24,34 +27,40 @@ steps:
24
27
command : pytest -v -s core
25
28
26
29
- label : Distributed Comm Ops Test
27
- command : pytest -v -s test_comm_ops.py
28
- working_dir : " /vllm-workspace/tests/distributed"
30
+ # mirror_hardwares: [amd]
31
+ command : pytest -v -s distributed/test_comm_ops.py
32
+ working_dir : "/vllm-workspace/tests"
29
33
num_gpus : 2
30
34
31
35
- label : Distributed Tests
32
- working_dir : " /vllm-workspace/tests/distributed"
33
-
34
- num_gpus : 2 # only support 1 or 2 for now.
35
36
mirror_hardwares : [amd]
36
-
37
+ working_dir : "/vllm-workspace/tests"
38
+ num_gpus : 2
37
39
commands :
38
- - pytest -v -s test_pynccl_library.py
39
- - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_basic_distributed_correctness.py
40
- - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_basic_distributed_correctness.py
41
- - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_chunked_prefill_distributed.py
42
- - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_chunked_prefill_distributed.py
40
+ - pytest -v -s distributed/test_pynccl_library.py
41
+ - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
42
+ - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
43
+ - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
44
+ - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
45
+ - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
46
+ - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
47
+ - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
48
+ - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
49
+ - pytest -v -s spec_decode/e2e/test_integration_dist.py
43
50
44
51
- label : Distributed Tests (Multiple Groups)
45
- working_dir : " /vllm-workspace/tests/distributed"
52
+ # mirror_hardwares: [amd]
53
+ working_dir : "/vllm-workspace/tests"
46
54
num_gpus : 4
47
55
commands :
48
- - pytest -v -s test_pynccl.py
56
+ - pytest -v -s distributed/test_pynccl.py
49
57
50
58
- label : Engine Test
51
59
mirror_hardwares : [amd]
52
60
command : pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
53
61
54
62
- label : Entrypoints Test
63
+ # mirror_hardwares: [amd]
55
64
commands :
56
65
# these tests have to be separated, because each one will allocate all possible GPU memory
57
66
- pytest -v -s entrypoints --ignore=entrypoints/test_server_oot_registration.py
@@ -62,21 +71,24 @@ steps:
62
71
mirror_hardwares : [amd]
63
72
commands :
64
73
# install aws cli for llava_example.py
65
- - pip install awscli
74
+ # install tensorizer for tensorize_vllm_model.py
75
+ - pip install awscli tensorizer
66
76
- python3 offline_inference.py
67
77
- python3 offline_inference_with_prefix.py
68
78
- python3 llm_engine_example.py
69
79
- python3 llava_example.py
80
+ - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
70
81
71
82
- label : Kernels Test %N
83
+ # mirror_hardwares: [amd]
72
84
command : pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
73
85
parallelism : 4
74
86
75
87
- label : Models Test
76
- mirror_hardwares : [amd]
88
+ # mirror_hardwares: [amd]
77
89
commands :
78
90
- bash ../.buildkite/download-images.sh
79
- - pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py
91
+ - pytest -v -s models --ignore=models/test_llava.py
80
92
81
93
- label : Llava Test
82
94
mirror_hardwares : [amd]
@@ -90,6 +102,7 @@ steps:
90
102
- pytest -v -s prefix_caching
91
103
92
104
- label : Samplers Test
105
+ # mirror_hardwares: [amd]
93
106
command : pytest -v -s samplers
94
107
95
108
- label : LogitsProcessor Test
@@ -101,20 +114,38 @@ steps:
101
114
command : pytest -v -s worker
102
115
103
116
- label : Speculative decoding tests
104
- mirror_hardwares : [amd]
117
+ # mirror_hardwares: [amd]
105
118
command : pytest -v -s spec_decode
106
119
107
120
- label : LoRA Test %N
108
- command : pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
121
+ # mirror_hardwares: [amd]
122
+ command : pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
109
123
parallelism : 4
110
124
125
+ - label : LoRA Long Context (Distributed)
126
+ # mirror_hardwares: [amd]
127
+ num_gpus : 4
128
+ # This test runs llama 13B, so it is required to run on 4 GPUs.
129
+ commands :
130
+ # Temporarily run this way because we cannot clean up GPU mem usage
131
+ # for multi GPU tests.
132
+ # TODO(sang): Fix it.
133
+ - pytest -v -s lora/test_long_context.py::test_rotary_emb_replaced
134
+ - pytest -v -s lora/test_long_context.py::test_batched_rope_kernel
135
+ - pytest -v -s lora/test_long_context.py::test_self_consistency
136
+ - pytest -v -s lora/test_long_context.py::test_quality
137
+ - pytest -v -s lora/test_long_context.py::test_max_len
138
+
111
139
- label : Tensorizer Test
140
+ # mirror_hardwares: [amd]
112
141
command : apt-get install curl libsodium23 && pytest -v -s tensorizer_loader
113
142
114
143
- label : Metrics Test
144
+ mirror_hardwares : [amd]
115
145
command : pytest -v -s metrics
116
146
117
147
- label : Quantization Test
148
+ # mirror_hardwares: [amd]
118
149
command : pytest -v -s quantization
119
150
120
151
- label : Benchmarks
0 commit comments