@@ -47,6 +47,7 @@ def test_vllm_gc_ed():
 @pytest.mark.parametrize("max_tokens", [5])
 @pytest.mark.parametrize("enforce_eager", [False])
 def test_models(
+    monkeypatch: pytest.MonkeyPatch,
     hf_runner,
     model: str,
     backend: str,
@@ -63,31 +64,33 @@ def test_models(
         pytest.skip(
             f"{backend} does not support gemma2 with full context length.")

-    os.environ["VLLM_ATTENTION_BACKEND"] = backend
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_ATTENTION_BACKEND", backend)

-    # 5042 tokens for gemma2
-    # gemma2 has alternating sliding window size of 4096
-    # we need a prompt with more than 4096 tokens to test the sliding window
-    prompt = "The following numbers of the sequence " + ", ".join(
-        str(i) for i in range(1024)) + " are:"
-    example_prompts = [prompt]
+        # 5042 tokens for gemma2
+        # gemma2 has alternating sliding window size of 4096
+        # we need a prompt with more than 4096 tokens to test the sliding window
+        prompt = "The following numbers of the sequence " + ", ".join(
+            str(i) for i in range(1024)) + " are:"
+        example_prompts = [prompt]

-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+        with hf_runner(model, dtype=dtype) as hf_model:
+            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

-    with VllmRunner(model,
-                    max_model_len=8192,
-                    dtype=dtype,
-                    enforce_eager=enforce_eager,
-                    gpu_memory_utilization=0.7) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+        with VllmRunner(model,
+                        max_model_len=8192,
+                        dtype=dtype,
+                        enforce_eager=enforce_eager,
+                        gpu_memory_utilization=0.7) as vllm_model:
+            vllm_outputs = vllm_model.generate_greedy(example_prompts,
+                                                      max_tokens)

-    check_outputs_equal(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-    )
+        check_outputs_equal(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=vllm_outputs,
+            name_0="hf",
+            name_1="vllm",
+        )


 @multi_gpu_test(num_gpus=2)
@@ -104,6 +107,7 @@ def test_models(
     ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
 ])
 def test_models_distributed(
+    monkeypatch: pytest.MonkeyPatch,
     hf_runner,
     vllm_runner,
     example_prompts,
@@ -116,34 +120,41 @@ def test_models_distributed(
     if test_suite != TARGET_TEST_SUITE:
        pytest.skip(f"Skip test for {test_suite}")

-    if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
-        # test Ray Compiled Graph
-        os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
-        os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
-
-    if attention_backend:
-        os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
-
-    dtype = "half"
-    max_tokens = 5
-
-    # NOTE: take care of the order. run vLLM first, and then run HF.
-    # vLLM needs a fresh new process without cuda initialization.
-    # if we run HF first, the cuda initialization will be done and it
-    # will hurt multiprocessing backend with fork method (the default method).
-    with vllm_runner(model,
-                     dtype=dtype,
-                     tensor_parallel_size=2,
-                     distributed_executor_backend=distributed_executor_backend
-                     ) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-
-    check_outputs_equal(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-    )
+    with monkeypatch.context() as monkeypatch_context:
+        if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
+            # test Ray Compiled Graph
+            monkeypatch_context.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
+            monkeypatch_context.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
+
+        if attention_backend:
+            monkeypatch_context.setenv(
+                "VLLM_ATTENTION_BACKEND",
+                attention_backend,
+            )
+
+        dtype = "half"
+        max_tokens = 5
+
+        # NOTE: take care of the order. run vLLM first, and then run HF.
+        # vLLM needs a fresh new process without cuda initialization.
+        # if we run HF first, the cuda initialization will be done and it
+        # will hurt multiprocessing backend with fork method
+        # (the default method).
+        with vllm_runner(
+                model,
+                dtype=dtype,
+                tensor_parallel_size=2,
+                distributed_executor_backend=distributed_executor_backend,
+        ) as vllm_model:
+            vllm_outputs = vllm_model.generate_greedy(example_prompts,
+                                                      max_tokens)
+
+        with hf_runner(model, dtype=dtype) as hf_model:
+            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+
+        check_outputs_equal(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=vllm_outputs,
+            name_0="hf",
+            name_1="vllm",
+        )
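
Note on the pattern applied in this diff: direct os.environ assignments are replaced with pytest's MonkeyPatch fixture, so an environment variable set for one test case is rolled back when the monkeypatch.context() block exits, even if the test fails, instead of leaking into later tests in the same process. Below is a minimal, standalone sketch of that behaviour; it is not part of the patch, and read_backend() is a hypothetical helper standing in for code that reads the variable.

import os

import pytest


def read_backend() -> str:
    # Hypothetical helper: return the configured backend, or a default.
    return os.environ.get("VLLM_ATTENTION_BACKEND", "default")


def test_backend_env_is_scoped(monkeypatch: pytest.MonkeyPatch):
    # Establish a known baseline; this change is also undone at teardown.
    monkeypatch.delenv("VLLM_ATTENTION_BACKEND", raising=False)

    with monkeypatch.context() as m:
        # "FLASH_ATTN" is just an example value for illustration.
        m.setenv("VLLM_ATTENTION_BACKEND", "FLASH_ATTN")
        assert read_backend() == "FLASH_ATTN"

    # Changes made through the context are rolled back when it exits,
    # so code running afterwards sees the variable unset again.
    assert read_backend() == "default"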