@@ -65,7 +65,7 @@ def fusion_op_check(test_pass_manager: TestPassManager, quant_key: QuantKey,
 @pytest.mark.skipif(not current_platform.supports_fp8(), reason="Need FP8")
 @pytest.mark.skipif(not current_platform.is_cuda_alike(),
                     reason="Only test CUDA and ROCm")
-@create_new_process_for_each_test()
+@create_new_process_for_each_test("spawn")
 def test_attention_fusion_v0(example_prompts, monkeypatch, model: str,
                              quant_key: QuantKey, use_triton_fa: bool):
     # Clean Dynamo cache to avoid reusing other test cases
@@ -97,6 +97,7 @@ def test_attention_fusion_v0(example_prompts, monkeypatch, model: str,
     llm = LLM(model,
               enforce_eager=True,
               compilation_config=compile_config,
+              gpu_memory_utilization=0.4,
               max_model_len=2048)
 
     sampling_params = SamplingParams(temperature=0.0,
@@ -129,6 +130,7 @@ def test_attention_fusion_v0(example_prompts, monkeypatch, model: str,
     llm2 = LLM(model,
                enforce_eager=True,
                compilation_config=compile_config,
+               gpu_memory_utilization=0.4,
                max_model_len=2048)
 
     # check outputs
@@ -159,7 +161,7 @@ def test_attention_fusion_v0(example_prompts, monkeypatch, model: str,
 @pytest.mark.skipif(not current_platform.supports_fp8(), reason="Need FP8")
 @pytest.mark.skipif(not current_platform.is_cuda_alike(),
                     reason="Only test CUDA and ROCm")
-@create_new_process_for_each_test()
+@create_new_process_for_each_test("spawn")
 def test_attention_fusion_v1(example_prompts, monkeypatch, model: str,
                              quant_key: QuantKey, use_split_attention: bool):
     # Clean Dynamo cache to avoid reusing other test cases
@@ -203,7 +205,10 @@ def test_attention_fusion_v1(example_prompts, monkeypatch, model: str,
     }
     with set_current_vllm_config(vllm_config):
         backend = TestBackend()  # also force disable caches
-        llm = LLM(model, compilation_config=compile_config, max_model_len=2048)
+        llm = LLM(model,
+                  compilation_config=compile_config,
+                  gpu_memory_utilization=0.4,
+                  max_model_len=2048)
         sampling_params = SamplingParams(temperature=0.0,
                                          max_tokens=10,
                                          top_p=0.95)
@@ -237,6 +242,7 @@ def test_attention_fusion_v1(example_prompts, monkeypatch, model: str,
             quant_key=quant_key,
             compile_config=get_current_vllm_config().compilation_config))
         llm2 = LLM(model,
+                   gpu_memory_utilization=0.4,
                    compilation_config=compile_config,
                    max_model_len=2048)
 