Commit becbd2f

Trying to figure out why the test passes on ROCm but OOMs on CUDA
Signed-off-by: Gregory Shtrasberg <[email protected]>
1 parent ea3e55a commit becbd2f

File tree

1 file changed: +9, -3 lines

tests/compile/test_fusion_attn.py

@@ -65,7 +65,7 @@ def fusion_op_check(test_pass_manager: TestPassManager, quant_key: QuantKey,
 @pytest.mark.skipif(not current_platform.supports_fp8(), reason="Need FP8")
 @pytest.mark.skipif(not current_platform.is_cuda_alike(),
                     reason="Only test CUDA and ROCm")
-@create_new_process_for_each_test()
+@create_new_process_for_each_test("spawn")
 def test_attention_fusion_v0(example_prompts, monkeypatch, model: str,
                              quant_key: QuantKey, use_triton_fa: bool):
     # Clean Dynamo cache to avoid reusing other test cases
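
Note on the decorator change: create_new_process_for_each_test("spawn") starts each per-test subprocess with the "spawn" method instead of the default "fork". A forked child inherits the parent's already-initialized CUDA context, which CUDA does not support and which can keep earlier allocations alive across tests; a spawned child gets a clean interpreter. A minimal sketch of the difference, assuming PyTorch is installed (the worker function is illustrative, not part of this commit):

    # Illustrative only: what a "spawn"-started per-test process buys you.
    import multiprocessing as mp

    def worker():
        import torch  # assumed installed; CUDA initializes fresh in the child
        print(torch.cuda.is_available())

    if __name__ == "__main__":
        ctx = mp.get_context("spawn")  # new interpreter, no inherited CUDA state
        p = ctx.Process(target=worker)
        p.start()
        p.join()
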
@@ -97,6 +97,7 @@ def test_attention_fusion_v0(example_prompts, monkeypatch, model: str,
     llm = LLM(model,
               enforce_eager=True,
               compilation_config=compile_config,
+              gpu_memory_utilization=0.4,
               max_model_len=2048)
 
     sampling_params = SamplingParams(temperature=0.0,
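
The added gpu_memory_utilization=0.4 caps the fraction of total GPU memory that vLLM may reserve for weights, activations, and KV cache; the engine default is 0.9, so the lower cap leaves headroom for whatever the test process has already allocated. A standalone sketch of the same setting, with a placeholder model (facebook/opt-125m is not the model this test parametrizes):

    from vllm import LLM, SamplingParams

    # Placeholder model for illustration; the test supplies its own via fixture.
    llm = LLM("facebook/opt-125m",
              enforce_eager=True,
              gpu_memory_utilization=0.4,  # cap vLLM at 40% of GPU memory (default 0.9)
              max_model_len=2048)
    out = llm.generate(["Hello"], SamplingParams(temperature=0.0, max_tokens=10))
    print(out[0].outputs[0].text)
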
@@ -129,6 +130,7 @@ def test_attention_fusion_v0(example_prompts, monkeypatch, model: str,
     llm2 = LLM(model,
                enforce_eager=True,
                compilation_config=compile_config,
+               gpu_memory_utilization=0.4,
                max_model_len=2048)
 
     # check outputs
@@ -159,7 +161,7 @@ def test_attention_fusion_v0(example_prompts, monkeypatch, model: str,
 @pytest.mark.skipif(not current_platform.supports_fp8(), reason="Need FP8")
 @pytest.mark.skipif(not current_platform.is_cuda_alike(),
                     reason="Only test CUDA and ROCm")
-@create_new_process_for_each_test()
+@create_new_process_for_each_test("spawn")
 def test_attention_fusion_v1(example_prompts, monkeypatch, model: str,
                              quant_key: QuantKey, use_split_attention: bool):
     # Clean Dynamo cache to avoid reusing other test cases
@@ -203,7 +205,10 @@ def test_attention_fusion_v1(example_prompts, monkeypatch, model: str,
     }
     with set_current_vllm_config(vllm_config):
         backend = TestBackend()  # also force disable caches
-        llm = LLM(model, compilation_config=compile_config, max_model_len=2048)
+        llm = LLM(model,
+                  compilation_config=compile_config,
+                  gpu_memory_utilization=0.4,
+                  max_model_len=2048)
         sampling_params = SamplingParams(temperature=0.0,
                                          max_tokens=10,
                                          top_p=0.95)
@@ -237,6 +242,7 @@ def test_attention_fusion_v1(example_prompts, monkeypatch, model: str,
                            quant_key=quant_key,
                            compile_config=get_current_vllm_config().compilation_config))
     llm2 = LLM(model,
+               gpu_memory_utilization=0.4,
                compilation_config=compile_config,
                max_model_len=2048)
