@@ -65,7 +65,7 @@ def fusion_op_check(test_pass_manager: TestPassManager, quant_key: QuantKey,
 @pytest.mark.skipif(not current_platform.supports_fp8(), reason="Need FP8")
 @pytest.mark.skipif(not current_platform.is_cuda_alike(),
                     reason="Only test CUDA and ROCm")
-@create_new_process_for_each_test()
+@create_new_process_for_each_test("spawn")
 def test_attention_fusion_v0(example_prompts, monkeypatch, model: str,
                              quant_key: QuantKey, use_triton_fa: bool):
     # Clean Dynamo cache to avoid reusing other test cases
@@ -97,6 +97,7 @@ def test_attention_fusion_v0(example_prompts, monkeypatch, model: str,
     llm = LLM(model,
               enforce_eager=True,
               compilation_config=compile_config,
+              gpu_memory_utilization=0.4,
               max_model_len=2048)
 
     sampling_params = SamplingParams(temperature=0.0,
@@ -129,6 +130,7 @@ def test_attention_fusion_v0(example_prompts, monkeypatch, model: str,
     llm2 = LLM(model,
                enforce_eager=True,
                compilation_config=compile_config,
+               gpu_memory_utilization=0.4,
                max_model_len=2048)
 
     # check outputs
@@ -159,7 +161,7 @@ def test_attention_fusion_v0(example_prompts, monkeypatch, model: str,
 @pytest.mark.skipif(not current_platform.supports_fp8(), reason="Need FP8")
 @pytest.mark.skipif(not current_platform.is_cuda_alike(),
                     reason="Only test CUDA and ROCm")
-@create_new_process_for_each_test()
+@create_new_process_for_each_test("spawn")
 def test_attention_fusion_v1(example_prompts, monkeypatch, model: str,
                              quant_key: QuantKey, use_split_attention: bool):
     # Clean Dynamo cache to avoid reusing other test cases
@@ -203,7 +205,10 @@ def test_attention_fusion_v1(example_prompts, monkeypatch, model: str,
     }
     with set_current_vllm_config(vllm_config):
         backend = TestBackend()  # also force disable caches
-        llm = LLM(model, compilation_config=compile_config, max_model_len=2048)
+        llm = LLM(model,
+                  compilation_config=compile_config,
+                  gpu_memory_utilization=0.4,
+                  max_model_len=2048)
         sampling_params = SamplingParams(temperature=0.0,
                                          max_tokens=10,
                                          top_p=0.95)
@@ -237,6 +242,7 @@ def test_attention_fusion_v1(example_prompts, monkeypatch, model: str,
             quant_key=quant_key,
             compile_config=get_current_vllm_config().compilation_config))
         llm2 = LLM(model,
+                   gpu_memory_utilization=0.4,
                    compilation_config=compile_config,
                    max_model_len=2048)
 