
Commit 8b42595

Trying to fit into the memory constraints of the CI CUDA machines
Signed-off-by: Gregory Shtrasberg <[email protected]>
1 parent 61c34ae

File tree

1 file changed: +4, -4 lines

tests/compile/test_fusion_attn.py

Lines changed: 4 additions & 4 deletions
@@ -97,7 +97,7 @@ def test_attention_fusion_v0(example_prompts, monkeypatch, model: str,
     llm = LLM(model,
               enforce_eager=True,
               compilation_config=compile_config,
-              gpu_memory_utilization=0.4,
+              gpu_memory_utilization=0.5,
               max_model_len=2048)

     sampling_params = SamplingParams(temperature=0.0,
@@ -130,7 +130,7 @@ def test_attention_fusion_v0(example_prompts, monkeypatch, model: str,
     llm2 = LLM(model,
                enforce_eager=True,
                compilation_config=compile_config,
-               gpu_memory_utilization=0.4,
+               gpu_memory_utilization=0.5,
                max_model_len=2048)

     # check outputs
@@ -207,7 +207,7 @@ def test_attention_fusion_v1(example_prompts, monkeypatch, model: str,
     backend = TestBackend()  # also force disable caches
     llm = LLM(model,
               compilation_config=compile_config,
-              gpu_memory_utilization=0.4,
+              gpu_memory_utilization=0.5,
               max_model_len=2048)
     sampling_params = SamplingParams(temperature=0.0,
                                      max_tokens=10,
@@ -242,7 +242,7 @@ def test_attention_fusion_v1(example_prompts, monkeypatch, model: str,
                              quant_key=quant_key,
                              compile_config=get_current_vllm_config().compilation_config))
     llm2 = LLM(model,
-               gpu_memory_utilization=0.4,
+               gpu_memory_utilization=0.5,
                compilation_config=compile_config,
                max_model_len=2048)

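For context on the tweak: in vLLM, gpu_memory_utilization is the fraction of each GPU's memory the engine may reserve for model weights, activations, and KV cache, so raising it from 0.4 to 0.5 gives these tests more headroom on the CI CUDA machines, while max_model_len=2048 keeps the KV-cache allocation bounded. A minimal sketch of the constructor pattern the tests use follows; the model name and prompt are placeholders, not taken from this diff.

from vllm import LLM, SamplingParams

# Placeholder model; the real tests parametrize over several models.
llm = LLM("facebook/opt-125m",
          enforce_eager=True,           # skip CUDA graph capture, as in the v0 tests
          gpu_memory_utilization=0.5,   # reserve at most half of the GPU's memory
          max_model_len=2048)           # cap context length to bound the KV cache

sampling_params = SamplingParams(temperature=0.0, max_tokens=10)
outputs = llm.generate(["Hello, my name is"], sampling_params)
print(outputs[0].outputs[0].text)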