
Commit 8b42595

Trying to fit into the memory constraints of the CI CUDA machines
Signed-off-by: Gregory Shtrasberg <[email protected]>
1 parent 61c34ae

File tree

1 file changed: +4, -4 lines

tests/compile/test_fusion_attn.py

Lines changed: 4 additions & 4 deletions
@@ -97,7 +97,7 @@ def test_attention_fusion_v0(example_prompts, monkeypatch, model: str,
     llm = LLM(model,
               enforce_eager=True,
               compilation_config=compile_config,
-              gpu_memory_utilization=0.4,
+              gpu_memory_utilization=0.5,
               max_model_len=2048)

     sampling_params = SamplingParams(temperature=0.0,
@@ -130,7 +130,7 @@ def test_attention_fusion_v0(example_prompts, monkeypatch, model: str,
     llm2 = LLM(model,
                enforce_eager=True,
                compilation_config=compile_config,
-               gpu_memory_utilization=0.4,
+               gpu_memory_utilization=0.5,
                max_model_len=2048)

     # check outputs
@@ -207,7 +207,7 @@ def test_attention_fusion_v1(example_prompts, monkeypatch, model: str,
     backend = TestBackend()  # also force disable caches
     llm = LLM(model,
               compilation_config=compile_config,
-              gpu_memory_utilization=0.4,
+              gpu_memory_utilization=0.5,
               max_model_len=2048)
     sampling_params = SamplingParams(temperature=0.0,
                                      max_tokens=10,
@@ -242,7 +242,7 @@ def test_attention_fusion_v1(example_prompts, monkeypatch, model: str,
                              quant_key=quant_key,
                              compile_config=get_current_vllm_config().compilation_config))
     llm2 = LLM(model,
-               gpu_memory_utilization=0.4,
+               gpu_memory_utilization=0.5,
                compilation_config=compile_config,
                max_model_len=2048)

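For context on the tweak: in vLLM, gpu_memory_utilization is the fraction of each GPU's memory the engine may reserve for model weights, activations, and KV cache, so raising it from 0.4 to 0.5 gives these tests more headroom on the CI CUDA machines, while max_model_len=2048 keeps the KV-cache allocation bounded. A minimal sketch of the constructor pattern the tests use follows; the model name and prompt are placeholders, not taken from this diff.

from vllm import LLM, SamplingParams

# Placeholder model; the real tests parametrize over several models.
llm = LLM("facebook/opt-125m",
          enforce_eager=True,           # skip CUDA graph capture, as in the v0 tests
          gpu_memory_utilization=0.5,   # reserve at most half of the GPU's memory
          max_model_len=2048)           # cap context length to bound the KV cache

sampling_params = SamplingParams(temperature=0.0, max_tokens=10)
outputs = llm.generate(["Hello, my name is"], sampling_params)
print(outputs[0].outputs[0].text)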