Commit f54496a
Fix vLLM slow test OOM by reducing GPU memory utilization and improving cleanup
The vLLM slow tests were failing with OOM errors when running after the accelerate tests. The issue:

1. The vLLM V1 engine requires a specific amount of free GPU memory at startup.
2. After the accelerate tests, only 5.89 GiB was free (out of 14.74 GiB).
3. vLLM with gpu_memory_utilization=0.6 wanted 8.84 GiB.

Fixes:

- Reduce gpu_memory_utilization from 0.6 to 0.35 in the test config (needs 5.16 GiB).
- Add a GPU memory cleanup fixture in conftest.py that runs before/after slow tests.
- Improve AsyncVLLMModel.cleanup() to properly delete the model object.

The gpu_memory_utilization parameter only affects KV cache allocation and does not impact model outputs with temperature=0.0, so this change is safe.
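The arithmetic behind the fix can be checked directly. A quick sketch using the figures quoted in the commit message (14.74 GiB total, 5.89 GiB free after the accelerate tests); vLLM reserves roughly total memory times gpu_memory_utilization at startup:

```python
# Check the memory budget quoted above: the amount vLLM requests at startup
# (total * gpu_memory_utilization) must fit in what is actually free.
total_gib = 14.74  # total GPU memory reported in the failure
free_gib = 5.89    # free memory left after the accelerate tests

for util in (0.6, 0.35):
    required = total_gib * util
    fits = required <= free_gib
    print(f"gpu_memory_utilization={util}: needs {required:.2f} GiB -> fits: {fits}")
# gpu_memory_utilization=0.6: needs 8.84 GiB -> fits: False
# gpu_memory_utilization=0.35: needs 5.16 GiB -> fits: True
```

This reproduces the 8.84 GiB and 5.16 GiB figures from the commit message and shows why only the reduced setting fits in the remaining free memory.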
1 parent e438e2d commit f54496a

File tree

3 files changed (+34 −1 lines changed)


examples/model_configs/vllm_model_config.yaml

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@ model_parameters:
   tensor_parallel_size: 1
   data_parallel_size: 1
   pipeline_parallel_size: 1
-  gpu_memory_utilization: 0.6
+  gpu_memory_utilization: 0.35
   max_model_length: null
   swap_space: 4
   seed: 42

src/lighteval/models/vllm/vllm_model.py

Lines changed: 2 additions & 0 deletions
@@ -544,6 +544,8 @@ class AsyncVLLMModel(VLLMModel):
     is_async = True

     def cleanup(self):
+        if self.model is not None:
+            del self.model
         gc.collect()
         destroy_distributed_environment()
         torch.cuda.empty_cache()
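The cleanup() change follows a general pattern: drop the last strong reference to the large object before collecting, so its memory can actually be reclaimed. A minimal CPU-only sketch of that pattern (`DummyModel` and `Holder` are hypothetical stand-ins for the vLLM engine and AsyncVLLMModel, not real library classes):

```python
# Sketch of the cleanup pattern: delete the attribute holding the large
# object first, then force a collection. In the real code,
# destroy_distributed_environment() and torch.cuda.empty_cache() follow,
# but those need a GPU; a flag on a dummy class stands in here.
import gc

class DummyModel:
    freed = False
    def __del__(self):
        DummyModel.freed = True  # records that the object was reclaimed

class Holder:
    def __init__(self):
        self.model = DummyModel()

    def cleanup(self):
        # Mirrors the diff above: drop the reference before collecting.
        if self.model is not None:
            del self.model
        gc.collect()

h = Holder()
h.cleanup()
print(DummyModel.freed)  # True
```

Without the `del`, `self.model` keeps the object alive through `gc.collect()` and `torch.cuda.empty_cache()`, so the engine's GPU allocations stay resident.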

tests/conftest.py

Lines changed: 31 additions & 0 deletions
@@ -2,6 +2,8 @@

 # Copyright (c) 2024 The HuggingFace Team

+import gc
+
 import pytest


@@ -21,3 +23,32 @@ def pytest_collection_modifyitems(config, items):
     for item in items:
         if "slow" in item.keywords:
             item.add_marker(skip_slow)
+
+
+@pytest.fixture(autouse=True, scope="function")
+def cleanup_gpu_memory(request):
+    """Cleanup GPU memory before and after each test to prevent OOM errors."""
+    # Cleanup before the test (especially important for tests that run after other GPU-heavy tests)
+    if "slow" in request.keywords:
+        try:
+            import torch
+
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+                torch.cuda.synchronize()
+        except ImportError:
+            pass
+        gc.collect()
+
+    yield
+
+    # Cleanup after the test
+    try:
+        import torch
+
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.synchronize()
+    except ImportError:
+        pass
+    gc.collect()
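The fixture above relies on pytest's yield-fixture protocol: code before the `yield` runs as setup, code after it runs as teardown, with the test body in between. That control flow can be sketched with a plain generator, no pytest needed (the event strings here are illustrative, not part of the real fixture):

```python
# Sketch of the setup/teardown ordering a pytest yield-fixture provides,
# driven manually with a generator instead of the pytest machinery.
events = []

def cleanup_fixture(is_slow):
    if is_slow:
        events.append("pre-test cleanup")   # empty_cache/synchronize in the real fixture
    yield
    events.append("post-test cleanup")      # runs after every test, slow or not

gen = cleanup_fixture(is_slow=True)
next(gen)                  # setup phase: runs up to the yield
events.append("test body")
next(gen, None)            # teardown phase: resumes after the yield
print(events)
# ['pre-test cleanup', 'test body', 'post-test cleanup']
```

Note the asymmetry mirrored from the diff: the pre-test cleanup is gated on the `slow` marker, while the post-test cleanup runs unconditionally.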
