vllm-project · kylesayrs · Feb 19, 2026 · Feb 17, 2026 · Feb 17, 2026 · Feb 19, 2026
diff --git a/tests/e2e/vLLM/configs/w4a16_grouped_quant_asym_awq.yaml b/tests/e2e/vLLM/configs/w4a16_grouped_quant_asym_awq.yaml
@@ -5,5 +5,4 @@ recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_awq_asym.yaml
 dataset_id: HuggingFaceH4/ultrachat_200k
 dataset_split: train_sft
 scheme: W4A16_weight_asym_awq
-save_dir: TinyLlama-1.1B-Chat-v1.0-w4a16-asym-awq
-gpu_memory_utilization: 0.85
+save_dir: TinyLlama-1.1B-Chat-v1.0-w4a16-asym-awq
diff --git a/tests/e2e/vLLM/configs/w4a16_grouped_quant_sym_awq.yaml b/tests/e2e/vLLM/configs/w4a16_grouped_quant_sym_awq.yaml
@@ -6,4 +6,3 @@ dataset_id: HuggingFaceH4/ultrachat_200k
 dataset_split: train_sft
 scheme: W4A16_weight_sym_awq
 save_dir: TinyLlama-1.1B-Chat-v1.0-w4a16-sym-awq
-gpu_memory_utilization: 0.85
diff --git a/tests/e2e/vLLM/test_vllm.py b/tests/e2e/vLLM/test_vllm.py
@@ -1,11 +1,14 @@
+import gc
 import os
 import re
 import shutil
 import sys
+import time
 from pathlib import Path
 
 import pandas as pd
 import pytest
+import torch
 import yaml
 from huggingface_hub import HfApi
 from loguru import logger
@@ -138,6 +141,16 @@ def save_compressed_model(self):
             fp.write(recipe_yaml_str)
         session.reset()
 
+        # Release GPU memory before running vLLM
+        del self.oneshot_model
+        del self.tokenizer
+
+        gc.collect()
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+        # Give GPU time to fully release memory
+        time.sleep(2)
+
         if SKIP_HF_UPLOAD.lower() != "yes":
             logger.info("================= UPLOADING TO HUB ======================")