diff --git a/tests/e2e/vLLM/configs/w4a16_grouped_quant_asym_awq.yaml b/tests/e2e/vLLM/configs/w4a16_grouped_quant_asym_awq.yaml
index 90d001c172..6ae656b1dc 100644
--- a/tests/e2e/vLLM/configs/w4a16_grouped_quant_asym_awq.yaml
+++ b/tests/e2e/vLLM/configs/w4a16_grouped_quant_asym_awq.yaml
@@ -5,5 +5,4 @@ recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_awq_asym.yaml
 dataset_id: HuggingFaceH4/ultrachat_200k
 dataset_split: train_sft
 scheme: W4A16_weight_asym_awq
-save_dir: TinyLlama-1.1B-Chat-v1.0-w4a16-asym-awq
-gpu_memory_utilization: 0.85
\ No newline at end of file
+save_dir: TinyLlama-1.1B-Chat-v1.0-w4a16-asym-awq
\ No newline at end of file
diff --git a/tests/e2e/vLLM/configs/w4a16_grouped_quant_sym_awq.yaml b/tests/e2e/vLLM/configs/w4a16_grouped_quant_sym_awq.yaml
index cac6204625..220649a18b 100644
--- a/tests/e2e/vLLM/configs/w4a16_grouped_quant_sym_awq.yaml
+++ b/tests/e2e/vLLM/configs/w4a16_grouped_quant_sym_awq.yaml
@@ -6,4 +6,3 @@ dataset_id: HuggingFaceH4/ultrachat_200k
 dataset_split: train_sft
 scheme: W4A16_weight_sym_awq
 save_dir: TinyLlama-1.1B-Chat-v1.0-w4a16-sym-awq
-gpu_memory_utilization: 0.85
diff --git a/tests/e2e/vLLM/test_vllm.py b/tests/e2e/vLLM/test_vllm.py
index 066affc69e..92afb54bb3 100644
--- a/tests/e2e/vLLM/test_vllm.py
+++ b/tests/e2e/vLLM/test_vllm.py
@@ -1,11 +1,14 @@
+import gc
 import os
 import re
 import shutil
 import sys
+import time
 from pathlib import Path
 
 import pandas as pd
 import pytest
+import torch
 import yaml
 from huggingface_hub import HfApi
 from loguru import logger
@@ -138,6 +141,16 @@ def save_compressed_model(self):
             fp.write(recipe_yaml_str)
         session.reset()
 
+        # Release GPU memory before running vLLM
+        del self.oneshot_model
+        del self.tokenizer
+
+        gc.collect()
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+        # Give GPU time to fully release memory
+        time.sleep(2)
+
         if SKIP_HF_UPLOAD.lower() != "yes":
             logger.info("================= UPLOADING TO HUB ======================")