Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions tests/e2e/vLLM/configs/w4a16_grouped_quant_asym_awq.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,4 @@ recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_awq_asym.yaml
dataset_id: HuggingFaceH4/ultrachat_200k
dataset_split: train_sft
scheme: W4A16_weight_asym_awq
save_dir: TinyLlama-1.1B-Chat-v1.0-w4a16-asym-awq
gpu_memory_utilization: 0.85
save_dir: TinyLlama-1.1B-Chat-v1.0-w4a16-asym-awq
1 change: 0 additions & 1 deletion tests/e2e/vLLM/configs/w4a16_grouped_quant_sym_awq.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,3 @@ dataset_id: HuggingFaceH4/ultrachat_200k
dataset_split: train_sft
scheme: W4A16_weight_sym_awq
save_dir: TinyLlama-1.1B-Chat-v1.0-w4a16-sym-awq
gpu_memory_utilization: 0.85
13 changes: 13 additions & 0 deletions tests/e2e/vLLM/test_vllm.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
import gc
import os
import re
import shutil
import sys
import time
from pathlib import Path

import pandas as pd
import pytest
import torch
import yaml
from huggingface_hub import HfApi
from loguru import logger
Expand Down Expand Up @@ -138,6 +141,16 @@ def save_compressed_model(self):
fp.write(recipe_yaml_str)
session.reset()

# Release GPU memory before running vLLM
del self.oneshot_model
del self.tokenizer

gc.collect()
torch.cuda.empty_cache()
torch.cuda.synchronize()
# Give GPU time to fully release memory
time.sleep(2)

if SKIP_HF_UPLOAD.lower() != "yes":
logger.info("================= UPLOADING TO HUB ======================")

Expand Down
Loading