-
Notifications
You must be signed in to change notification settings - Fork 453
Closed
Labels
bug — Something isn't working
Description
⚙️ Your current environment
requirements.txt available at https://github.com/neuralmagic/GuardBench/blob/main/llama4guard_vllm_requirements.txt
🐛 Describe the bug
I've tried to quantize meta-llama/Llama-Guard-4-12B with LLM-Compressor and spawned two jobs in parallel on a single 8xH100 server. The entire server collapsed with "out of memory" msgs in logs.
Could there be some kind of memory leak in LLM-Compressor?
Script for quantization:
from datasets import load_dataset
from transformers import AutoProcessor, Llama4ForConditionalGeneration
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor import oneshot
import argparse
from compressed_tensors.quantization import QuantizationScheme, QuantizationArgs, QuantizationType, QuantizationStrategy
def parse_actorder(value):
    """argparse ``type=`` converter for ``--actorder``.

    Accepts (case-insensitively) ``"false"``, ``"group"``, or ``"weight"``
    and returns ``False``, ``"group"``, or ``"weight"`` respectively.

    Raises:
        argparse.ArgumentTypeError: for any other input.
    """
    normalized = value.lower()
    if normalized == "false":
        return False
    if normalized in ("group", "weight"):
        return normalized
    raise argparse.ArgumentTypeError("Invalid value for --actorder. Use 'group', 'weight', or 'False'.")
def parse_sym(value):
    """argparse ``type=`` converter for ``--sym``.

    Maps the strings ``"true"``/``"false"`` (case-insensitively) to the
    booleans ``True``/``False``.

    Raises:
        argparse.ArgumentTypeError: for any other input.
    """
    lowered = value.lower()
    if lowered not in ("true", "false"):
        raise argparse.ArgumentTypeError(f"Invalid value for --sym. Use false or true, but got {value}")
    return lowered == "true"
# ---------------------------------------------------------------------------
# Command-line interface. Every quantization knob is a required flag so each
# sweep run states its hyperparameters explicitly.
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser()
parser.add_argument('--model_path', type=str, required=True)  # passed to from_pretrained below (HF id or local path)
parser.add_argument('--quant_path', type=str, required=True)  # output directory for save_pretrained
parser.add_argument('--group_size', type=int, required=True)  # weight-quantization group size (fed to QuantizationArgs)
parser.add_argument('--calib_size', type=int, required=True)  # num_calibration_samples for oneshot
parser.add_argument('--dampening_frac', type=float, required=True)  # GPTQModifier dampening_frac
parser.add_argument('--observer', type=str, required=True) # mse or minmax
parser.add_argument('--sym', type=parse_sym, required=True) # true or false
parser.add_argument('--actorder', type=parse_actorder, required=True) # group or weight or false
parser.add_argument('--pipeline', type=str, default="basic") #['basic', 'datafree', 'sequential', independent]
args = parser.parse_args()
print(f"[DEBUGGING ARGS] {args}")
# Load the model with its checkpoint dtype ("auto"); trust_remote_code allows
# custom modeling code shipped with the checkpoint.
model = Llama4ForConditionalGeneration.from_pretrained(
    args.model_path,
    torch_dtype="auto",
    trust_remote_code=True,
)
# Multimodal processor; used below for chat-template rendering of the
# calibration samples.
processor = AutoProcessor.from_pretrained(args.model_path, trust_remote_code=True)
def preprocess_fn(example):
    """Render one calibration sample through the chat template.

    Rewrites each message's plain-string ``content`` into the multimodal
    list-of-parts form expected by the processor (mutates ``example`` in
    place), then returns ``{"text": <rendered chat string>}``.
    """
    messages = example["messages"]
    for message in messages:
        message["content"] = [{'type': 'text', 'text': message['content']}]
    rendered = processor.apply_chat_template(
        messages, add_generation_prompt=False, tokenize=False
    )
    return {"text": rendered}
# Calibration dataset; each row is rendered to a single "text" field via the
# chat template so it matches the model's expected prompt format.
ds = load_dataset("neuralmagic/LLM_compression_calibration", split="train")
ds = ds.map(preprocess_fn)
# Print one rendered sample so the prompt formatting can be eyeballed in logs.
print(f"================================================================================")
print(f"[For debugging] Calibration data sample is:\n{repr(ds[0]['text'])}")
print(f"================================================================================")
# W4A16 scheme: INT4 grouped weight quantization on Linear layers; input and
# output activations are left unquantized (None).
quant_scheme = QuantizationScheme(
    targets=["Linear"],
    weights=QuantizationArgs(
        num_bits=4,
        type=QuantizationType.INT,
        symmetric=args.sym,          # parsed to bool by parse_sym
        group_size=args.group_size,
        strategy=QuantizationStrategy.GROUP,
        observer=args.observer,      # "mse" or "minmax" per CLI help
        actorder=args.actorder       # False, "group", or "weight" (parse_actorder)
    ),
    input_activations=None,
    output_activations=None,
)
# GPTQ recipe; lm_head, the multimodal projector, and the vision tower are
# excluded from quantization via regex ignore patterns.
recipe = [
    GPTQModifier(
        targets=["Linear"],
        ignore=[
            "re:.*lm_head",
            "re:.*multi_modal_projector",
            "re:.*vision_model",
        ],
        dampening_frac=args.dampening_frac,
        config_groups={"group_0": quant_scheme},
    )
]
# One-shot (calibration-only, no training) application of the GPTQ recipe.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    num_calibration_samples=args.calib_size,
    max_seq_length=8192,
    pipeline=args.pipeline,  # e.g. basic / datafree / sequential / independent
)
# Persist the quantized model to the requested output directory.
SAVE_DIR = args.quant_path
model.save_pretrained(SAVE_DIR)
print(f"Model saved to {SAVE_DIR}")

with:
python llmcompressor_int4.py --model_path meta-llama/Llama-Guard-4-12B --quant_path meta-llama/Llama-Guard-4-12B-quantized.w4a16_calib512_damp0.03_obsMinMax_sym_actorderWeight_pipelineSequential --group_size 128 --calib_size 512 --dampening_frac 0.03 --observer minmax --sym True --actorder weight --pipeline independent

🛠️ Steps to reproduce
No response
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
bug — Something isn't working