from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from typing import Any, Dict, List
import argparse
# CUDA_VISIBLE_DEVICES=3 python /data/jcxy/haolu/workspace/training/alignment-handbook/scripts/llm_compressor.py
from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier, AWQMapping
from llmcompressor.modifiers.quantization.gptq import GPTQModifier
from llmcompressor.modifiers.autoround import AutoRoundModifier
parser = argparse.ArgumentParser()
parser.add_argument("--model_path", default="/data2/jcxy/llm_model/Qwen2.5-0.5B", type=str)
parser.add_argument("--quant_path", default="/data2/jcxy/llm_model/Qwen2.5-0.5B-AWQ", type=str)
parser.add_argument("--data_path", default="/data/jcxy/haolu/workspace/frameworks/arena-hard-simple/data/arena-hard-v2.0/PsyLLMV4_5-Kairosa-251121-Instruct-SFT-GLM.jsonl", type=str)
args = parser.parse_args()
model_path = args.model_path
print(model_path)
quant_path = args.quant_path
print(quant_path)
data_path = args.data_path
print(data_path)
# Select model and load it.
MODEL_ID = model_path
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, device_map="auto", torch_dtype="auto"
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
# Select the number of calibration samples. 256 is a good starting point; 128 is used here.
NUM_CALIBRATION_SAMPLES = 128
MAX_SEQUENCE_LENGTH = 8192
# Load dataset and preprocess.
raw_dataset = load_dataset("json", data_files=data_path, split="train")
raw_dataset = raw_dataset.shuffle(seed=42)
def preprocess(data_point):
    full_conversation = data_point['messages']
    full_text = tokenizer.apply_chat_template(
        full_conversation,
        tokenize=False,
        add_generation_prompt=False,
    )
    return {"text": full_text}
# 1. Map to create the 'text' column
processed_dataset = raw_dataset.map(
    preprocess,
    num_proc=12,
    desc="Generating calibration text"
)
# 2. Remove all original columns.
original_columns = list(raw_dataset.features)
processed_dataset = processed_dataset.remove_columns(original_columns)
# 3. *** This is the core change ***
# Instead of filtering out over-long samples, we truncate them to MAX_SEQUENCE_LENGTH.
def truncate_examples(batch):
    # The tokenizer processes a batch of texts, truncating each to max_length
    tokenized_output = tokenizer(
        batch["text"],
        truncation=True,
        max_length=MAX_SEQUENCE_LENGTH,
    )
    # Decode the truncated tokens back into text strings
    batch["text"] = tokenizer.batch_decode(tokenized_output["input_ids"], skip_special_tokens=True)
    return batch
processed_dataset = processed_dataset.map(
    truncate_examples,
    batched=True,  # batch processing for speed
    num_proc=12,
    desc=f"Truncating samples to max length {MAX_SEQUENCE_LENGTH}"
)
# 4. Filter by length after creating the text and before passing to oneshot.
# Note: This is less efficient than filtering earlier, but safer for complex pipelines.
# A better way is to compute the token length inside preprocess and filter on that column
# (a hypothetical sketch follows this block). For simplicity, we re-tokenize and filter here.
processed_dataset = processed_dataset.filter(
    lambda example: len(tokenizer(example["text"]).input_ids) <= MAX_SEQUENCE_LENGTH,
    num_proc=12,
    desc=f"Filtering samples with length <= {MAX_SEQUENCE_LENGTH}"
)
# config_groups = {
#     "group_0": {
#         "targets": ["Linear"],
#         "input_activations": None,
#         "output_activations": None,
#         "weights": {
#             "num_bits": 8,
#             "type": "int",
#             "symmetric": True,
#             "strategy": "group",
#             "group_size": 128,
#         }
#     }
# }
# recipe = [
#     AWQModifier(
#         ignore=["lm_head", "re:.*mlp.gate$"],
#         config_groups=config_groups,
#     ),
# ]
# Define the preferred W8A16 recipe
# (built to the requested W8A16 configuration)
# recipe = [
#     AWQModifier(
#         targets=["Linear"],
#         # Only ignore the most sensitive head and embedding layers; the MoE experts can safely be quantized
#         ignore=['lm_head', 'model.embed_tokens', 're:.*input_layernorm$', 're:.*post_attention_layernorm$',
#                 'model.norm', 're:.*shared_experts.*', 're:model.layers.0.*', 're:.*mlp.gate'],
#         config_groups={
#             "group_0": {
#                 "targets": [
#                     "re:.*gate_proj.*", "re:.*up_proj.*", "re:.*down_proj.*",
#                     "re:.*k_proj.*", "re:.*q_proj.*", "re:.*v_proj.*", "re:.*o_proj.*"
#                 ],
#                 "weights": {
#                     "num_bits": 8,
#                     "type": "int",
#                     "symmetric": True,
#                     "group_size": 128,  # the standard group size for W8
#                     "strategy": "group",
#                     "dynamic": False,
#                     "actorder": None,
#                     "observer": "mse",
#                     "observer_kwargs": {}
#                 },
#                 "input_activations": None,
#                 "output_activations": None,
#                 "format": None
#             }
#         },
#         # Explicitly define the mappings (based on the configuration provided)
#         mappings=[
#             AWQMapping(smooth_layer="re:.*input_layernorm$", balance_layers=["re:.*q_proj$", "re:.*k_proj$", "re:.*v_proj$"]),
#             AWQMapping(smooth_layer="re:.*v_proj$", balance_layers=["re:.*o_proj$"]),
#             AWQMapping(smooth_layer="re:.*post_attention_layernorm$", balance_layers=["re:.*gate_proj$", "re:.*up_proj$"]),
#             AWQMapping(smooth_layer="re:.*up_proj$", balance_layers=["re:.*down_proj$"]),
#         ],
#         duo_scaling=True
#     )
# ]
# Configure the quantization algorithm to run.
recipe = AutoRoundModifier(
    targets="Linear", scheme="W4A16", ignore=['lm_head'], iters=200
)
# Apply algorithms.
oneshot(
    model=model,
    dataset=processed_dataset,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)
# Save to disk compressed.
SAVE_DIR = quant_path
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================\n\n")
(haolu) haolu@xya800:/data/jcxy/haolu/workspace/training/alignment-handbook/scripts$ bash quantize-awq.sh
Activating Conda environment: haolu
Starting model quantization (AWQ)...
/data/jcxy/haolu/anaconda3/envs/haolu/lib/python3.10/site-packages/pydantic/_internal/_generate_schema.py:2249: UnsupportedFieldAttributeWarning: The 'repr' attribute with value False was provided to the `Field()` function, which has no effect in the context it was used. 'repr' is field-specific metadata, and can only be attached to a model field using `Annotated` metadata or by assignment. This may have happened because an `Annotated` type alias using the `type` statement was used, or if the `Field()` function was attached to a single member of a union type.
warnings.warn(
/data/jcxy/haolu/anaconda3/envs/haolu/lib/python3.10/site-packages/pydantic/_internal/_generate_schema.py:2249: UnsupportedFieldAttributeWarning: The 'frozen' attribute with value True was provided to the `Field()` function, which has no effect in the context it was used. 'frozen' is field-specific metadata, and can only be attached to a model field using `Annotated` metadata or by assignment. This may have happened because an `Annotated` type alias using the `type` statement was used, or if the `Field()` function was attached to a single member of a union type.
warnings.warn(
[2025-11-30 08:56:30,725] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
/data/jcxy/haolu/anaconda3/envs/haolu/lib/python3.10/site-packages/neural_compressor/utils/utility.py:50: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
from pkg_resources import parse_version
2025-11-30 08:56:32 WARNING __init__.py L22: AutoScheme is currently supported only on Linux.
/data2/jcxy/llm_model/PsyLLMV4_5-Kairosa-251121-Instruct-SFT-GLM-E
/data2/jcxy/llm_model/PsyLLMV4_5-Kairosa-251121-Instruct-SFT-GLM-E-W4A16-G128-AutoRoundV3
/data/jcxy/haolu/workspace/frameworks/arena-hard-simple/data/arena-hard-v2.0/PsyLLMV4_5-Kairosa-251121-Instruct-SFT-GLM.jsonl
2025-11-30 08:56:32 WARNING modeling_utils.py L4670: `torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 54/54 [00:49<00:00, 1.08it/s]
Some parameters are on the meta device because they were offloaded to the cpu.
Tokenizing: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 8800/8800 [01:28<00:00, 99.02 examples/s]
2025-11-30T08:59:01.192353+0800 | reset | INFO - Compression lifecycle reset
2025-11-30T08:59:01.192861+0800 | from_modifiers | INFO - Creating recipe from modifiers
2025-11-30T08:59:07.722660+0800 | initialize | INFO - Compression lifecycle initialized for 1 modifiers
2025-11-30T08:59:07.769677+0800 | IndependentPipeline | INFO - Inferred `SequentialPipeline` for `AutoRoundModifier`
2025-11-30T09:00:49.767301+0800 | get_sequential_targets | WARNING - Passing sequential targets through modifiers is deprecated, please use `oneshot(sequential_targets=...)`
Preparing cache: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 128/128 [00:16<00:00, 7.89it/s]
(1/47): Calibrating: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 128/128 [00:08<00:00, 14.39it/s]
2025-11-30T09:01:30.256928+0800 | apply_autoround | INFO - Applying AutoRound on layer model.layers.0
2025-11-30 09:01:30 INFO base.py L365: using torch.bfloat16 for quantization tuning
Traceback (most recent call last):
File "/data/jcxy/haolu/workspace/training/alignment-handbook/scripts/llm_compressor-autoround.py", line 155, in <module>
oneshot(
File "/data/jcxy/haolu/workspace/training/alignment-handbook/scripts/llm-compressor/src/llmcompressor/entrypoints/oneshot.py", line 347, in oneshot
one_shot()
File "/data/jcxy/haolu/workspace/training/alignment-handbook/scripts/llm-compressor/src/llmcompressor/entrypoints/oneshot.py", line 172, in __call__
self.apply_recipe_modifiers(
File "/data/jcxy/haolu/workspace/training/alignment-handbook/scripts/llm-compressor/src/llmcompressor/entrypoints/oneshot.py", line 220, in apply_recipe_modifiers
pipeline(
File "/data/jcxy/haolu/workspace/training/alignment-handbook/scripts/llm-compressor/src/llmcompressor/pipelines/independent/pipeline.py", line 45, in __call__
pipeline(model, dataloader, dataset_args)
File "/data/jcxy/haolu/workspace/training/alignment-handbook/scripts/llm-compressor/src/llmcompressor/pipelines/sequential/pipeline.py", line 107, in __call__
LifecycleCallbacks.sequential_epoch_end(subgraph)
File "/data/jcxy/haolu/workspace/training/alignment-handbook/scripts/llm-compressor/src/llmcompressor/core/session_functions.py", line 165, in sequential_epoch_end
return cls.event(EventType.SEQUENTIAL_EPOCH_END, subgraph=subgraph, **kwargs)
File "/data/jcxy/haolu/workspace/training/alignment-handbook/scripts/llm-compressor/src/llmcompressor/core/session_functions.py", line 89, in event
return active_session().event(event_type, **kwargs)
File "/data/jcxy/haolu/workspace/training/alignment-handbook/scripts/llm-compressor/src/llmcompressor/core/session.py", line 187, in event
mod_data = self._lifecycle.event(
File "/data/jcxy/haolu/workspace/training/alignment-handbook/scripts/llm-compressor/src/llmcompressor/core/lifecycle.py", line 204, in event
data = mod.update_event(state=self.state, event=event, **kwargs)
File "/data/jcxy/haolu/workspace/training/alignment-handbook/scripts/llm-compressor/src/llmcompressor/modifiers/modifier.py", line 123, in update_event
self.on_event(state, event, **kwargs)
File "/data/jcxy/haolu/workspace/training/alignment-handbook/scripts/llm-compressor/src/llmcompressor/modifiers/autoround/base.py", line 179, in on_event
self.apply_autoround(state, subgraph)
File "/data/jcxy/haolu/workspace/training/alignment-handbook/scripts/llm-compressor/src/llmcompressor/modifiers/autoround/base.py", line 235, in apply_autoround
q_input, _ = ar.quantize_block(
File "/data/jcxy/haolu/anaconda3/envs/haolu/lib/python3.10/site-packages/auto_round/compressors/base.py", line 2498, in quantize_block
return self._quantize_block(block, input_ids, input_others, q_input, device, auto_offload)
File "/data/jcxy/haolu/anaconda3/envs/haolu/lib/python3.10/site-packages/auto_round/compressors/base.py", line 2552, in _quantize_block
output = self._get_block_outputs(
File "/data/jcxy/haolu/anaconda3/envs/haolu/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
return func(*args, **kwargs)
File "/data/jcxy/haolu/anaconda3/envs/haolu/lib/python3.10/site-packages/auto_round/compressors/base.py", line 1808, in _get_block_outputs
tmp_input_ids, tmp_input_others = self._sampling_inputs(
File "/data/jcxy/haolu/anaconda3/envs/haolu/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
return func(*args, **kwargs)
File "/data/jcxy/haolu/anaconda3/envs/haolu/lib/python3.10/site-packages/auto_round/compressors/base.py", line 3173, in _sampling_inputs
current_input_others[key] = [input_others[key][i] for i in indices]
File "/data/jcxy/haolu/anaconda3/envs/haolu/lib/python3.10/site-packages/auto_round/compressors/base.py", line 3173, in <listcomp>
current_input_others[key] = [input_others[key][i] for i in indices]
IndexError: index 1 is out of bounds for dimension 0 with size 1
Model quantization failed.