
Autoround quantization bug with GLM-4.5-Air #2076

@Minami-su

Description

Running oneshot with AutoRoundModifier (W4A16, iters=200) on a GLM-4.5-Air based model fails while quantizing the first decoder block (model.layers.0) with IndexError: index 1 is out of bounds for dimension 0 with size 1, raised from auto_round's _sampling_inputs. Reproduction script and full traceback below.
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from typing import Any, Dict, List
import argparse

# CUDA_VISIBLE_DEVICES=3 python /data/jcxy/haolu/workspace/training/alignment-handbook/scripts/llm_compressor.py
from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier, AWQMapping
from llmcompressor.modifiers.quantization.gptq import GPTQModifier
from llmcompressor.modifiers.autoround import AutoRoundModifier
parser = argparse.ArgumentParser()
parser.add_argument("--model_path", default="/data2/jcxy/llm_model/Qwen2.5-0.5B", type=str)
parser.add_argument("--quant_path", default="/data2/jcxy/llm_model/Qwen2.5-0.5B-AWQ", type=str)
parser.add_argument("--data_path", default="/data/jcxy/haolu/workspace/frameworks/arena-hard-simple/data/arena-hard-v2.0/PsyLLMV4_5-Kairosa-251121-Instruct-SFT-GLM.jsonl", type=str)
args = parser.parse_args()

model_path = args.model_path
print(model_path)
quant_path = args.quant_path
print(quant_path)
data_path = args.data_path
print(data_path)

# Select model and load it.
MODEL_ID = model_path

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, device_map="auto", torch_dtype="auto"
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Select the number of calibration samples and the max sequence length.
NUM_CALIBRATION_SAMPLES = 128
MAX_SEQUENCE_LENGTH = 8192

# Load dataset and preprocess.
raw_dataset = load_dataset("json", data_files=data_path, split="train")
raw_dataset = raw_dataset.shuffle(seed=42)

def preprocess(data_point):
    full_conversation = data_point['messages']
    full_text = tokenizer.apply_chat_template(
        full_conversation,
        tokenize=False,
        add_generation_prompt=False,
    )
    return {"text": full_text}

# 1. Map to create the 'text' column
processed_dataset = raw_dataset.map(
    preprocess,
    num_proc=12,
    desc="Generating calibration text"
)

# 2. Remove all original columns.
original_columns = list(raw_dataset.features)
processed_dataset = processed_dataset.remove_columns(original_columns)

# 3. *** Core change ***
# Instead of filtering out over-long samples, truncate them to MAX_SEQUENCE_LENGTH.
def truncate_examples(batch):
    # The tokenizer processes a batch of texts and truncates them to max_length.
    tokenized_output = tokenizer(
        batch["text"],
        truncation=True,
        max_length=MAX_SEQUENCE_LENGTH,
    )
    # Decode the truncated tokens back into text strings.
    batch["text"] = tokenizer.batch_decode(tokenized_output["input_ids"], skip_special_tokens=True)
    return batch

processed_dataset = processed_dataset.map(
    truncate_examples,
    batched=True,  # use batching for speed
    num_proc=12,
    desc=f"Truncating samples to max length {MAX_SEQUENCE_LENGTH}"
)
# 4. Filter by length after creating the text and before passing to oneshot.
# Note: this is less efficient than filtering earlier, but safer for complex pipelines.
# A better way is to compute the token length inside preprocess, tag over-long samples,
# and drop them with a cheap filter (see the sketch after the filter below).
# For simplicity, we filter here.
processed_dataset = processed_dataset.filter(
    lambda example: len(tokenizer(example["text"]).input_ids) <= MAX_SEQUENCE_LENGTH,
    num_proc=12,
    desc=f"Filtering samples with length <= {MAX_SEQUENCE_LENGTH}"
)
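
# A hedged sketch of the tag-then-filter alternative mentioned above (commented out,
# not part of this reproduction; the "keep" column name is illustrative, not from
# the original script):
# def preprocess_with_length(data_point):
#     text = tokenizer.apply_chat_template(
#         data_point["messages"], tokenize=False, add_generation_prompt=False
#     )
#     n_tokens = len(tokenizer(text).input_ids)
#     return {"text": text, "keep": n_tokens <= MAX_SEQUENCE_LENGTH}
#
# processed_dataset = raw_dataset.map(preprocess_with_length, num_proc=12)
# processed_dataset = processed_dataset.filter(lambda ex: ex["keep"], num_proc=12)
# processed_dataset = processed_dataset.remove_columns(
#     [c for c in processed_dataset.column_names if c != "text"]
# )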

# config_groups = {
#     "group_0": {
#         "targets": ["Linear"],
        # "input_activations": None,
        # "output_activations": None,
#         "weights": {
#             "num_bits": 8,
#             "type": "int",
#             "symmetric": True,
#             "strategy": "group",
#             "group_size": 128,
#         }
#     }
# }
# recipe = [
#     AWQModifier(
#         ignore=["lm_head", "re:.*mlp.gate$"],
#         config_groups=config_groups,
#     ),
# ]
# Define the optimal W8A16 recipe (built to the stated requirements; kept for reference)
# recipe = [
#     AWQModifier(
#         targets=["Linear"],
#         # Only ignore the most sensitive head and the embeddings; MoE experts can safely be quantized
#         ignore=['lm_head', 'model.embed_tokens', 're:.*input_layernorm$', 're:.*post_attention_layernorm$',
#         'model.norm', 're:.*shared_experts.*', 're:model.layers.0.*', 're:.*mlp.gate'], 
#         config_groups={
#             "group_0": {
#                 "targets": [
#                     "re:.*gate_proj.*", "re:.*up_proj.*", "re:.*down_proj.*",
#                     "re:.*k_proj.*", "re:.*q_proj.*", "re:.*v_proj.*", "re:.*o_proj.*"
#                 ],
#                 "weights": {
#                     "num_bits": 8,
#                     "type": "int",
#                     "symmetric": True,
#                     "group_size": 128,  # the gold standard for W8
#                     "strategy": "group",
#                     "dynamic": False,
#                     "actorder": None,
#                     "observer": "mse",
#                     "observer_kwargs": {}
#                 },
#                 "input_activations": None,
#                 "output_activations": None,
#                 "format": None
#             }
#         },
#         # Explicitly define the mappings (based on the provided config)
#         mappings=[
#             AWQMapping(smooth_layer="re:.*input_layernorm$", balance_layers=["re:.*q_proj$", "re:.*k_proj$", "re:.*v_proj$"]),
#             AWQMapping(smooth_layer="re:.*v_proj$", balance_layers=["re:.*o_proj$"]),
#             AWQMapping(smooth_layer="re:.*post_attention_layernorm$", balance_layers=["re:.*gate_proj$", "re:.*up_proj$"]),
#             AWQMapping(smooth_layer="re:.*up_proj$", balance_layers=["re:.*down_proj$"]),
#         ],
#         duo_scaling=True
#     )
# ]
# Configure the quantization algorithm to run.
recipe = AutoRoundModifier(
    targets="Linear", scheme="W4A16", ignore=['lm_head'], iters=200
)

# Apply algorithms.
oneshot(
    model=model,
    dataset=processed_dataset,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

# Save to disk compressed.
SAVE_DIR = quant_path
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)

# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================\n\n")

(haolu) haolu@xya800:/data/jcxy/haolu/workspace/training/alignment-handbook/scripts$ bash quantize-awq.sh 
Activating Conda environment: haolu
Starting model quantization (AWQ)...
/data/jcxy/haolu/anaconda3/envs/haolu/lib/python3.10/site-packages/pydantic/_internal/_generate_schema.py:2249: UnsupportedFieldAttributeWarning: The 'repr' attribute with value False was provided to the `Field()` function, which has no effect in the context it was used. 'repr' is field-specific metadata, and can only be attached to a model field using `Annotated` metadata or by assignment. This may have happened because an `Annotated` type alias using the `type` statement was used, or if the `Field()` function was attached to a single member of a union type.
  warnings.warn(
/data/jcxy/haolu/anaconda3/envs/haolu/lib/python3.10/site-packages/pydantic/_internal/_generate_schema.py:2249: UnsupportedFieldAttributeWarning: The 'frozen' attribute with value True was provided to the `Field()` function, which has no effect in the context it was used. 'frozen' is field-specific metadata, and can only be attached to a model field using `Annotated` metadata or by assignment. This may have happened because an `Annotated` type alias using the `type` statement was used, or if the `Field()` function was attached to a single member of a union type.
  warnings.warn(
[2025-11-30 08:56:30,725] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
/data/jcxy/haolu/anaconda3/envs/haolu/lib/python3.10/site-packages/neural_compressor/utils/utility.py:50: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
  from pkg_resources import parse_version
2025-11-30 08:56:32 WARNING __init__.py L22: AutoScheme is currently supported only on Linux.
/data2/jcxy/llm_model/PsyLLMV4_5-Kairosa-251121-Instruct-SFT-GLM-E
/data2/jcxy/llm_model/PsyLLMV4_5-Kairosa-251121-Instruct-SFT-GLM-E-W4A16-G128-AutoRoundV3
/data/jcxy/haolu/workspace/frameworks/arena-hard-simple/data/arena-hard-v2.0/PsyLLMV4_5-Kairosa-251121-Instruct-SFT-GLM.jsonl
2025-11-30 08:56:32 WARNING modeling_utils.py L4670: `torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 54/54 [00:49<00:00,  1.08it/s]
Some parameters are on the meta device because they were offloaded to the cpu.
Tokenizing: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 8800/8800 [01:28<00:00, 99.02 examples/s]
2025-11-30T08:59:01.192353+0800 | reset | INFO - Compression lifecycle reset
2025-11-30T08:59:01.192861+0800 | from_modifiers | INFO - Creating recipe from modifiers
2025-11-30T08:59:07.722660+0800 | initialize | INFO - Compression lifecycle initialized for 1 modifiers
2025-11-30T08:59:07.769677+0800 | IndependentPipeline | INFO - Inferred `SequentialPipeline` for `AutoRoundModifier`
2025-11-30T09:00:49.767301+0800 | get_sequential_targets | WARNING - Passing sequential targets through modifiers is deprecated, please use `oneshot(sequential_targets=...)`
Preparing cache: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 128/128 [00:16<00:00,  7.89it/s]
(1/47): Calibrating: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 128/128 [00:08<00:00, 14.39it/s]
2025-11-30T09:01:30.256928+0800 | apply_autoround | INFO - Applying AutoRound on layer model.layers.0
2025-11-30 09:01:30 INFO base.py L365: using torch.bfloat16 for quantization tuning
Traceback (most recent call last):
  File "/data/jcxy/haolu/workspace/training/alignment-handbook/scripts/llm_compressor-autoround.py", line 155, in <module>
    oneshot(
  File "/data/jcxy/haolu/workspace/training/alignment-handbook/scripts/llm-compressor/src/llmcompressor/entrypoints/oneshot.py", line 347, in oneshot
    one_shot()
  File "/data/jcxy/haolu/workspace/training/alignment-handbook/scripts/llm-compressor/src/llmcompressor/entrypoints/oneshot.py", line 172, in __call__
    self.apply_recipe_modifiers(
  File "/data/jcxy/haolu/workspace/training/alignment-handbook/scripts/llm-compressor/src/llmcompressor/entrypoints/oneshot.py", line 220, in apply_recipe_modifiers
    pipeline(
  File "/data/jcxy/haolu/workspace/training/alignment-handbook/scripts/llm-compressor/src/llmcompressor/pipelines/independent/pipeline.py", line 45, in __call__
    pipeline(model, dataloader, dataset_args)
  File "/data/jcxy/haolu/workspace/training/alignment-handbook/scripts/llm-compressor/src/llmcompressor/pipelines/sequential/pipeline.py", line 107, in __call__
    LifecycleCallbacks.sequential_epoch_end(subgraph)
  File "/data/jcxy/haolu/workspace/training/alignment-handbook/scripts/llm-compressor/src/llmcompressor/core/session_functions.py", line 165, in sequential_epoch_end
    return cls.event(EventType.SEQUENTIAL_EPOCH_END, subgraph=subgraph, **kwargs)
  File "/data/jcxy/haolu/workspace/training/alignment-handbook/scripts/llm-compressor/src/llmcompressor/core/session_functions.py", line 89, in event
    return active_session().event(event_type, **kwargs)
  File "/data/jcxy/haolu/workspace/training/alignment-handbook/scripts/llm-compressor/src/llmcompressor/core/session.py", line 187, in event
    mod_data = self._lifecycle.event(
  File "/data/jcxy/haolu/workspace/training/alignment-handbook/scripts/llm-compressor/src/llmcompressor/core/lifecycle.py", line 204, in event
    data = mod.update_event(state=self.state, event=event, **kwargs)
  File "/data/jcxy/haolu/workspace/training/alignment-handbook/scripts/llm-compressor/src/llmcompressor/modifiers/modifier.py", line 123, in update_event
    self.on_event(state, event, **kwargs)
  File "/data/jcxy/haolu/workspace/training/alignment-handbook/scripts/llm-compressor/src/llmcompressor/modifiers/autoround/base.py", line 179, in on_event
    self.apply_autoround(state, subgraph)
  File "/data/jcxy/haolu/workspace/training/alignment-handbook/scripts/llm-compressor/src/llmcompressor/modifiers/autoround/base.py", line 235, in apply_autoround
    q_input, _ = ar.quantize_block(
  File "/data/jcxy/haolu/anaconda3/envs/haolu/lib/python3.10/site-packages/auto_round/compressors/base.py", line 2498, in quantize_block
    return self._quantize_block(block, input_ids, input_others, q_input, device, auto_offload)
  File "/data/jcxy/haolu/anaconda3/envs/haolu/lib/python3.10/site-packages/auto_round/compressors/base.py", line 2552, in _quantize_block
    output = self._get_block_outputs(
  File "/data/jcxy/haolu/anaconda3/envs/haolu/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
    return func(*args, **kwargs)
  File "/data/jcxy/haolu/anaconda3/envs/haolu/lib/python3.10/site-packages/auto_round/compressors/base.py", line 1808, in _get_block_outputs
    tmp_input_ids, tmp_input_others = self._sampling_inputs(
  File "/data/jcxy/haolu/anaconda3/envs/haolu/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
    return func(*args, **kwargs)
  File "/data/jcxy/haolu/anaconda3/envs/haolu/lib/python3.10/site-packages/auto_round/compressors/base.py", line 3173, in _sampling_inputs
    current_input_others[key] = [input_others[key][i] for i in indices]
  File "/data/jcxy/haolu/anaconda3/envs/haolu/lib/python3.10/site-packages/auto_round/compressors/base.py", line 3173, in <listcomp>
    current_input_others[key] = [input_others[key][i] for i in indices]
IndexError: index 1 is out of bounds for dimension 0 with size 1
Model quantization failed.
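
For what it's worth, the failing line in _sampling_inputs indexes each entry of input_others with the sampled batch indices, so the error suggests that at least one cached per-block input for GLM-4.5-Air is stored with a batch dimension of 1 while the sampler draws index 1. A minimal sketch of that indexing pattern (key name and shapes are illustrative assumptions, not taken from auto_round):

import torch

# Hypothetical cached "other" inputs as _sampling_inputs might see them;
# the key name and shapes are illustrative, not from auto_round itself.
input_others = {"position_ids": torch.zeros(1, 8192, dtype=torch.long)}  # dim 0 has size 1
indices = [0, 1]  # the sampler assumes more than one cached batch

current_input_others = {}
for key in input_others:
    # Raises IndexError: index 1 is out of bounds for dimension 0 with size 1
    current_input_others[key] = [input_others[key][i] for i in indices]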

Labels: autoround (For any PR / issue related to autoround support)
