W4fp8 AWQ #1657

@LugerW-A

Description

W4-FP8 (INT4 weights, FP8 activations) AWQ achieves good quantization accuracy and inference acceleration. Can LLMCompressor implement this quantization method and adapt it for vLLM?

Ref:
https://nvidia.github.io/TensorRT-Model-Optimizer/guides/_choosing_quant_methods.html#:~:text=Ampere%20and%20later.-,INT4%2DFP8%20AWQ%20(W4A8),-High
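For illustration, here is a sketch of how such a W4-FP8 scheme might be expressed with the compressed-tensors quantization primitives that LLMCompressor builds on. No W4A8-FP8 preset exists today; the scheme composition and every field value below are assumptions, not an existing API surface for this feature:

```python
# Hypothetical sketch of a W4-FP8 quantization scheme using
# compressed-tensors primitives. All values are assumptions for
# illustration; this is not an existing preset in LLMCompressor.
from compressed_tensors.quantization import (
    QuantizationArgs,
    QuantizationScheme,
    QuantizationStrategy,
    QuantizationType,
)

# INT4 weights, group-wise, symmetric (AWQ-style smoothing scales
# would be applied to the weights before this quantization step)
w4_weights = QuantizationArgs(
    num_bits=4,
    type=QuantizationType.INT,
    strategy=QuantizationStrategy.GROUP,
    group_size=128,
    symmetric=True,
)

# FP8 activations, quantized dynamically per token at runtime
fp8_activations = QuantizationArgs(
    num_bits=8,
    type=QuantizationType.FLOAT,
    strategy=QuantizationStrategy.TOKEN,
    dynamic=True,
    symmetric=True,
)

# Apply the scheme to all Linear layers
w4_fp8_scheme = QuantizationScheme(
    targets=["Linear"],
    weights=w4_weights,
    input_activations=fp8_activations,
)
```

In this framing, supporting the method would mean wiring such a scheme into the AWQ flow (so the smoothing scales are computed before INT4 weight quantization) and adding a matching W4A8-FP8 kernel path in vLLM.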

Labels: enhancement (New feature or request)
