1 change: 0 additions & 1 deletion test/test_cuda/integrations/test_llmc_integration.py

This file was deleted.

239 changes: 239 additions & 0 deletions test/test_xpu/test_llmc_integration.py
@@ -0,0 +1,239 @@
import pytest
import torch
from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme
from llmcompressor import oneshot
from llmcompressor.modifiers.autoround import AutoRoundModifier
from transformers import AutoModelForCausalLM, AutoTokenizer

from auto_round.calib_dataset import get_dataset

recipe_str = """
quant_stage:
quant_modifiers:
AutoRoundModifier:
ignore: ["lm_head"]
iters: 10
config_groups:
group_0:
targets:
- "Linear"
input_activations: null
output_activations: null
weights:
num_bits: 4
type: "int"
symmetric: true
strategy: group
group_size: 128
"""

recipe_modifier_full = AutoRoundModifier(
    ignore=["lm_head"],
    iters=10,
    config_groups={
        "group_0": QuantizationScheme(
            targets=["Linear"],
            weights=QuantizationArgs(num_bits=4, strategy="group", group_size=128),
        )
    },
)
recipe_modifier_nvfp4 = AutoRoundModifier(
    ignore=["lm_head"],
    iters=2,
    scheme="NVFP4",
)

recipe_modifier_mxfp4 = AutoRoundModifier(
    ignore=["lm_head"],
    iters=0,
    scheme="MXFP4",
)
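
# With iters=0, AutoRound's iterative tuning is skipped and weights are
# quantized with plain RTN (round-to-nearest), as test_rtn_oneshot below
# exercises. Of the two W8A8 recipes that follow, the dynamic one computes
# per-token activation scales at runtime (dynamic=True), while the static one
# calibrates a single per-tensor activation scale during the oneshot pass.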

w8a8_dynamic_recipe_modifier = AutoRoundModifier(
    ignore=["lm_head"],
    iters=0,
    config_groups={
        "group_0": QuantizationScheme(
            targets=["Linear"],
            weights=QuantizationArgs(num_bits=8, type="float", strategy="channel"),
            input_activations=QuantizationArgs(num_bits=8, type="float", strategy="token", dynamic=True),
        )
    },
)

w8a8_static_recipe_modifier = AutoRoundModifier(
    ignore=["lm_head"],
    iters=0,
    config_groups={
        "group_0": QuantizationScheme(
            targets=["Linear"],
            weights=QuantizationArgs(num_bits=8, type="float", strategy="tensor"),
            input_activations=QuantizationArgs(num_bits=8, type="float", strategy="tensor"),
        )
    },
)


@pytest.mark.skipif(torch.xpu.device_count() < 1, reason="test requires at least 1 XPU")
@pytest.mark.parametrize(
    "recipe",
    [
        recipe_str,
        recipe_modifier_full,
        recipe_modifier_nvfp4,
        recipe_modifier_mxfp4,
    ],
)
def test_oneshot_application(recipe, tmp_path):
    output = tmp_path / "oneshot_output"
    model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    tokenizer = AutoTokenizer.from_pretrained(model)
    dataset = get_dataset(
        tokenizer=tokenizer,
        seqlen=1024,
        nsamples=32,
    )

    device = "xpu:0" if torch.xpu.is_available() else "cpu"

    oneshot(
        model=model,
        dataset=dataset,
        output_dir=output,
        recipe=recipe,
    )
    model_loaded = AutoModelForCausalLM.from_pretrained(output, device_map=device)

    # Check that the model is quantized; decompress() will attach a
    # quantization_config to the model as we decompress right away
    quantization_config = model_loaded.config.quantization_config.quantization_config
    assert quantization_config is not None

    # check config is set properly
    assert "lm_head" in quantization_config.ignore
    assert len(quantization_config.config_groups) == 1
    quant_scheme = quantization_config.config_groups["group_0"]
    assert isinstance(quant_scheme, QuantizationScheme)

    weight_args = quantization_config.config_groups["group_0"].weights
    assert isinstance(weight_args, QuantizationArgs)
    assert weight_args.num_bits == 4

    # Check a specific layer is quantized
    targeted_linear_layer = model_loaded.model.layers[2].self_attn.q_proj
    assert hasattr(targeted_linear_layer, "quantization_scheme")

    # Check lm_head is not quantized
    not_targeted = model_loaded.lm_head
    assert not hasattr(not_targeted, "quantization_scheme")


@pytest.mark.skipif(torch.xpu.device_count() < 2, reason="test requires at least 2 XPUs")
def test_oneshot_with_device_ids(tmp_path):
    output = tmp_path / "oneshot_output"
    model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    tokenizer = AutoTokenizer.from_pretrained(model)
    dataset = get_dataset(
        tokenizer=tokenizer,
        seqlen=512,
        nsamples=4,
    )

    device = "xpu:0"

    recipe = AutoRoundModifier(
        ignore=["lm_head"],
        iters=10,
        config_groups={
            "group_0": QuantizationScheme(
                targets=["Linear"],
                weights=QuantizationArgs(num_bits=4, strategy="group", group_size=128),
            )
        },
        device_ids="0,1",
    )

    oneshot(
        model=model,
        dataset=dataset,
        output_dir=output,
        recipe=recipe,
    )
    model_loaded = AutoModelForCausalLM.from_pretrained(output, device_map=device)

    # Check that the model is quantized; decompress() will attach a
    # quantization_config to the model as we decompress right away
    quantization_config = model_loaded.config.quantization_config.quantization_config
    assert quantization_config is not None

    # check config is set properly
    assert "lm_head" in quantization_config.ignore
    assert len(quantization_config.config_groups) == 1
    quant_scheme = quantization_config.config_groups["group_0"]
    assert isinstance(quant_scheme, QuantizationScheme)

    weight_args = quantization_config.config_groups["group_0"].weights
    assert isinstance(weight_args, QuantizationArgs)
    assert weight_args.num_bits == 4

    # Check a specific layer is quantized
    targeted_linear_layer = model_loaded.model.layers[2].self_attn.q_proj
    assert hasattr(targeted_linear_layer, "quantization_scheme")

    # Check lm_head is not quantized
    not_targeted = model_loaded.lm_head
    assert not hasattr(not_targeted, "quantization_scheme")


@pytest.mark.skipif(torch.xpu.device_count() < 1, reason="test requires at least 1 XPU")
@pytest.mark.parametrize(
    "recipe",
    [w8a8_dynamic_recipe_modifier, w8a8_static_recipe_modifier],
)
def test_rtn_oneshot(recipe, tmp_path):
    output = tmp_path / "oneshot_output"
    model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    tokenizer = AutoTokenizer.from_pretrained(model)
    dataset = get_dataset(
        tokenizer=tokenizer,
        seqlen=1024,
        nsamples=32,
    )

    device = "xpu:0"

    oneshot(
        model=model,
        dataset=dataset,
        output_dir=output,
        recipe=recipe,
    )
    model_loaded = AutoModelForCausalLM.from_pretrained(output, device_map=device)

    quantization_config = model_loaded.config.quantization_config.quantization_config
    assert quantization_config is not None

    # check config is set properly
    assert "lm_head" in quantization_config.ignore
    assert len(quantization_config.config_groups) == 1
    quant_scheme = quantization_config.config_groups["group_0"]
    assert isinstance(quant_scheme, QuantizationScheme)

    weight_args = quantization_config.config_groups["group_0"].weights
    act_args = quantization_config.config_groups["group_0"].input_activations
    assert isinstance(weight_args, QuantizationArgs)
    assert weight_args.num_bits == recipe.config_groups["group_0"].weights.num_bits
    assert weight_args.strategy == recipe.config_groups["group_0"].weights.strategy
    if act_args is not None:
        assert act_args.num_bits == recipe.config_groups["group_0"].input_activations.num_bits
        assert act_args.strategy == recipe.config_groups["group_0"].input_activations.strategy

    # Check a specific layer is quantized
    targeted_linear_layer = model_loaded.model.layers[2].self_attn.q_proj
    assert hasattr(targeted_linear_layer, "quantization_scheme")

    # Check lm_head is not quantized
    not_targeted = model_loaded.lm_head
    assert not hasattr(not_targeted, "quantization_scheme")
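
For reference, a minimal sketch of how the new suite might be invoked locally, assuming pytest is installed, PyTorch was built with XPU support, and the working directory is the repository root (the file path is taken from the diff header above); on single-XPU machines, test_oneshot_with_device_ids is skipped by its skipif marker:

import pytest

# Run the new XPU integration tests verbosely.
pytest.main(["-v", "test/test_xpu/test_llmc_integration.py"])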