@@ -1,4 +1,4 @@
-cadence: "commit"
+cadence: "nightly"
 test_type: "regression"
 compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-compressed
 uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-uncompressed
@@ -1,4 +1,4 @@
-cadence: "commit"
+cadence: "nightly"
 test_type: "regression"
 compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A8-Dynamic-Per-Token-compressed
 uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A8-Dynamic-Per-Token-uncompressed
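Both configs move these regression checks from the per-commit cadence to the nightly cadence. As a rough sketch of how cadence-gated YAML configs like these can be consumed (the actual filtering lives in parse_params from tests/testing_utils.py; the CADENCE environment variable and the load_configs helper below are illustrative assumptions, not the repo's API):

import os
from pathlib import Path

import yaml


def load_configs(config_dir: str, default_cadence: str = "commit") -> list:
    """Collect config dicts whose cadence matches the current test run."""
    cadence = os.environ.get("CADENCE", default_cadence).lower()
    configs = []
    for path in Path(config_dir).glob("*.yaml"):
        config = yaml.safe_load(path.read_text())
        # After this diff, these two configs only load when cadence == "nightly",
        # so they no longer run on every commit.
        if config.get("cadence", "").lower() == cadence:
            configs.append(config)
    return configs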
222 changes: 135 additions & 87 deletions tests/llmcompressor/transformers/compression/test_run_compressed.py
@@ -1,134 +1,182 @@
-import copy
 import shutil
 import tempfile
 import unittest

 import torch
-from compressed_tensors import QUANTIZATION_CONFIG_NAME
-from compressed_tensors.compressors import ModelCompressor
-from compressed_tensors.quantization import QuantizationStatus
+from compressed_tensors.linear.compressed_linear import CompressedLinear
+from compressed_tensors.quantization.utils import iter_named_leaf_modules
 from parameterized import parameterized_class
-from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer
 from transformers.utils.quantization_config import CompressedTensorsConfig

 from tests.testing_utils import parse_params, requires_gpu

-CONFIG_DIR = "tests/llmcompressor/transformers/compression/decompression_configs"
+COMPRESSED_LINEAR_CONFIG_DIR = (
+    "tests/llmcompressor/transformers/compression/run_compressed_configs"
+)


 @requires_gpu
-@parameterized_class(parse_params(CONFIG_DIR))
-class TestDecompression(unittest.TestCase):
+@parameterized_class(parse_params(COMPRESSED_LINEAR_CONFIG_DIR))
+class Test_Decompressed_Linear_Uncompressed_Linear(unittest.TestCase):
     """
-    Check that HFQuantizer decompression is working as expected.
-    Manually decompress a compressed model and compare the generations
+    Uncompressed-Linear-forward vs. decompressed-Linear-forward check

-    Decompression:
-    Given a skeleton model and path to the optimized model,
-    write the optimized model's safetensors to the skeleton model and decompress
-    Ex. write weight_scale to the skeleton model and then convert from fp4 to fp16
+    Uncompressed: Optimized model saved as run_compressed=False; no need to decompress
+    Decompressed: Optimized model saved as run_compressed=True, then decompressed using
+    AutoModelForCausalLM decompression
+
+    AutoModelForCausalLM decompression diagram flow: https://tinyurl.com/2ynb6wbu

     """

     compressed_model_stub = None
-    skeleton_model_stub = None
-
-    SAMPLE_INPUTS = [
-        "I love 4-bit quantization because",
-        "What is the capital of France?",
-        "def fibonacci(n):",
-    ]
+    uncompressed_model_stub = None

     @classmethod
-    def setUpClass(self):
-        self.test_dir = tempfile.mkdtemp()
-        self.tokenizer = AutoTokenizer.from_pretrained(self.compressed_model_stub)
+    def setUpClass(cls):
+        cls.test_dir = tempfile.mkdtemp()

-        # Decompress using HFQuantizer from AutoModelForCausalLM
-        self.decompressed_model_hf_quantizer = AutoModelForCausalLM.from_pretrained(
-            self.compressed_model_stub,
+        quantization_config = CompressedTensorsConfig(run_compressed=False)
+
+        # Decompressed using HFQuantizer
+        # Linear forward
+        cls.decompressed_model = AutoModelForCausalLM.from_pretrained(
+            cls.compressed_model_stub,
             torch_dtype="auto",
             device_map="auto",
-            quantization_config=CompressedTensorsConfig(run_compressed=False),
+            quantization_config=quantization_config,
         )

-        # Manually decompress this model
-        self.dense_model = AutoModelForCausalLM.from_pretrained(
-            self.skeleton_model_stub,
-            torch_dtype=self.decompressed_model_hf_quantizer.dtype,
-            device_map=self.decompressed_model_hf_quantizer.device,
+        # Load the model as-is in the uncompressed state
+        # Linear forward
+        cls.uncompressed_model = AutoModelForCausalLM.from_pretrained(
+            cls.uncompressed_model_stub,
+            torch_dtype=cls.decompressed_model.dtype,
+            device_map=cls.decompressed_model.device,
         )

-        # decompression from HFQuantizer should populate weight_scale
-        assert hasattr(
-            self.decompressed_model_hf_quantizer.model.layers[0].self_attn.q_proj,
-            "weight_scale",
-        )
-
-        # dense model should not have weight_scale populated
-        assert not hasattr(
-            self.dense_model.model.layers[0].self_attn.q_proj, "weight_scale"
-        )
-
-        config = AutoConfig.from_pretrained(self.compressed_model_stub)
-
-        compression_config = getattr(config, QUANTIZATION_CONFIG_NAME, None)
-        self.compressor = ModelCompressor.from_compression_config(compression_config)
-        self.compressor.quantization_config.quantization_status = (
-            QuantizationStatus.FROZEN
-        )
-
-        # use the model_path to load the decompressed weights into dense_model
-        dense_model = copy.deepcopy(self.dense_model)
-
-        # overwrite the weights of the dense model
-        self.compressor.decompress(
-            model_path=self.compressed_model_stub,
-            model=self.dense_model,
-        )
-
-        # self.dense_model should be decompressed
-        assert dense_model is not self.dense_model
-
-        self.decompressed_model_manual = self.dense_model
-
-        assert hasattr(
-            self.decompressed_model_manual.model.layers[0].self_attn.q_proj,
-            "weight_scale",
-        )
-
-    def test_hf_quantizer_decompress_match_manual_decompress(self):
-        manual_device = self.decompressed_model_manual.device
-        decompressed_model_hf_quantizer = self.decompressed_model_hf_quantizer.device
-
-        self.decompressed_model_manual = self.decompressed_model_manual.to(
-            manual_device
-        )
-        self.decompressed_model_hf_quantizer = self.decompressed_model_hf_quantizer.to(
-            decompressed_model_hf_quantizer
-        )
-
-        for input in self.SAMPLE_INPUTS:
-            inputs = self.tokenizer(input, return_tensors="pt", padding=True).to(
-                self.decompressed_model_manual.device
-            )
-            inputs = inputs.to(self.decompressed_model_manual.device)
-
-            decompressed_model_manual_output = self.decompressed_model_manual.generate(
-                **inputs, max_length=50
-            )
-
-            decompressed_model_hf_quantizer_out = (
-                self.decompressed_model_hf_quantizer.generate(**inputs, max_length=50)
-            )
-
-            assert torch.equal(
-                decompressed_model_hf_quantizer_out, decompressed_model_manual_output
-            )
+        cls.tokenizer = AutoTokenizer.from_pretrained(cls.compressed_model_stub)
+
+    def test_compressed_matches_decompressed(self):
+        SAMPLE_INPUT = [
+            "I love 4-bit quantization because",
+            "What is the capital of France?",
+            "def fibonacci(n):",
+        ]
+
+        decompressed_device = self.decompressed_model.device
+        uncompressed_device = self.uncompressed_model.device
+
+        # move weights from cpu to cuda
+        self.decompressed_model = self.decompressed_model.to(decompressed_device)
+        self.uncompressed_model = self.uncompressed_model.to(uncompressed_device)
+
+        inputs = self.tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(
+            decompressed_device
+        )
+
+        decompressed_output = self.decompressed_model.generate(**inputs, max_length=50)
+
+        inputs = inputs.to(uncompressed_device)
+
+        uncompressed_output = self.uncompressed_model.generate(**inputs, max_length=50)
+
+        for idx in range(len(SAMPLE_INPUT)):
+            assert torch.equal(decompressed_output[idx], uncompressed_output[idx])

     @classmethod
-    def tearDownClass(self):
-        shutil.rmtree(self.test_dir)
-        del self.dense_model
-        del self.decompressed_model_hf_quantizer
-        del self.decompressed_model_manual
-        torch.cuda.empty_cache()
+    def tearDownClass(cls):
+        shutil.rmtree(cls.test_dir)
+        del cls.decompressed_model
+        del cls.uncompressed_model
+        torch.cuda.empty_cache()
+
+
+@requires_gpu
+@parameterized_class(parse_params(COMPRESSED_LINEAR_CONFIG_DIR))
+class Test_Compressed_CompressedLinear_Decompressed_Linear(unittest.TestCase):
+    """
+    Compressed-CompressedLinear vs. decompressed-Linear check
+
+    Compressed: Optimized model saved as run_compressed=True; no decompression
+    Decompressed: Optimized model saved as run_compressed=True, then decompressed using
+    AutoModelForCausalLM decompression
+
+    All compressed models should have CompressedLinear modules, which carry a custom
+    forward call
+
+    """
+
+    compressed_model_stub = None
+
+    @classmethod
+    def setUpClass(cls):
+        cls.test_dir = tempfile.mkdtemp()
+
+        # Should have CompressedLinear modules
+        # Compressed Linear forward
+        cls.compressed_model = AutoModelForCausalLM.from_pretrained(
+            cls.compressed_model_stub,
+            torch_dtype="auto",
+            device_map="auto",
+        )
+
+        # Should just be Linear modules
+        # Linear forward
+        quantization_config = CompressedTensorsConfig(run_compressed=False)
+        cls.decompressed_model = AutoModelForCausalLM.from_pretrained(
+            cls.compressed_model_stub,
+            torch_dtype=cls.compressed_model.dtype,
+            device_map=cls.compressed_model.device,
+            quantization_config=quantization_config,
+        )
+
+        cls.tokenizer = AutoTokenizer.from_pretrained(cls.compressed_model_stub)
+
+    def test_compressed_linear_modules_exist(self):
+        compressed_linear_counts = 0
+        for _, submodule in iter_named_leaf_modules(
+            self.compressed_model,
+        ):
+            if isinstance(submodule, CompressedLinear):
+                compressed_linear_counts += 1
+
+        # some Linear modules are not compressed, e.g. lm_head
+        assert compressed_linear_counts > 0
+
+    def test_compressed_matches_decompressed__hf_quantizer(self):
+        SAMPLE_INPUT = [
+            "I love 4-bit quantization because",
+            "What is the capital of France?",
+            "def fibonacci(n):",
+        ]
+
+        decompressed_device = self.decompressed_model.device
+        compressed_device = self.compressed_model.device
+
+        # move weights from cpu to cuda
+        self.decompressed_model = self.decompressed_model.to(decompressed_device)
+        self.compressed_model = self.compressed_model.to(compressed_device)
+
+        inputs = self.tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(
+            decompressed_device
+        )
+
+        decompressed_model_out = self.decompressed_model.generate(
+            **inputs, max_length=50
+        )
+
+        inputs = inputs.to(compressed_device)
+
+        compressed_model_out = self.compressed_model.generate(**inputs, max_length=50)
+
+        # Compare outputs for each input
+        for idx in range(len(SAMPLE_INPUT)):
+            assert torch.equal(compressed_model_out[idx], decompressed_model_out[idx])
+
+    @classmethod
+    def tearDownClass(cls):
+        shutil.rmtree(cls.test_dir)
+        del cls.decompressed_model
+        del cls.compressed_model
+        torch.cuda.empty_cache()
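Taken together, the new tests pin down one contract: a compressed-tensors checkpoint must generate identically whether its weights stay packed behind CompressedLinear's forward or are decompressed into plain Linear modules at load time via run_compressed=False. A minimal standalone sketch of that contract, reusing a model stub from the configs above (a CUDA device and installed transformers / compressed-tensors packages are assumed; this mirrors the tests rather than adding new API):

import torch
from compressed_tensors.linear.compressed_linear import CompressedLinear
from compressed_tensors.quantization.utils import iter_named_leaf_modules
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.utils.quantization_config import CompressedTensorsConfig

stub = "nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-compressed"

# Default load path: weights stay packed and CompressedLinear handles forward
compressed = AutoModelForCausalLM.from_pretrained(
    stub, torch_dtype="auto", device_map="auto"
)

# run_compressed=False: HFQuantizer decompresses into plain Linear modules
decompressed = AutoModelForCausalLM.from_pretrained(
    stub,
    torch_dtype="auto",
    device_map="auto",
    quantization_config=CompressedTensorsConfig(run_compressed=False),
)

# The packed model should expose CompressedLinear leaves (lm_head stays Linear)
assert any(
    isinstance(module, CompressedLinear)
    for _, module in iter_named_leaf_modules(compressed)
)

tokenizer = AutoTokenizer.from_pretrained(stub)
inputs = tokenizer("def fibonacci(n):", return_tensors="pt").to(compressed.device)

compressed_out = compressed.generate(**inputs, max_length=50)
decompressed_out = decompressed.generate(
    **inputs.to(decompressed.device), max_length=50
)

# The equality the tests assert: both forward paths generate the same tokens
assert torch.equal(compressed_out.cpu(), decompressed_out.cpu())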