diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml
index 51d9ec25b2..4fd75281c1 100644
--- a/tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml
+++ b/tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml
@@ -1,4 +1,4 @@
-cadence: "commit"
+cadence: "nightly"
 test_type: "regression"
 compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-compressed
 uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-uncompressed
\ No newline at end of file
diff --git a/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml b/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml
index 3c1646b16f..dd2134011f 100644
--- a/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml
+++ b/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml
@@ -1,4 +1,4 @@
-cadence: "commit"
+cadence: "nightly"
 test_type: "regression"
 compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A8-Dynamic-Per-Token-compressed
 uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A8-Dynamic-Per-Token-uncompressed
\ No newline at end of file
diff --git a/tests/llmcompressor/transformers/compression/test_run_compressed.py b/tests/llmcompressor/transformers/compression/test_run_compressed.py
index b18cba80e5..751fd37acc 100644
--- a/tests/llmcompressor/transformers/compression/test_run_compressed.py
+++ b/tests/llmcompressor/transformers/compression/test_run_compressed.py
@@ -1,134 +1,182 @@
-import copy
 import shutil
 import tempfile
 import unittest
 
 import torch
-from compressed_tensors import QUANTIZATION_CONFIG_NAME
-from compressed_tensors.compressors import ModelCompressor
-from compressed_tensors.quantization import QuantizationStatus
+from compressed_tensors.linear.compressed_linear import CompressedLinear
+from compressed_tensors.quantization.utils import iter_named_leaf_modules
 from parameterized import parameterized_class
-from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer
 from transformers.utils.quantization_config import CompressedTensorsConfig
 
 from tests.testing_utils import parse_params, requires_gpu
 
-CONFIG_DIR = "tests/llmcompressor/transformers/compression/decompression_configs"
+COMPRESSED_LINEAR_CONFIG_DIR = (
+    "tests/llmcompressor/transformers/compression/run_compressed_configs"
+)
 
 
 @requires_gpu
-@parameterized_class(parse_params(CONFIG_DIR))
-class TestDecompression(unittest.TestCase):
+@parameterized_class(parse_params(COMPRESSED_LINEAR_CONFIG_DIR))
+class Test_Decompressed_Linear_Uncompressed_Linear(unittest.TestCase):
     """
-    Check that HFQuantizer decompression is working as expected.
-    Manually decompress a compressed model and compare the generations
+    Uncompressed-Linear-forward vs. decompressed-Linear-forward check
 
-    Decompression:
-    Given a skeleton model and path to the optimized model,
-    write the optimized model's safetensors to the skeleton model and decompress
-    Ex. write weight_scale to the skeleton model and then convert from fp4 to fp16
+    Uncompressed: Optimized model saved as run_compressed=False, no need to decompress
+    Decompressed: Optimized model saved as run_compressed=True, and decompressed using
+    AutoModelForCausalLM decompression
+
+    AutoModelForCausalLM decompression diagram flow https://tinyurl.com/2ynb6wbu
 
     """
 
     compressed_model_stub = None
-    skeleton_model_stub = None
-
-    SAMPLE_INPUTS = [
-        "I love 4-bit quantization because",
-        "What is the capital of France?",
-        "def fibonacci(n):",
-    ]
+    uncompressed_model_stub = None
 
     @classmethod
-    def setUpClass(self):
-        self.test_dir = tempfile.mkdtemp()
-        self.tokenizer = AutoTokenizer.from_pretrained(self.compressed_model_stub)
+    def setUpClass(cls):
+        cls.test_dir = tempfile.mkdtemp()
 
-        # Decompress using HFQuantizer from AutoModelForCausalLM
-        self.decompressed_model_hf_quantizer = AutoModelForCausalLM.from_pretrained(
-            self.compressed_model_stub,
+        quantization_config = CompressedTensorsConfig(run_compressed=False)
+
+        # Decompressed using HFQuantizer
+        # Linear forward
+        cls.decompressed_model = AutoModelForCausalLM.from_pretrained(
+            cls.compressed_model_stub,
             torch_dtype="auto",
             device_map="auto",
-            quantization_config=CompressedTensorsConfig(run_compressed=False),
+            quantization_config=quantization_config,
         )
 
-        # Manually decompress this model
-        self.dense_model = AutoModelForCausalLM.from_pretrained(
-            self.skeleton_model_stub,
-            torch_dtype=self.decompressed_model_hf_quantizer.dtype,
-            device_map=self.decompressed_model_hf_quantizer.device,
+        # Load the model as-is in the uncompressed state
+        # Linear forward
+        cls.uncompressed_model = AutoModelForCausalLM.from_pretrained(
+            cls.uncompressed_model_stub,
+            torch_dtype=cls.decompressed_model.dtype,
+            device_map=cls.decompressed_model.device,
         )
 
-        # decompression from HFQuantizer should populate weight_scale
-        assert hasattr(
-            self.decompressed_model_hf_quantizer.model.layers[0].self_attn.q_proj,
-            "weight_scale",
-        )
+        cls.tokenizer = AutoTokenizer.from_pretrained(cls.compressed_model_stub)
 
-        # dense model should not have weight_scale populated
-        assert not hasattr(
-            self.dense_model.model.layers[0].self_attn.q_proj, "weight_scale"
-        )
+    def test_compressed_matches_decompressed(self):
+        SAMPLE_INPUT = [
+            "I love 4-bit quantization because",
+            "What is the capital of France?",
+            "def fibonacci(n):",
+        ]
+
+        decompressed_device = self.decompressed_model.device
+        uncompressed_device = self.uncompressed_model.device
 
-        config = AutoConfig.from_pretrained(self.compressed_model_stub)
+        # make sure each model's weights are on its assigned device
+        self.decompressed_model = self.decompressed_model.to(decompressed_device)
+        self.uncompressed_model = self.uncompressed_model.to(uncompressed_device)
 
-        compression_config = getattr(config, QUANTIZATION_CONFIG_NAME, None)
-        self.compressor = ModelCompressor.from_compression_config(compression_config)
-        self.compressor.quantization_config.quantization_status = (
-            QuantizationStatus.FROZEN
+        inputs = self.tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(
+            decompressed_device
         )
 
-        # use the model_path to load the decompressed weights into dense_model
-        dense_model = copy.deepcopy(self.dense_model)
+        decompressed_output = self.decompressed_model.generate(**inputs, max_length=50)
 
-        # overwrite the weights of the dense model
-        self.compressor.decompress(
-            model_path=self.compressed_model_stub,
-            model=self.dense_model,
-        )
+        inputs = inputs.to(uncompressed_device)
 
-        # self.dense_model should be decompressed
-        assert dense_model is not self.dense_model
+        uncompressed_output = self.uncompressed_model.generate(**inputs, max_length=50)
 
-        self.decompressed_model_manual = self.dense_model
+        for idx in range(len(SAMPLE_INPUT)):
+            assert torch.equal(decompressed_output[idx], uncompressed_output[idx])
 
-        assert hasattr(
-            self.decompressed_model_manual.model.layers[0].self_attn.q_proj,
-            "weight_scale",
-        )
+    @classmethod
+    def tearDownClass(cls):
+        shutil.rmtree(cls.test_dir)
+        del cls.decompressed_model
+        del cls.uncompressed_model
+        torch.cuda.empty_cache()
+
+
+@requires_gpu
+@parameterized_class(parse_params(COMPRESSED_LINEAR_CONFIG_DIR))
+class Test_Compressed_CompressedLinear_Decompressed_Linear(unittest.TestCase):
+    """
+    Compressed-CompressedLinear vs. decompressed-Linear check
+
+    Compressed: Optimized model saved as run_compressed=True, no decompression
+    Decompressed: Optimized model saved as run_compressed=True, and decompressed using
+    AutoModelForCausalLM decompression
+
+    All compressed models should have CompressedLinear modules, which use a custom forward call
+
+    """
+
+    compressed_model_stub = None
+
+    @classmethod
+    def setUpClass(cls):
+        cls.test_dir = tempfile.mkdtemp()
 
-    def test_hf_quantizer_decompress_match_manual_decompress(self):
-        manual_device = self.decompressed_model_manual.device
-        decompressed_model_hf_quantizer = self.decompressed_model_hf_quantizer.device
+        # Should have CompressedLinear modules
+        # Compressed Linear forward
+        cls.compressed_model = AutoModelForCausalLM.from_pretrained(
+            cls.compressed_model_stub,
+            torch_dtype="auto",
+            device_map="auto",
+        )
 
-        self.decompressed_model_manual = self.decompressed_model_manual.to(
-            manual_device
+        # Should have only plain Linear modules
+        # Linear forward
+        quantization_config = CompressedTensorsConfig(run_compressed=False)
+        cls.decompressed_model = AutoModelForCausalLM.from_pretrained(
+            cls.compressed_model_stub,
+            torch_dtype=cls.compressed_model.dtype,
+            device_map=cls.compressed_model.device,
+            quantization_config=quantization_config,
        )
-        self.decompressed_model_hf_quantizer = self.decompressed_model_hf_quantizer.to(
-            decompressed_model_hf_quantizer
+
+        cls.tokenizer = AutoTokenizer.from_pretrained(cls.compressed_model_stub)
+
+    def test_compressed_linear_modules_exist(self):
+        compressed_linear_counts = 0
+        for _, submodule in iter_named_leaf_modules(
+            self.compressed_model,
+        ):
+            if isinstance(submodule, CompressedLinear):
+                compressed_linear_counts += 1
+
+        # some Linear modules are not compressed, e.g. lm_head
+        assert compressed_linear_counts > 0
+
+    def test_compressed_matches_decompressed_hf_quantizer(self):
+        SAMPLE_INPUT = [
+            "I love 4-bit quantization because",
+            "What is the capital of France?",
+            "def fibonacci(n):",
+        ]
+
+        decompressed_device = self.decompressed_model.device
+        compressed_device = self.compressed_model.device
+
+        # make sure each model's weights are on its assigned device
+        self.decompressed_model = self.decompressed_model.to(decompressed_device)
+        self.compressed_model = self.compressed_model.to(compressed_device)
+
+        inputs = self.tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(
+            decompressed_device
        )
 
-        for input in self.SAMPLE_INPUTS:
-            inputs = self.tokenizer(input, return_tensors="pt", padding=True).to(
-                self.decompressed_model_manual.device
-            )
-            inputs = inputs.to(self.decompressed_model_manual.device)
+        decompressed_model_out = self.decompressed_model.generate(
+            **inputs, max_length=50
+        )
 
-            decompressed_model_manual_output = self.decompressed_model_manual.generate(
-                **inputs, max_length=50
-            )
+        inputs = inputs.to(compressed_device)
 
-            decompressed_model_hf_quantizer_out = (
-                self.decompressed_model_hf_quantizer.generate(**inputs, max_length=50)
-            )
+        compressed_model_out = self.compressed_model.generate(**inputs, max_length=50)
 
-            assert torch.equal(
-                decompressed_model_hf_quantizer_out, decompressed_model_manual_output
-            )
+        # Compare outputs for each input
+        for idx in range(len(SAMPLE_INPUT)):
+            assert torch.equal(compressed_model_out[idx], decompressed_model_out[idx])
 
     @classmethod
-    def tearDownClass(self):
-        shutil.rmtree(self.test_dir)
-        del self.dense_model
-        del self.decompressed_model_hf_quantizer
-        del self.decompressed_model_manual
+    def tearDownClass(cls):
+        shutil.rmtree(cls.test_dir)
+        del cls.decompressed_model
+        del cls.compressed_model
+        torch.cuda.empty_cache()
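Reviewer note: a minimal standalone sketch of the two load paths these tests exercise, reusing the W4A16 model stub from the configs above. The assumption that omitting quantization_config keeps the checkpoint compressed (CompressedLinear forward) is taken from the tests themselves, not verified independently here.

    from compressed_tensors.linear.compressed_linear import CompressedLinear
    from compressed_tensors.quantization.utils import iter_named_leaf_modules
    from transformers import AutoModelForCausalLM
    from transformers.utils.quantization_config import CompressedTensorsConfig

    STUB = "nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-compressed"

    # Path 1: load the checkpoint as saved (assumed run_compressed=True behavior);
    # quantized Linear layers are replaced by CompressedLinear, which
    # decompresses weights inside its own forward call.
    compressed_model = AutoModelForCausalLM.from_pretrained(
        STUB, torch_dtype="auto", device_map="auto"
    )

    # Path 2: have HFQuantizer decompress once at load time
    # (run_compressed=False); layers are then plain torch.nn.Linear.
    decompressed_model = AutoModelForCausalLM.from_pretrained(
        STUB,
        torch_dtype="auto",
        device_map="auto",
        quantization_config=CompressedTensorsConfig(run_compressed=False),
    )

    # Count CompressedLinear leaf modules; some Linear modules (e.g. lm_head)
    # stay uncompressed, so the count is positive but below the layer total.
    n = sum(
        isinstance(m, CompressedLinear)
        for _, m in iter_named_leaf_modules(compressed_model)
    )
    print(f"CompressedLinear modules found: {n}")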