import copy
import os
import shutil
import tempfile
import unittest

import torch
from compressed_tensors import QUANTIZATION_CONFIG_NAME
from compressed_tensors.compressors import ModelCompressor
from compressed_tensors.quantization import QuantizationStatus
from parameterized import parameterized_class
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from transformers.utils.quantization_config import CompressedTensorsConfig

from tests.testing_utils import parse_params, requires_gpu

CONFIG_DIR = "tests/llmcompressor/transformers/compression/decompression_configs"
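
# parameterized_class generates one TestDecompression variant per config parsed
# from CONFIG_DIR; each config is expected to supply the compressed_model_stub
# and skeleton_model_stub class attributes consumed below.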
@requires_gpu
@parameterized_class(parse_params(CONFIG_DIR))
class TestDecompression(unittest.TestCase):
    """
    Check that HFQuantizer decompression works as expected.
    Manually decompress a compressed model and compare the generations.

    Decompression:
    Given a skeleton model and a path to the optimized model,
    write the optimized model's safetensors to the skeleton model and decompress.
    Ex. write weight_scale to the skeleton model and then convert from fp4 to fp16.
    """

    compressed_model_stub = None
    skeleton_model_stub = None
    SAMPLE_INPUTS = [
        "I love 4-bit quantization because",
        "What is the capital of France?",
        "def fibonacci(n):",
    ]
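
    # Conceptually, decompression inverts the quantization transform. A minimal
    # sketch, assuming a simple symmetric scheme (the real scheme comes from the
    # model's compression config, not from this formula):
    #   weight_fp16 = weight_quantized.to(torch.float16) * weight_scale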

    @classmethod
    def setUpClass(cls):
        cls.test_dir = tempfile.mkdtemp()
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.compressed_model_stub)

        # Decompress using HFQuantizer from AutoModelForCausalLM
        cls.decompressed_model_hf_quantizer = AutoModelForCausalLM.from_pretrained(
            cls.compressed_model_stub,
            torch_dtype="auto",
            device_map="auto",
            quantization_config=CompressedTensorsConfig(run_compressed=False),
        )

        # Load the skeleton model; it will be manually decompressed below
        cls.dense_model = AutoModelForCausalLM.from_pretrained(
            cls.skeleton_model_stub,
            torch_dtype=cls.decompressed_model_hf_quantizer.dtype,
            device_map=cls.decompressed_model_hf_quantizer.device,
        )

        # Decompression from HFQuantizer should populate weight_scale
        assert hasattr(
            cls.decompressed_model_hf_quantizer.model.layers[0].self_attn.q_proj,
            "weight_scale",
        )

        # The dense (skeleton) model should not have weight_scale populated
        assert not hasattr(
            cls.dense_model.model.layers[0].self_attn.q_proj, "weight_scale"
        )

        config = AutoConfig.from_pretrained(cls.compressed_model_stub)
        compression_config = getattr(config, QUANTIZATION_CONFIG_NAME, None)
        cls.compressor = ModelCompressor.from_compression_config(compression_config)
        cls.compressor.quantization_config.quantization_status = (
            QuantizationStatus.FROZEN
        )

        # Keep a deep copy of the pre-decompression model for comparison
        dense_model = copy.deepcopy(cls.dense_model)

        # Use model_path to load the decompressed weights into dense_model,
        # overwriting the skeleton weights in place
        cls.compressor.decompress(
            model_path=cls.compressed_model_stub,
            model=cls.dense_model,
        )

        # cls.dense_model should now be decompressed
        assert dense_model is not cls.dense_model
        cls.decompressed_model_manual = cls.dense_model
        assert hasattr(
            cls.decompressed_model_manual.model.layers[0].self_attn.q_proj,
            "weight_scale",
        )
    def test_hf_quantizer_decompress_match_manual_decompress(self):
        manual_device = self.decompressed_model_manual.device
        hf_quantizer_device = self.decompressed_model_hf_quantizer.device
        self.decompressed_model_manual = self.decompressed_model_manual.to(
            manual_device
        )
        self.decompressed_model_hf_quantizer = self.decompressed_model_hf_quantizer.to(
            hf_quantizer_device
        )

        # Both decompression paths should produce token-for-token identical
        # generations
        for sample in self.SAMPLE_INPUTS:
            inputs = self.tokenizer(sample, return_tensors="pt", padding=True).to(
                self.decompressed_model_manual.device
            )
            decompressed_model_manual_output = self.decompressed_model_manual.generate(
                **inputs, max_length=50
            )
            decompressed_model_hf_quantizer_out = (
                self.decompressed_model_hf_quantizer.generate(**inputs, max_length=50)
            )
            assert torch.equal(
                decompressed_model_hf_quantizer_out, decompressed_model_manual_output
            )
    @classmethod
    def tearDownClass(cls):
        if os.path.isdir(cls.test_dir):
            shutil.rmtree(cls.test_dir)
        del cls.dense_model
        del cls.decompressed_model_hf_quantizer
        del cls.decompressed_model_manual
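
# Convenience entry point (standard unittest idiom) so the file can be run
# directly; it assumes a GPU and configs under CONFIG_DIR are available. The
# suite is presumably run via pytest otherwise.
if __name__ == "__main__":
    unittest.main()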