|
| 1 | +import torch |
| 2 | +from compressed_tensors.quantization import ( |
| 3 | + QuantizationArgs, |
| 4 | + QuantizationScheme, |
| 5 | + QuantizationStrategy, |
| 6 | +) |
1 | 7 | from compressed_tensors.transforms import Hadamard, RandomHadamard, Transforms |
2 | 8 | from compressed_tensors.transforms.transform_args import ( |
3 | 9 | ModuleTarget, |
4 | 10 | TransformationArgs, |
5 | 11 | ) |
6 | 12 | from compressed_tensors.transforms.transform_config import TransformationConfig |
7 | | -from compressed_tensors.transforms.transform_data import TransformData |
8 | 13 | from compressed_tensors.transforms.transform_scheme import TransformationScheme |
9 | 14 | from transformers import AutoModelForCausalLM, AutoTokenizer |
10 | | -import torch |
11 | 15 |
|
12 | | -ignore = ["re:*.mlp.down_proj$"] |
13 | | -module_targets = [ModuleTarget.WEIGHTS] |
| 16 | +from llmcompressor import oneshot |
| 17 | +from llmcompressor.modifiers.quantization import QuantizationModifier |
| 18 | + |
| 19 | +# QuIP-style rotation: transform the weight as U @ W @ V.T |
| 20 | + |
| 21 | +ignore = ["re:.*.mlp.down_proj$"] |
| 22 | +module_targets = [ModuleTarget.WEIGHT.value] |
14 | 23 |
|
15 | | -# Start with a processed |
16 | | -targets = ["Linear"] # 2048 * 2048 |
| 24 | +# Start with a V transform applied to every Linear layer |
| 25 | +targets = ["Linear"] # 2048 * 2048 |
17 | 26 | v_linear_args = TransformationArgs( |
18 | | - targets=targets, module_targets=module_targets, ignore=ignore, call_args={"transpose": True, "first": False} |
| 27 | + targets=targets, |
| 28 | + module_targets=module_targets, |
| 29 | + ignore=ignore, |
| 30 | + call_args={"transpose": True, "first": False}, |
19 | 31 | ) |
20 | 32 |
|
21 | | -targets = ["re:*.mlp.down_proj$"] # 5632 * 5632 |
| 33 | +targets = ["re:.*.mlp.down_proj$"] # 8192 * 8192 |
22 | 34 | v_down_proj = TransformationArgs( |
23 | | - targets=targets, module_targets=module_targets, call_args={"transpose": True, "first": False} |
| 35 | + targets=targets, |
| 36 | + module_targets=module_targets, |
| 37 | + call_args={"transpose": True, "first": False}, |
24 | 38 | ) |
25 | 39 |
|
26 | | -targets = ["re:*.attn.q_proj$", "re:*.attn.o_proj$", "re:*.mlp.down_proj$"] # 2048 * 2048 |
| 40 | +targets = [ |
| 41 | + "re:.*.attn.q_proj$", |
| 42 | + "re:.*.attn.o_proj$", |
| 43 | + "re:.*.mlp.down_proj$", |
| 44 | +] # 2048 * 2048 |
27 | 45 | u_q_o_down_proj = TransformationArgs( |
28 | | - targets=targets, module_targets=module_targets, |
| 46 | + targets=targets, |
| 47 | + module_targets=module_targets, |
29 | 48 | ) |
30 | 49 |
|
31 | | -targets = ["re:*.attn.gate_proj$", "re:*.mlp.up_proj$"] # 5632 * 5632 |
| 50 | +targets = ["re:.*.mlp.gate_proj$", "re:.*.mlp.up_proj$"] # 8192 * 8192 |
32 | 51 | u_gate_up_proj = TransformationArgs( |
33 | | - targets=targets, module_targets=module_targets, |
| 52 | + targets=targets, |
| 53 | + module_targets=module_targets, |
34 | 54 | ) |
35 | 55 |
|
36 | | -targets = ["re:*.attn.k_proj$", "re:*.attn.v_proj$"] # 256 * 256 |
| 56 | +targets = ["re:.*.attn.k_proj$", "re:.*.attn.v_proj$"] # 512 * 512 |
37 | 57 | u_k_v_proj = TransformationArgs( |
38 | | - targets=targets, module_targets=module_targets, |
| 58 | + targets=targets, |
| 59 | + module_targets=module_targets, |
39 | 60 | ) |
40 | 61 |
|
41 | 62 |
|
|
51 | 72 | v_scheme_down_proj = TransformationScheme( |
52 | 73 | transform_type="random-hadamard", |
53 | 74 | groups=[v_down_proj], |
54 | | - transform_creation_args={"size": 5632}, |
| 75 | + transform_creation_args={"size": 8192}, |
55 | 76 | ) |
56 | 77 |
|
57 | 78 | # We could combine multiple args into a single scheme, but that would make it harder to keep the order of transforms consolidated |
|
64 | 85 | u_scheme_gate_up_proj = TransformationScheme( |
65 | 86 | transform_type="random-hadamard", |
66 | 87 | groups=[u_gate_up_proj], |
67 | | - transform_creation_args={"size": 5632}, |
| 88 | + transform_creation_args={"size": 8192}, |
68 | 89 | ) |
69 | 90 |
|
70 | 91 | u_scheme_k_v_proj = TransformationScheme( |
71 | 92 | transform_type="random-hadamard", |
72 | 93 | groups=[u_k_v_proj], |
73 | | - transform_creation_args={"size": 256}, |
| 94 | + transform_creation_args={"size": 512}, |
74 | 95 | ) |
75 | 96 |
|
76 | 97 | # QuIP recipe with weight-only quantization |
77 | 98 | config = TransformationConfig( |
78 | 99 | transform_groups={ |
79 | 100 | "u_transform_q_o_down_proj": u_scheme_q_o_down_proj, |
80 | | - "u_transform_gate_up_proj": u_scheme_gate_up_proj, |
81 | 101 | "u_transform_k_v_proj": u_scheme_k_v_proj, |
| 102 | + "u_transform_gate_up_proj": u_scheme_gate_up_proj, |
82 | 103 | "v_transform_linear": v_scheme, |
83 | | - "v_transform_down_proj": v_scheme_down_proj |
| 104 | + "v_transform_down_proj": v_scheme_down_proj, |
84 | 105 | } |
85 | 106 | ) |
86 | 107 |
|
87 | | -#MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" |
88 | | -MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" |
| 108 | +recipe = QuantizationModifier( |
| 109 | + targets="Linear", |
| 110 | + ignore=["lm_head"], |
| 111 | + config_groups={ |
| 112 | + "group_0": QuantizationScheme( |
| 113 | + targets=["Linear"], |
| 114 | + weights=QuantizationArgs( |
| 115 | + num_bits=4, |
| 116 | + symmetric=True, |
| 117 | + strategy=QuantizationStrategy.GROUP, |
| 118 | + group_size=128, |
| 119 | + ), |
| 120 | + ) |
| 121 | + }, |
| 122 | + transforms_config=config, |
| 123 | +) |
| 124 | + |
| 125 | +MODEL_ID = "meta-llama/Llama-3.2-1B" |
89 | 126 |
|
90 | 127 | model = AutoModelForCausalLM.from_pretrained( |
91 | | - MODEL_ID, |
92 | | - device_map="auto", |
93 | | - torch_dtype="auto", |
| 128 | + MODEL_ID, device_map="auto", torch_dtype="auto" |
94 | 129 | ) |
95 | 130 | tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) |
| 131 | + |
| 132 | +oneshot(model=model, recipe=recipe) |
| 133 | + |
| 134 | +print("\n\n") |
| 135 | +print("========== SAMPLE GENERATION ==============") |
| 136 | +input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") |
| 137 | +output = model.generate(input_ids, max_new_tokens=100) |
| 138 | +print(tokenizer.decode(output[0])) |
| 139 | +print("==========================================\n\n") |
| 140 | + |
| 141 | +# Save the compressed model to disk. |
| 142 | +SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-Transforms" |
| 143 | +model.save_pretrained(SAVE_DIR) |
| 144 | +tokenizer.save_pretrained(SAVE_DIR) |
| 145 | + |
| 146 | +""" |
96 | 147 | x = model.model.layers[0] |
97 | 148 | attn = x.self_attn |
98 | 149 | mlp = x.mlp |
|
104 | 155 | attn.o_proj, |
105 | 156 | mlp.gate_proj, |
106 | 157 | mlp.down_proj, |
107 | | - mlp.up_proj |
| 158 | + mlp.up_proj, |
108 | 159 | ] |
109 | 160 |
|
110 | | -for layer in layers: |
| 161 | +from compressed_tensors.transforms.hadamard_utils import ( |
| 162 | + deterministic_hadamard_matrix, |
| 163 | + random_hadamard_matrix, |
| 164 | +) |
111 | 165 |
|
| 166 | +for layer in layers: |
112 | 167 | current_weight = layer.weight |
| 168 | + original_weight = current_weight.data.clone() |
113 | 169 | (n, m) = current_weight.shape |
114 | | - U = torch.eye(n).to("cuda").to(torch.bfloat16) |
115 | | - V = torch.eye(m).to("cuda").to(torch.bfloat16) |
116 | | - print(n, layer) |
| 170 | +
|
| 171 | + U = torch.Tensor(random_hadamard_matrix(n)).to("cuda").to(torch.float32) |
| 172 | + V = torch.Tensor(random_hadamard_matrix(m)).to("cuda").to(torch.float32) |
117 | 173 |
|
118 | 174 | output = torch.matmul(U, current_weight) |
119 | 175 | output = torch.matmul(output, V.T) |
| 176 | +
|
| 177 | + # apply untransform |
| 178 | + x = torch.matmul(U.T, torch.matmul(output, V)) |
| 179 | + print(torch.max(abs(x - original_weight))) |
| 180 | +""" |
0 commit comments