Commit f1cc987

cleanup

Signed-off-by: Kyle Sayers <[email protected]>
1 parent: 4cab29e

File tree: 3 files changed (+47, -58 lines)

examples/transform/llama3_example.py

Lines changed: 13 additions & 19 deletions
@@ -2,13 +2,11 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from llmcompressor import oneshot
-from llmcompressor.modifiers.quantization import GPTQModifier, QuantizationModifier
+from llmcompressor.modifiers.quantization import QuantizationModifier
 from llmcompressor.modifiers.transform import SpinQuantModifier
 from llmcompressor.utils import dispatch_for_generation
 
 # Select model and load it.
-# MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct"
-# MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct"  # TODO hidden size 3072 causes failure when creating hadamard
 MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
 
 model = AutoModelForCausalLM.from_pretrained(
@@ -57,36 +55,32 @@ def tokenize(sample):
 ds = ds.map(tokenize, remove_columns=ds.column_names)
 
 # Configure the quantization algorithm to run.
+# * apply spinquant transforms to model in order to make quantization easier
 # * quantize the weights to 4 bit with GPTQ with a group size 128
 recipe = [
-    # TODO preset_config="QUIP_ONLINE" outputs gibberish
-    # preset_config="QUIP" output sensible, but cannot load saved
-    # checkpoint or run evals (~4hrs to run)
-    SpinQuantModifier(rotations=["R1", "R2"]),
-    # QuantizationModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
+    SpinQuantModifier(rotations=["R1", "R2"], transform_type="random-hadamard"),
+    QuantizationModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
 ]
 
 # Apply algorithms.
 oneshot(
     model=model,
     recipe=recipe,
-    # dataset=ds,
-    pipeline="datafree",
-    # max_seq_length=MAX_SEQUENCE_LENGTH,
-    # num_calibration_samples=NUM_CALIBRATION_SAMPLES,
-    log_dir=None,
+    dataset=ds,
+    max_seq_length=MAX_SEQUENCE_LENGTH,
+    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
 )
 
-# # Confirm generations of the quantized model look sane.
+# Confirm generations of the quantized model look sane.
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
 input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
 output = model.generate(input_ids, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
-# print("==========================================\n\n")
+print("==========================================\n\n")
 
-# # Save to disk compressed.
-# SAVE_DIR = MODEL_ID.split("/")[1] + "-transform-quant-w4a16"
-# model.save_pretrained(SAVE_DIR, save_compressed=True)
-# tokenizer.save_pretrained(SAVE_DIR)
+# Save to disk compressed.
+SAVE_DIR = MODEL_ID.split("/")[1] + "-transformed-w4a16"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)

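Aside (not part of this diff): the recipe above works because an orthogonal rotation such as a random Hadamard matrix can be folded into adjacent weights without changing a layer's function, which is what lets the SpinQuant transforms reshape weight and activation distributions before W4A16 quantization. A minimal numeric sketch of that invariance in plain PyTorch; the shapes and names below are illustrative, not taken from the example:

import torch

torch.manual_seed(0)
hidden = 64
W = torch.randn(128, hidden)  # stand-in Linear weight (out_features x in_features)
x = torch.randn(3, hidden)    # stand-in activations

# Any orthogonal matrix works; a random Hadamard matrix is one cheap, structured choice.
Q, _ = torch.linalg.qr(torch.randn(hidden, hidden))

y_ref = x @ W.T              # original layer output
y_rot = (x @ Q) @ (W @ Q).T  # rotate the activations, counter-rotate the weight

print(torch.allclose(y_ref, y_rot, atol=1e-4))  # True: the rotation cancels out
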
src/llmcompressor/entrypoints/oneshot.py

Lines changed: 1 addition & 2 deletions
@@ -125,8 +125,7 @@ def __init__(
         self.output_dir = output_dir
 
         # initialize the model and processor
-        # TODO Remove Comment before merge, this is just needed for DummyModel
-        # pre_process(model_args)
+        pre_process(model_args)
 
         # Set instance attributes
         self.model = self.model_args.model

examples/transform/spinquant_dummy.py renamed to tests/llmcompressor/modifiers/transform/test_dummy_model.py

Lines changed: 33 additions & 37 deletions
@@ -14,9 +14,6 @@
 num_embeddings = 12
 
 
-# TODO remove file before merging
-
-
 class DummySelfAttn(torch.nn.Module):
     def __init__(self, hidden_dim, intermediate_dim):
         super().__init__()
@@ -75,37 +72,36 @@ def forward(self, input_ids):
         return self.lm_head(x)
 
 
-model = DummyModel(num_embeddings, hidden_dim, intermediate_dim, up_dim)
-
-# TODO Uncomment this to see norm diff > 1e-6
-# This is due to issue Kyle spotted in https://arxiv.org/pdf/2405.16406 Page 5 Footnote 2
-# Will have to fuse layernorms with subsequent layers so that input_layernorm.weight is equal to torch.ones() (this apparently makes it rotation invariant)
-# https://github.com/facebookresearch/SpinQuant/blob/8f47aa3f00e8662caf1a484153920a07e5281c3a/utils/fuse_norm_utils.py#L39
-# update_parameter_data(
-#     model.input_layernorm,
-#     torch.rand(model.input_layernorm.weight.shape),
-#     "weight",
-# )
-
-input_ids = torch.IntTensor([1, 2, 3, 4, 5])
-orig_output = model(input_ids)
-
-recipe = [
-    # NOTE: preset_config="QUIP" output sensible, but cannot load saved
-    # checkpoint or run evals (~4hrs to run)
-    SpinQuantModifier(rotations=["R1", "R2"]),
-    # QuantizationModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
-]
-
-oneshot(
-    model=model,
-    recipe=recipe,
-    pipeline="datafree",
-    log_dir=None,
-)
-
-# # Confirm generations of the quantized model look the same
-transformed_output = model(input_ids)
-
-print(f"Norm Diff {(orig_output-transformed_output).norm()}")
-print(f"Norm {orig_output.norm()}, {transformed_output.norm()}")
+def test_dummy_model():
+    model = DummyModel(num_embeddings, hidden_dim, intermediate_dim, up_dim)
+
+    # TODO Uncomment this to see norm diff > 1e-6
+    # This is due to issue Kyle spotted in https://arxiv.org/pdf/2405.16406 Page 5 Footnote 2
+    # Will have to fuse layernorms with subsequent layers so that input_layernorm.weight is equal to torch.ones() (this apparently makes it rotation invariant)
+    # https://github.com/facebookresearch/SpinQuant/blob/8f47aa3f00e8662caf1a484153920a07e5281c3a/utils/fuse_norm_utils.py#L39
+    # update_parameter_data(
+    #     model.input_layernorm,
+    #     torch.rand(model.input_layernorm.weight.shape),
+    #     "weight",
+    # )
+
+    input_ids = torch.IntTensor([1, 2, 3, 4, 5])
+    orig_output = model(input_ids)
+
+    recipe = [
+        SpinQuantModifier(rotations=["R1", "R2"]),
+    ]
+
+    # TODO: work around preprocessing?
+    oneshot(
+        model=model,
+        recipe=recipe,
+        pipeline="datafree",
+        log_dir=None,
+    )
+
+    # # Confirm generations of the quantized model look the same
+    transformed_output = model(input_ids)
+
+    print(f"Norm Diff {(orig_output-transformed_output).norm()}")
+    print(f"Norm {orig_output.norm()}, {transformed_output.norm()}")

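Aside (not part of this diff): the TODO in the test above points at the layernorm fusion step from the SpinQuant reference implementation (fuse_norm_utils.py). An RMSNorm whose weight is all ones commutes with an orthogonal rotation of the residual stream, so before applying R1 the norm scale has to be folded into the layers that read the norm's output. A rough sketch of that fusion, assuming an RMSNorm-style module whose weight scales its output elementwise; the helper name and the choice of downstream modules are illustrative, not taken from this repo:

import torch


def fuse_norm_into_linears(norm: torch.nn.Module, linears: list[torch.nn.Linear]) -> None:
    """Fold norm.weight into every Linear that consumes the norm output,
    then reset norm.weight to ones so the norm commutes with rotations."""
    gamma = norm.weight.data.clone()
    for linear in linears:
        # y = W @ (gamma * x) == (W * gamma) @ x, so scale W's input columns by gamma
        linear.weight.data.mul_(gamma.to(linear.weight.dtype))
    norm.weight.data.fill_(1.0)


# For DummyModel this would mean folding model.input_layernorm.weight into the
# projections that consume its output before the R1 rotation is applied.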