
Commit 6d942cc

Merge remote-tracking branch 'origin' into kylesayrs/sequential-onloading
2 parents 819df1c + 1648528

9 files changed: +206 additions, -12 deletions

CITATION.cff

Lines changed: 8 additions & 0 deletions

@@ -0,0 +1,8 @@
+cff-version: 1.2.0
+message: "If you use this software, please cite it as below."
+authors:
+- name: Red Hat AI
+- name: vLLM Project
+title: "LLM Compressor"
+date-released: 2024-08-08
+url: https://github.com/vllm-project/llm-compressor

README.md

Lines changed: 14 additions & 0 deletions

@@ -120,3 +120,17 @@ output = model.generate("My name is")
 
 - If you have any questions or requests open an [issue](https://github.com/vllm-project/llm-compressor/issues) and we will add an example or documentation.
 - We appreciate contributions to the code, examples, integrations, and documentation as well as bug reports and feature requests! [Learn how here](CONTRIBUTING.md).
+
+## Citation
+
+If you find LLM Compressor useful in your research or projects, please consider citing it:
+
+```bibtex
+@software{llmcompressor2024,
+  title={{LLM Compressor}},
+  author={Red Hat AI and vLLM Project},
+  year={2024},
+  month={8},
+  url={https://github.com/vllm-project/llm-compressor},
+}
+```

examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py

Lines changed: 3 additions & 3 deletions

@@ -88,8 +88,8 @@
 tokenizer.save_pretrained(f"{output_dir}/quantization_stage")

 logger.info(
-    "llmcompressor does not currently support running ",
+    "llmcompressor does not currently support running "
     "compressed models in the marlin24 format. "
-    "The model produced from this example can be ",
-    "run on vLLM with dtype=torch.float16.",
+    "The model produced from this example can be "
+    "run on vLLM with dtype=torch.float16."
 )
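
Why this change matters: with the trailing commas, each string was passed as a separate positional argument to `logger.info`, and loggers generally treat everything after the first argument as formatting arguments, so only the first fragment reached the log. Adjacent string literals, by contrast, are concatenated by the parser. A minimal illustration (not part of the commit):

```python
# Adjacent string literals are merged into one string at parse time.
message = (
    "llmcompressor does not currently support running "
    "compressed models in the marlin24 format. "
    "The model produced from this example can be "
    "run on vLLM with dtype=torch.float16."
)
print(message)  # the full sentence, as a single string
```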

experimental/mistral/README.md

Lines changed: 45 additions & 0 deletions

@@ -0,0 +1,45 @@
+# Mistral-format model compression (experimental)
+
+This folder contains tools for compressing Mistral-format models, like `mistralai/Devstral-Small-2505` and `mistralai/Magistral-Small-2506`.
+
+## FP8 W8A8 Quantization
+
+This script quantizes Mistral-format models to FP8. It is not for use with HuggingFace-format models.
+
+### 1. Download the model
+
+Download the model and save it to a new "FP8" folder. We use `mistralai/Magistral-Small-2506` as an example.
+
+```bash
+huggingface-cli download mistralai/Magistral-Small-2506 --local-dir Magistral-Small-2506-FP8
+```
+
+### 2. Clean up HuggingFace-specific files
+
+Models from the Hub often include files for both the native Mistral format and the HuggingFace `transformers` format. This script works on the native format, so the `transformers` files should be removed to avoid confusion.
+
+The HuggingFace-specific files are typically `config.json`, `model-000*-of-000*.safetensors`, and `model.safetensors.index.json`. The `params.json`, `tekken.json`, and `consolidated.safetensors` files belong to the native format.
+
+Before deleting, it's a good idea to look at the files in the directory to understand what you're removing.
+
+Once you're ready, remove the `transformers`-specific files:
+
+```bash
+rm Magistral-Small-2506-FP8/config.json Magistral-Small-2506-FP8/model.safetensors.index.json Magistral-Small-2506-FP8/model-000*
+```
+
+### 3. Run the quantization script
+
+Now, run the FP8 quantization script on the directory. This will modify the `.safetensors` files in-place and update `params.json` (and the `consolidated.safetensors.index.json` index, if present).
+
+```bash
+python fp8_quantize.py Magistral-Small-2506-FP8
+```
+
+### 4. Use the quantized model
+
+The model should now be ready to use in vLLM!
+
+```bash
+vllm serve Magistral-Small-2506-FP8 --tokenizer-mode mistral --config-format mistral --load-format mistral --tool-call-parser mistral --enable-auto-tool-choice
+```
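
After step 3, a quick sanity check can confirm the conversion. The sketch below (not part of the commit) lists the `*_qscale_weight` scales that `fp8_quantize.py` writes next to each quantized weight; it assumes a single `consolidated.safetensors` file, so a sharded checkpoint would need each shard inspected in turn.

```python
# Hypothetical sanity check after running fp8_quantize.py.
from safetensors import safe_open

path = "Magistral-Small-2506-FP8/consolidated.safetensors"  # assumed single shard
with safe_open(path, framework="pt") as f:
    for name in f.keys():
        if name.endswith("qscale_weight"):
            # every quantized weight should carry a matching per-tensor scale
            print(name, f.get_tensor(name).item())
```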

experimental/mistral/fp8_quantize.py

Lines changed: 120 additions & 0 deletions

@@ -0,0 +1,120 @@
+import argparse
+import os
+import json
+import torch
+import safetensors.torch
+
+def per_tensor_quantize(tensor):
+    """Quantize a tensor to FP8 using per-tensor static scaling factor."""
+    finfo = torch.finfo(torch.float8_e4m3fn)
+    if tensor.numel() == 0:
+        min_val, max_val = torch.tensor(-16.0, dtype=tensor.dtype), torch.tensor(16.0, dtype=tensor.dtype)
+    else:
+        min_val, max_val = tensor.aminmax()
+    amax = torch.maximum(min_val.abs(), max_val.abs())
+    scale = finfo.max / amax.clamp(min=1e-12)
+    qweight = (tensor * scale).clamp(min=finfo.min, max=finfo.max).to(torch.float8_e4m3fn)
+    scale = scale.float().reciprocal()
+    return qweight, scale
+
+def is_quantizable(name):
+    """Check if the tensor name indicates it can be quantized."""
+    return name.startswith('layers.') and name.endswith(('.wk.weight', '.wo.weight', '.wq.weight', '.wv.weight', '.w1.weight', '.w2.weight', '.w3.weight'))
+
+def process_safetensors_file(file_path):
+    """Process a single safetensors file in-place, quantizing weights to FP8."""
+    print(f"Processing {file_path}")
+    tensors = safetensors.torch.load_file(file_path)
+
+    modified_tensors = {}
+    for name, tensor in tensors.items():
+        if is_quantizable(name):
+            print("Quantizing", name)
+            qweight, scale = per_tensor_quantize(tensor)
+            modified_tensors[name] = qweight
+            modified_tensors[f"{name[:-len("weight")]}qscale_weight"] = scale
+        else:
+            modified_tensors[name] = tensor
+
+    safetensors.torch.save_file(modified_tensors, file_path)
+    print(f"Updated {file_path} with quantized tensors")
+
+def update_index_file(index_file_path):
+    """Update the index file for the quantized model."""
+    print(f"Updating index file: {index_file_path}")
+    with open(index_file_path, 'r') as f:
+        index = json.load(f)
+
+    new_weight_map = {}
+    for tensor_name, file_name in index['weight_map'].items():
+        new_weight_map[tensor_name] = file_name
+        if is_quantizable(tensor_name):
+            new_weight_map[f"{tensor_name[:-len("weight")]}qscale_weight"] = file_name
+
+    index['weight_map'] = new_weight_map
+
+    # Recalculate total_size
+    total_size = sum(os.path.getsize(os.path.join(os.path.dirname(index_file_path), file))
+                     for file in set(index['weight_map'].values()))
+    index['metadata']['total_size'] = total_size
+
+    with open(index_file_path, 'w') as f:
+        json.dump(index, f, indent=2)
+    print(f"Updated index file {index_file_path}")
+
+def update_config(config_file_path):
+    """Update the params.json file for the quantized model."""
+    print(f"Updating config file: {config_file_path}")
+    with open(config_file_path, 'r') as f:
+        config = json.load(f)
+
+    config["quantization"] = {
+        "config_groups": {
+            "group_0": {
+                "input_activations": {
+                    "dynamic": True,
+                    "num_bits": 8,
+                    "observer": None,
+                    "strategy": "token",
+                    "symmetric": True,
+                    "type": "float"
+                },
+                "targets": ["Linear"],
+                "weights": {
+                    "dynamic": False,
+                    "num_bits": 8,
+                    "observer": "minmax",
+                    "strategy": "tensor",
+                    "symmetric": True,
+                    "type": "float"
+                }
+        }},
+        "format": "float-quantized",
+        "ignore": ["lm_head", "output"],
+        "quant_method": "compressed-tensors",
+        "quantization_status": "compressed"
+    }
+
+    with open(config_file_path, 'w') as f:
+        json.dump(config, f, indent=2)
+    print(f"Updated config file {config_file_path}")
+
+def process_directory(directory):
+    """Process all safetensors files in the given directory."""
+    for filename in os.listdir(directory):
+        file_path = os.path.join(directory, filename)
+        if filename.endswith('.safetensors'):
+            process_safetensors_file(file_path)
+        elif filename == 'consolidated.safetensors.index.json':
+            update_index_file(file_path)
+        elif filename == 'params.json':
+            update_config(file_path)
+        else:
+            print(f"Skipping unrecognized file: {filename}")
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Convert mistral safetensors model to FP8 in-place.')
+    parser.add_argument('directory', type=str, help='The directory containing the safetensors files and index file.')
+
+    args = parser.parse_args()
+    process_directory(args.directory)
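
To make the scale convention in `per_tensor_quantize` concrete: the function stores the reciprocal of the quantization scale, so dequantization is a single multiply. A minimal round-trip sketch (illustrative only, not part of the commit):

```python
import torch

# Quantize a random weight with the same per-tensor FP8 scheme as above,
# then reconstruct it to inspect the quantization error.
weight = torch.randn(256, 256)
finfo = torch.finfo(torch.float8_e4m3fn)

amax = weight.abs().amax()
scale = finfo.max / amax.clamp(min=1e-12)
qweight = (weight * scale).clamp(min=finfo.min, max=finfo.max).to(torch.float8_e4m3fn)
inv_scale = scale.reciprocal()  # this reciprocal is what gets stored as *_qscale_weight

recovered = qweight.to(torch.float32) * inv_scale
print("max abs error:", (recovered - weight).abs().max().item())
```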

setup.py

Lines changed: 2 additions & 2 deletions

@@ -122,9 +122,9 @@ def localversion_func(version: ScmVersion) -> str:
         "pynvml",
         "pillow",
         (
-            "compressed-tensors==0.9.4"
+            "compressed-tensors==0.10.1"
             if BUILD_TYPE == "release"
-            else "compressed-tensors>=0.10.1a2"
+            else "compressed-tensors>=0.10.2a2"
         ),
     ],
     extras_require={

src/llmcompressor/pytorch/model_load/helpers.py

Lines changed: 11 additions & 6 deletions

@@ -45,6 +45,16 @@ def save_checkpoint(
         get_model_compressor,  # avoid circular import
     )

+    # used for decompression
+    # unfortunately, if skip_sparsity_compression_stats==True, sparsity stats
+    # are computed twice. In the future, track sparsity from recipe or
+    # share recipe between compression and decompression
+    compressor = get_model_compressor(
+        model=model,
+        save_compressed=save_compressed,
+        skip_sparsity_compression_stats=skip_sparsity_compression_stats,
+    )
+
     # saving the model also saves the recipe
     model.save_pretrained(
         save_path,
@@ -55,13 +65,8 @@ def save_checkpoint(
     if processor is not None:
         processor.save_pretrained(save_path)

-    # saving the model modifies the model strcuture
+    # decompression: saving the model modifies the model structure
     # as this is only a checkpoint, decompress model to enable future training/oneshot
-    compressor = get_model_compressor(
-        model=model,
-        save_compressed=save_compressed,
-        skip_sparsity_compression_stats=skip_sparsity_compression_stats,
-    )
     if compressor is not None:
         compressor.decompress_model(model)
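
For orientation, here is a hedged sketch of how the reordered flow is used. The parameter names come from the hunk above, but the full `save_checkpoint` signature is not shown in this diff, so the exact call shape is an assumption:

```python
# Hypothetical call shape (signature inferred from the hunk above, not verified):
# the compressor is now built before save_pretrained and used to decompress the
# model afterwards, so the in-memory model stays usable for further training/oneshot.
from llmcompressor.pytorch.model_load.helpers import save_checkpoint

save_checkpoint(
    model=model,                               # assumed to be in scope
    save_path="output/checkpoint-1000",
    processor=tokenizer,                       # assumed tokenizer/processor
    save_compressed=True,
    skip_sparsity_compression_stats=True,      # skips the costly sparsity scan
)
```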

src/llmcompressor/transformers/finetune/session_mixin.py

Lines changed: 3 additions & 1 deletion

@@ -363,7 +363,7 @@ def save_model(
         self,
         output_dir: str,
         _internal_call: bool = False,
-        skip_sparsity_compression_stats: Optional[bool] = False,
+        skip_sparsity_compression_stats: Optional[bool] = True,
     ):
         """
         Override of the save_model function and expects it to exist in the parent.
@@ -388,6 +388,8 @@ def save_model(
         self.model.prepare_for_save()  # TODO: move to finalize

         # save checkpoint
+        # note that skip_sparsity_compression_stats
+        # is True by default to avoid high runtime cost
         self.save_state()
         if self.accelerator.is_main_process:
             processor = getattr(self, "processing_class", self.tokenizer)
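
A short usage note on the default change (the keyword appears in the signature above; the `trainer` object itself is assumed): callers who still want sparsity statistics can opt back in explicitly.

```python
# With the new default, the sparsity scan is skipped on save to keep runtime low.
# Pass skip_sparsity_compression_stats=False to recompute it for a final export.
# `trainer` is assumed to be a Trainer built on this session mixin.
trainer.save_model(
    output_dir="final-model",
    skip_sparsity_compression_stats=False,
)
```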
