
Commit 6d942cc

Merge remote-tracking branch 'origin' into kylesayrs/sequential-onloading
2 parents 819df1c + 1648528

9 files changed: +206 additions, -12 deletions

CITATION.cff

Lines changed: 8 additions & 0 deletions

@@ -0,0 +1,8 @@
+cff-version: 1.2.0
+message: "If you use this software, please cite it as below."
+authors:
+- name: Red Hat AI
+- name: vLLM Project
+title: "LLM Compressor"
+date-released: 2024-08-08
+url: https://github.com/vllm-project/llm-compressor

README.md

Lines changed: 14 additions & 0 deletions

@@ -120,3 +120,17 @@ output = model.generate("My name is")
 
 - If you have any questions or requests open an [issue](https://github.com/vllm-project/llm-compressor/issues) and we will add an example or documentation.
 - We appreciate contributions to the code, examples, integrations, and documentation as well as bug reports and feature requests! [Learn how here](CONTRIBUTING.md).
+
+## Citation
+
+If you find LLM Compressor useful in your research or projects, please consider citing it:
+
+```bibtex
+@software{llmcompressor2024,
+  title={{LLM Compressor}},
+  author={Red Hat AI and vLLM Project},
+  year={2024},
+  month={8},
+  url={https://github.com/vllm-project/llm-compressor},
+}
+```

examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py

Lines changed: 3 additions & 3 deletions

@@ -88,8 +88,8 @@
 tokenizer.save_pretrained(f"{output_dir}/quantization_stage")

 logger.info(
-    "llmcompressor does not currently support running ",
+    "llmcompressor does not currently support running "
     "compressed models in the marlin24 format. "
-    "The model produced from this example can be ",
-    "run on vLLM with dtype=torch.float16.",
+    "The model produced from this example can be "
+    "run on vLLM with dtype=torch.float16."
 )
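
Why this change matters: with the trailing commas, each string was passed as a separate positional argument to `logger.info`, and loggers generally treat everything after the first argument as formatting arguments, so only the first fragment reached the log. Adjacent string literals, by contrast, are concatenated by the parser. A minimal illustration (not part of the commit):

```python
# Adjacent string literals are merged into one string at parse time.
message = (
    "llmcompressor does not currently support running "
    "compressed models in the marlin24 format. "
    "The model produced from this example can be "
    "run on vLLM with dtype=torch.float16."
)
print(message)  # the full sentence, as a single string
```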

experimental/mistral/README.md

Lines changed: 45 additions & 0 deletions

@@ -0,0 +1,45 @@
+# Mistral-format model compression (experimental)
+
+This folder contains tools for compressing Mistral-format models, like `mistralai/Devstral-Small-2505` and `mistralai/Magistral-Small-2506`.
+
+## FP8 W8A8 Quantization
+
+This script quantizes Mistral-format models to FP8. It is not for use with HuggingFace-format models.
+
+### 1. Download the model
+
+Download the model and save it to a new "FP8" folder. We use `mistralai/Magistral-Small-2506` as an example.
+
+```bash
+huggingface-cli download mistralai/Magistral-Small-2506 --local-dir Magistral-Small-2506-FP8
+```
+
+### 2. Clean up HuggingFace-specific files
+
+Models from the Hub often include files for both the native Mistral format and the HuggingFace `transformers` format. This script works on the native format, so the `transformers` files should be removed to avoid confusion.
+
+The HuggingFace-specific files are typically `config.json`, `model-000*-of-000*.safetensors`, and `model.safetensors.index.json`. The `params.json`, `tekken.json`, and `consolidated.safetensors` files belong to the native format.
+
+Before deleting, it's a good idea to look at the files in the directory to understand what you're removing.
+
+Once you're ready, remove the `transformers`-specific files:
+
+```bash
+rm Magistral-Small-2506-FP8/config.json Magistral-Small-2506-FP8/model.safetensors.index.json Magistral-Small-2506-FP8/model-000*
+```
+
+### 3. Run the quantization script
+
+Now, run the FP8 quantization script on the directory. This will modify the `.safetensors` files in-place and update `params.json` (and the `consolidated.safetensors.index.json` index, if present).
+
+```bash
+python fp8_quantize.py Magistral-Small-2506-FP8
+```
+
+### 4. Use the quantized model
+
+The model should now be ready to use in vLLM!
+
+```bash
+vllm serve Magistral-Small-2506-FP8 --tokenizer-mode mistral --config-format mistral --load-format mistral --tool-call-parser mistral --enable-auto-tool-choice
+```
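
After step 3, a quick sanity check can confirm the conversion. The sketch below (not part of the commit) lists the `*_qscale_weight` scales that `fp8_quantize.py` writes next to each quantized weight; it assumes a single `consolidated.safetensors` file, so a sharded checkpoint would need each shard inspected in turn.

```python
# Hypothetical sanity check after running fp8_quantize.py.
from safetensors import safe_open

path = "Magistral-Small-2506-FP8/consolidated.safetensors"  # assumed single shard
with safe_open(path, framework="pt") as f:
    for name in f.keys():
        if name.endswith("qscale_weight"):
            # every quantized weight should carry a matching per-tensor scale
            print(name, f.get_tensor(name).item())
```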

experimental/mistral/fp8_quantize.py

Lines changed: 120 additions & 0 deletions

@@ -0,0 +1,120 @@
+import argparse
+import os
+import json
+import torch
+import safetensors.torch
+
+def per_tensor_quantize(tensor):
+    """Quantize a tensor to FP8 using per-tensor static scaling factor."""
+    finfo = torch.finfo(torch.float8_e4m3fn)
+    if tensor.numel() == 0:
+        min_val, max_val = torch.tensor(-16.0, dtype=tensor.dtype), torch.tensor(16.0, dtype=tensor.dtype)
+    else:
+        min_val, max_val = tensor.aminmax()
+    amax = torch.maximum(min_val.abs(), max_val.abs())
+    scale = finfo.max / amax.clamp(min=1e-12)
+    qweight = (tensor * scale).clamp(min=finfo.min, max=finfo.max).to(torch.float8_e4m3fn)
+    scale = scale.float().reciprocal()
+    return qweight, scale
+
+def is_quantizable(name):
+    """Check if the tensor name indicates it can be quantized."""
+    return name.startswith('layers.') and name.endswith(('.wk.weight', '.wo.weight', '.wq.weight', '.wv.weight', '.w1.weight', '.w2.weight', '.w3.weight'))
+
+def process_safetensors_file(file_path):
+    """Process a single safetensors file in-place, quantizing weights to FP8."""
+    print(f"Processing {file_path}")
+    tensors = safetensors.torch.load_file(file_path)
+
+    modified_tensors = {}
+    for name, tensor in tensors.items():
+        if is_quantizable(name):
+            print("Quantizing", name)
+            qweight, scale = per_tensor_quantize(tensor)
+            modified_tensors[name] = qweight
+            modified_tensors[f"{name[:-len("weight")]}qscale_weight"] = scale
+        else:
+            modified_tensors[name] = tensor
+
+    safetensors.torch.save_file(modified_tensors, file_path)
+    print(f"Updated {file_path} with quantized tensors")
+
+def update_index_file(index_file_path):
+    """Update the index file for the quantized model."""
+    print(f"Updating index file: {index_file_path}")
+    with open(index_file_path, 'r') as f:
+        index = json.load(f)
+
+    new_weight_map = {}
+    for tensor_name, file_name in index['weight_map'].items():
+        new_weight_map[tensor_name] = file_name
+        if is_quantizable(tensor_name):
+            new_weight_map[f"{tensor_name[:-len("weight")]}qscale_weight"] = file_name
+
+    index['weight_map'] = new_weight_map
+
+    # Recalculate total_size
+    total_size = sum(os.path.getsize(os.path.join(os.path.dirname(index_file_path), file))
+                     for file in set(index['weight_map'].values()))
+    index['metadata']['total_size'] = total_size
+
+    with open(index_file_path, 'w') as f:
+        json.dump(index, f, indent=2)
+    print(f"Updated index file {index_file_path}")
+
+def update_config(config_file_path):
+    """Update the params.json file for the quantized model."""
+    print(f"Updating config file: {config_file_path}")
+    with open(config_file_path, 'r') as f:
+        config = json.load(f)
+
+    config["quantization"] = {
+        "config_groups": {
+            "group_0": {
+                "input_activations": {
+                    "dynamic": True,
+                    "num_bits": 8,
+                    "observer": None,
+                    "strategy": "token",
+                    "symmetric": True,
+                    "type": "float"
+                },
+                "targets": ["Linear"],
+                "weights": {
+                    "dynamic": False,
+                    "num_bits": 8,
+                    "observer": "minmax",
+                    "strategy": "tensor",
+                    "symmetric": True,
+                    "type": "float"
+                }
+        }},
+        "format": "float-quantized",
+        "ignore": ["lm_head", "output"],
+        "quant_method": "compressed-tensors",
+        "quantization_status": "compressed"
+    }
+
+    with open(config_file_path, 'w') as f:
+        json.dump(config, f, indent=2)
+    print(f"Updated config file {config_file_path}")
+
+def process_directory(directory):
+    """Process all safetensors files in the given directory."""
+    for filename in os.listdir(directory):
+        file_path = os.path.join(directory, filename)
+        if filename.endswith('.safetensors'):
+            process_safetensors_file(file_path)
+        elif filename == 'consolidated.safetensors.index.json':
+            update_index_file(file_path)
+        elif filename == 'params.json':
+            update_config(file_path)
+        else:
+            print(f"Skipping unrecognized file: {filename}")
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Convert mistral safetensors model to FP8 in-place.')
+    parser.add_argument('directory', type=str, help='The directory containing the safetensors files and index file.')
+
+    args = parser.parse_args()
+    process_directory(args.directory)
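
To make the scale convention in `per_tensor_quantize` concrete: the function stores the reciprocal of the quantization scale, so dequantization is a single multiply. A minimal round-trip sketch (illustrative only, not part of the commit):

```python
import torch

# Quantize a random weight with the same per-tensor FP8 scheme as above,
# then reconstruct it to inspect the quantization error.
weight = torch.randn(256, 256)
finfo = torch.finfo(torch.float8_e4m3fn)

amax = weight.abs().amax()
scale = finfo.max / amax.clamp(min=1e-12)
qweight = (weight * scale).clamp(min=finfo.min, max=finfo.max).to(torch.float8_e4m3fn)
inv_scale = scale.reciprocal()  # this reciprocal is what gets stored as *_qscale_weight

recovered = qweight.to(torch.float32) * inv_scale
print("max abs error:", (recovered - weight).abs().max().item())
```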

setup.py

Lines changed: 2 additions & 2 deletions

@@ -122,9 +122,9 @@ def localversion_func(version: ScmVersion) -> str:
         "pynvml",
         "pillow",
         (
-            "compressed-tensors==0.9.4"
+            "compressed-tensors==0.10.1"
             if BUILD_TYPE == "release"
-            else "compressed-tensors>=0.10.1a2"
+            else "compressed-tensors>=0.10.2a2"
         ),
     ],
     extras_require={

src/llmcompressor/pytorch/model_load/helpers.py

Lines changed: 11 additions & 6 deletions

@@ -45,6 +45,16 @@ def save_checkpoint(
         get_model_compressor,  # avoid circular import
     )

+    # used for decompression
+    # unfortunately, if skip_sparsity_compression_stats==True, sparsity stats
+    # are computed twice. In the future, track sparsity from recipe or
+    # share recipe between compression and decompression
+    compressor = get_model_compressor(
+        model=model,
+        save_compressed=save_compressed,
+        skip_sparsity_compression_stats=skip_sparsity_compression_stats,
+    )
+
     # saving the model also saves the recipe
     model.save_pretrained(
         save_path,
@@ -55,13 +65,8 @@ def save_checkpoint(
     if processor is not None:
         processor.save_pretrained(save_path)

-    # saving the model modifies the model strcuture
+    # decompression: saving the model modifies the model structure
     # as this is only a checkpoint, decompress model to enable future training/oneshot
-    compressor = get_model_compressor(
-        model=model,
-        save_compressed=save_compressed,
-        skip_sparsity_compression_stats=skip_sparsity_compression_stats,
-    )
     if compressor is not None:
         compressor.decompress_model(model)
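
For orientation, here is a hedged sketch of how the reordered flow is used. The parameter names come from the hunk above, but the full `save_checkpoint` signature is not shown in this diff, so the exact call shape is an assumption:

```python
# Hypothetical call shape (signature inferred from the hunk above, not verified):
# the compressor is now built before save_pretrained and used to decompress the
# model afterwards, so the in-memory model stays usable for further training/oneshot.
from llmcompressor.pytorch.model_load.helpers import save_checkpoint

save_checkpoint(
    model=model,                               # assumed to be in scope
    save_path="output/checkpoint-1000",
    processor=tokenizer,                       # assumed tokenizer/processor
    save_compressed=True,
    skip_sparsity_compression_stats=True,      # skips the costly sparsity scan
)
```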

src/llmcompressor/transformers/finetune/session_mixin.py

Lines changed: 3 additions & 1 deletion

@@ -363,7 +363,7 @@ def save_model(
         self,
         output_dir: str,
         _internal_call: bool = False,
-        skip_sparsity_compression_stats: Optional[bool] = False,
+        skip_sparsity_compression_stats: Optional[bool] = True,
     ):
         """
         Override of the save_model function and expects it to exist in the parent.
@@ -388,6 +388,8 @@ def save_model(
         self.model.prepare_for_save()  # TODO: move to finalize

         # save checkpoint
+        # note that skip_sparsity_compression_stats
+        # is True by default to avoid high runtime cost
         self.save_state()
         if self.accelerator.is_main_process:
             processor = getattr(self, "processing_class", self.tokenizer)
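
A short usage note on the default change (the keyword appears in the signature above; the `trainer` object itself is assumed): callers who still want sparsity statistics can opt back in explicitly.

```python
# With the new default, the sparsity scan is skipped on save to keep runtime low.
# Pass skip_sparsity_compression_stats=False to recompute it for a final export.
# `trainer` is assumed to be a Trainer built on this session mixin.
trainer.save_model(
    output_dir="final-model",
    skip_sparsity_compression_stats=False,
)
```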
