Skip to content

Commit adef032

Browse files
committed
feat: add iMatrix weighted MSE observer and IMatrixGatherer
- imatrix_mse observer with E[x²] importance weighting
- IMatrixGatherer transform using match_named_modules + CPU offload
- Unit tests for observer and gatherer
- E2E integration tests

RFC #2456

Signed-off-by: Gilles Turpin <turpingilles15@gmail.com>
1 parent 822668a commit adef032

File tree

9 files changed

+1528
-0
lines changed

9 files changed

+1528
-0
lines changed
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
from compressed_tensors.offload import dispatch_model
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.modifiers.transform.imatrix import IMatrixGatherer

# Model selection and loading.
model_id = "meta-llama/Meta-Llama-3.1-8B"

model = AutoModelForCausalLM.from_pretrained(model_id, dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Calibration dataset.
DATASET_ID = "open_platypus"

# Calibration budget: 512 samples is a reasonable starting point;
# increasing the sample count can improve accuracy.
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048

# Quantization recipe:
#   1. IMatrixGatherer collects per-channel importance statistics (E[x²]).
#   2. QuantizationModifier quantizes the weights to 4 bit with group
#      size 128, using the imatrix_mse observer so that quantization
#      error is weighted by channel importance.
recipe = [
    IMatrixGatherer(ignore=["lm_head"]),
    QuantizationModifier(
        config_groups={
            "group_0": {
                "targets": ["Linear"],
                "weights": {
                    "num_bits": 4,
                    "type": "int",
                    "symmetric": True,
                    "strategy": "group",
                    "group_size": 128,
                    "observer": "imatrix_mse",
                    "observer_kwargs": {
                        "norm": 2.4,
                        "maxshrink": 0.20,
                        "grid": 20,
                    },
                },
            }
        },
        ignore=["lm_head"],
    ),
]

# Run calibration + quantization in a single one-shot pass.
oneshot(
    model=model,
    dataset=DATASET_ID,
    splits={"calibration": "train[:5%]"},
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

# Sanity-check a generation from the quantized model.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
dispatch_model(model)
sample = tokenizer("Hello my name is", return_tensors="pt")
sample = {k: v.to(model.device) for k, v in sample.items()}
output = model.generate(**sample, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================\n\n")

# Persist the compressed checkpoint next to the working directory.
model_name = model_id.rstrip("/").split("/")[-1]
SAVE_DIR = model_name + "-W4A16-G128-imatrix"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# ruff: noqa
2+
3+
from .base import *
Lines changed: 247 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,247 @@
1+
"""
2+
IMatrixGatherer: collects importance-weighted activation statistics
3+
(E[x²] per input channel) on Linear modules via forward pre-hooks.
4+
5+
Stores ``module._imatrix_importance`` as a 1-D float32 tensor of
6+
shape ``(in_features,)`` on each target module. Does **not** quantize
7+
or modify weights in any way.
8+
9+
The downstream ``imatrix_mse`` observer reads this attribute during
10+
its grid search to weight quantization error by channel importance.
11+
12+
Example recipe::
13+
14+
recipe:
15+
- IMatrixGatherer:
16+
ignore: ["lm_head"]
17+
- QuantizationModifier:
18+
config_groups:
19+
group_0:
20+
targets: ["Linear"]
21+
weights:
22+
observer: imatrix_mse
23+
24+
Or composed with AWQ and GPTQ::
25+
26+
recipe:
27+
- AWQModifier(...)
28+
- IMatrixGatherer:
29+
ignore: ["lm_head"]
30+
- GPTQModifier:
31+
config_groups:
32+
group_0:
33+
targets: ["Linear"]
34+
weights:
35+
observer: imatrix_mse
36+
37+
.. note::
38+
Auto-prepend (inserting the gatherer automatically when
39+
``imatrix_mse`` is detected in a recipe) is planned for a
40+
follow-up PR.
41+
42+
.. note::
43+
Unlike AWQModifier, this gatherer does not use IntermediatesCache
44+
because it only stores a single accumulated 1-D tensor per layer
45+
(not full batch activations). A simple CPU offload at
46+
CALIBRATION_EPOCH_END is sufficient.
47+
"""
48+
49+
from typing import Dict, List, Optional, Union
50+
51+
import torch
52+
from compressed_tensors.utils import match_named_modules
53+
from loguru import logger
54+
from pydantic import Field
55+
from torch.nn import Module
56+
57+
from llmcompressor.core import Event, EventType, State
58+
from llmcompressor.modifiers import Modifier
59+
60+
__all__ = ["IMatrixGatherer"]
61+
62+
63+
class IMatrixGatherer(Modifier):
    """
    Collects importance-weighted activation statistics (E[x²])
    for each targeted module via forward pre-hooks.

    Stores ``module._imatrix_importance`` as a 1-D float32 tensor
    of shape ``(in_features,)`` on each target module.

    Does NOT quantize. Does NOT modify weights.

    Statistics are kept on GPU during calibration for speed, then
    offloaded to CPU at CALIBRATION_EPOCH_END to free GPU memory
    before quantization begins.

    :param ignore: layer name patterns to skip (default: ``["lm_head"]``)
    :param targets: module types to instrument (default: ``["Linear"]``)
    """

    ignore: Union[str, List[str]] = Field(
        default_factory=lambda: ["lm_head"],
    )
    targets: Union[str, List[str]] = Field(
        default_factory=lambda: ["Linear"],
    )

    # -- internal state (excluded from serialisation) --
    # _target_names: module names selected by _resolve_targets
    # _sums: per-module running sum of x² across all calibration tokens
    # _counts: per-module number of tokens folded into _sums
    _target_names: Optional[List[str]] = None
    _sums: Optional[Dict[str, torch.Tensor]] = None
    _counts: Optional[Dict[str, int]] = None

    # ------------------------------------------------------------------ #
    # Lifecycle
    # ------------------------------------------------------------------ #

    def on_initialize(self, state: State, **kwargs) -> bool:
        """
        Validate one-shot usage and pre-allocate accumulators.

        :param state: session state carrying the model to instrument
        :raises ValueError: if start/end indicate a training-time schedule
        :return: True on successful initialization
        """
        # This modifier only makes sense in one-shot calibration: a
        # start/end other than None/-1 implies a training schedule.
        if self.end and self.end != -1:
            raise ValueError(
                f"{self.__class__.__name__} can only be applied "
                f"during one-shot. Expected end to be None or "
                f"-1, got {self.end}"
            )
        if self.start and self.start != -1:
            raise ValueError(
                f"{self.__class__.__name__} can only be applied "
                f"during one-shot. Expected start to be None "
                f"or -1, got {self.start}"
            )

        self._resolve_targets(state.model)
        return True

    def on_start(self, state: State, event: Event, **kwargs):
        # Begin collecting: install forward-pre hooks on all targets.
        self.started_ = True
        self._register_accumulation_hooks(state.model)

    def on_event(self, state: State, event: Event, **kwargs):
        if event.type_ == EventType.CALIBRATION_EPOCH_START:
            if not self.started_:
                self.on_start(state, None)

        # Intermediate attach keeps partially-accumulated statistics
        # available between sequential pipeline stages.
        if event.type_ == EventType.SEQUENTIAL_EPOCH_END:
            self._compute_and_attach(state.model)

        if event.type_ == EventType.CALIBRATION_EPOCH_END:
            # Final attach; move results to CPU to free GPU memory
            # before quantization starts.
            self._compute_and_attach(state.model, offload_to_cpu=True)

            if not self.ended_:
                self.on_end(state, None)

    def on_end(self, state: State, event: Event, **kwargs):
        # Stop collecting: detach all registered hooks exactly once.
        self.ended_ = True
        self.remove_hooks()

    def on_finalize(self, state: State, **kwargs) -> bool:
        if not self.ended_:
            self.on_end(state, None)

        # Drop accumulators; the per-module `_imatrix_importance`
        # attributes attached earlier remain for downstream observers.
        self._sums = None
        self._counts = None
        self._target_names = None
        return True

    # ------------------------------------------------------------------ #
    # Target resolution
    # ------------------------------------------------------------------ #

    def _resolve_targets(self, model: Module):
        """Identify target modules using compressed_tensors matching."""
        self._target_names = []
        self._sums = {}
        self._counts = {}

        for name, module in match_named_modules(model, self.targets, self.ignore):
            # Only modules exposing `in_features` (Linear-like) can
            # receive a per-input-channel importance vector.
            if not hasattr(module, "in_features"):
                continue

            self._target_names.append(name)
            self._sums[name] = torch.zeros(module.in_features, dtype=torch.float32)
            self._counts[name] = 0

        logger.info(f"IMatrixGatherer: targeting {len(self._target_names)}" f" modules")

    # ------------------------------------------------------------------ #
    # Hook registration
    # ------------------------------------------------------------------ #

    def _register_accumulation_hooks(self, model: Module):
        """Attach a forward-pre hook to every target module."""

        def _create_hook_fn(layer_name: str):
            """Closure captures layer_name."""

            def _hook(module: Module, args):
                # Forward-pre hooks receive positional args as a tuple;
                # unwrap down to the first tensor input.
                x = args[0] if not isinstance(args, torch.Tensor) else args
                if isinstance(x, tuple):
                    x = x[0]
                if not isinstance(x, torch.Tensor):
                    return

                # Per-token accumulation: sum x² over every leading
                # (batch/sequence) dim, leaving one value per input channel.
                x_f = x.detach().float()
                n_tokens = x_f[..., 0].numel()
                token_sum = x_f.pow(2).sum(dim=list(range(x_f.dim() - 1)))

                # Lazily migrate the accumulator to the activation's
                # device so accumulation stays on-GPU during calibration.
                device = self._sums[layer_name].device
                if device != token_sum.device:
                    self._sums[layer_name] = self._sums[layer_name].to(token_sum.device)

                self._sums[layer_name].add_(token_sum)
                self._counts[layer_name] += n_tokens

            return _hook

        for name, module in match_named_modules(model, self.targets, self.ignore):
            # Skip modules filtered out by _resolve_targets
            # (e.g. those without `in_features`).
            if name in self._sums:
                self.register_hook(
                    module,
                    _create_hook_fn(name),
                    "forward_pre",
                )

    # ------------------------------------------------------------------ #
    # Compute & attach
    # ------------------------------------------------------------------ #

    def _compute_and_attach(self, model: Module, offload_to_cpu: bool = False):
        """
        Compute E[x²] and store on each module.

        :param model: model whose modules receive importance data
        :param offload_to_cpu: if True, move importance tensors to CPU
            after attaching. Set at CALIBRATION_EPOCH_END to free
            GPU memory before quantization.
        """
        attached = 0
        for name, module in match_named_modules(model, self.targets, self.ignore):
            if name not in self._sums:
                continue

            # No tokens observed for this module (e.g. never executed);
            # leave it without an importance attribute.
            count = self._counts[name]
            if count == 0:
                continue

            # Mean of x² over all observed tokens = E[x²] per channel.
            importance = self._sums[name] / count

            if offload_to_cpu:
                importance = importance.to("cpu")
                # also free the accumulator
                del self._sums[name]

            module._imatrix_importance = importance

            attached += 1
            logger.debug(
                f"iMatrix {name}: "
                f"mean={importance.mean():.4f}, "
                f"max={importance.max():.4f}, "
                f"ratio="
                f"{importance.max() / (importance.mean() + 1e-10):.1f}"
            )

        logger.info(
            f"IMatrixGatherer: attached importance to "
            f"{attached} modules" + (" (offloaded to CPU)" if offload_to_cpu else "")
        )

src/llmcompressor/observers/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,4 @@
1414
from .moving_base import *
1515
from .min_max import *
1616
from .mse import *
17+
from .imatrix import *

0 commit comments

Comments
 (0)