Commit 54b07e7

Add the Quantization Methods.

1 parent bb57f2e commit 54b07e7

File tree

7 files changed: +992 −2 lines changed

Lines changed: 136 additions & 0 deletions
@@ -0,0 +1,136 @@
Quantization Methods
====================

QuantLLM provides three primary methods for model quantization, each with its own advantages:

1. GPTQ (Generative Pre-trained Transformer Quantization)
----------------------------------------------------------

GPTQ offers Hessian-based quantization with activation ordering for high accuracy:

.. code-block:: python

    from quantllm.quant import GPTQQuantizer

    # Initialize quantizer
    quantizer = GPTQQuantizer(
        model=model,
        bits=4,            # Quantization bits (2-8)
        group_size=128,    # Size of quantization groups
        actorder=True,     # Enable activation ordering
        use_triton=True    # Use Triton kernels for acceleration
    )

    # Quantize model
    quantized_model = quantizer.quantize(calibration_data=calibration_data)
2. AWQ (Activation-Aware Weight Quantization)
---------------------------------------------

AWQ adapts quantization based on activation patterns:

.. code-block:: python

    from quantllm.quant import AWQQuantizer

    quantizer = AWQQuantizer(
        model=model,
        bits=4,            # Quantization bits
        group_size=128,    # Group size for quantization
        zero_point=True,   # Enable zero point computation
        version="v2"       # AWQ version
    )

    # Quantize with activation statistics
    quantized_model = quantizer.quantize(
        calibration_data=calibration_data,
        calibration_steps=100
    )
3. GGUF (GGML Universal Format)
-------------------------------

GGUF provides an efficient format with CTransformers integration:

.. code-block:: python

    from quantllm.quant import GGUFQuantizer

    quantizer = GGUFQuantizer(
        model=model,
        bits=4,            # Quantization bits
        group_size=32,     # Group size
        use_packed=True    # Enable weight packing
    )

    # Quantize model
    quantized_model = quantizer.quantize()

    # Export to GGUF format
    quantizer.convert_to_gguf("model-q4.gguf")
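Once exported, the GGUF file can be served with CTransformers. The snippet below is a minimal sketch rather than part of the QuantLLM API: it assumes a LLaMA-family checkpoint and reuses the ``model-q4.gguf`` path from the example above; set ``model_type`` to match your architecture.

.. code-block:: python

    # Minimal sketch: serving the exported GGUF file with CTransformers.
    # Assumes a LLaMA-family model; adjust model_type for your architecture.
    from ctransformers import AutoModelForCausalLM

    llm = AutoModelForCausalLM.from_pretrained(
        "model-q4.gguf",     # path produced by convert_to_gguf above
        model_type="llama"   # assumed architecture
    )

    print(llm("Hello, world!", max_new_tokens=32))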
Choosing the Right Method
-------------------------

- **GPTQ**: Best for highest accuracy with slightly slower quantization
- **AWQ**: Best balance of speed and accuracy, good for general use
- **GGUF**: Best for deployment and inference with CTransformers
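As an illustration only (the ``pick_quantizer`` helper below is hypothetical, not part of QuantLLM), this guidance can be expressed as a simple dispatch:

.. code-block:: python

    # Hypothetical helper mapping a priority to a quantizer class.
    from quantllm.quant import GPTQQuantizer, AWQQuantizer, GGUFQuantizer

    def pick_quantizer(priority: str):
        """Return a quantizer class for 'accuracy', 'balanced', or 'deployment'."""
        return {
            "accuracy": GPTQQuantizer,    # highest accuracy, slower quantization
            "balanced": AWQQuantizer,     # good speed/accuracy trade-off
            "deployment": GGUFQuantizer,  # GGUF export for CTransformers inference
        }[priority]

    quantizer_cls = pick_quantizer("balanced")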
Resource Requirements
---------------------

+-------------+------------+-------------+------------+
| Method      | Memory     | Speed       | Accuracy   |
+=============+============+=============+============+
| GPTQ        | High       | Slow        | Highest    |
+-------------+------------+-------------+------------+
| AWQ         | Medium     | Fast        | High       |
+-------------+------------+-------------+------------+
| GGUF        | Low        | Very Fast   | Good       |
+-------------+------------+-------------+------------+
Common Parameters
-----------------

All quantizers support these common parameters:

- **bits**: Number of quantization bits (2-8)
- **group_size**: Size of quantization groups
- **calibration_data**: Data used for computing statistics
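The exact form of ``calibration_data`` is not pinned down on this page; a minimal sketch, assuming a batch of token IDs produced by the matching Hugging Face tokenizer, might look like this:

.. code-block:: python

    # Rough sketch: building calibration data as a batch of token IDs.
    # Assumes a Hugging Face tokenizer for the checkpoint being quantized.
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
    samples = [
        "Quantization reduces the memory footprint of large language models.",
        "Calibration text should resemble the data seen at inference time.",
    ]
    calibration_data = tokenizer(
        samples,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=128,
    )["input_ids"]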
Example Workflow
----------------

Here's a complete example of quantizing a model:

.. code-block:: python

    import torch
    from transformers import AutoTokenizer
    from quantllm import Model, ModelConfig
    from quantllm.quant import AWQQuantizer

    # 1. Load model and tokenizer
    model_config = ModelConfig(model_name="facebook/opt-350m")
    model = Model(model_config).get_model()
    tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

    # 2. Prepare calibration data
    calibration_data = prepare_calibration_data()  # Your calibration data

    # 3. Initialize quantizer
    quantizer = AWQQuantizer(
        model=model,
        bits=4,
        group_size=128
    )

    # 4. Quantize model
    quantized_model = quantizer.quantize(
        calibration_data=calibration_data,
        calibration_steps=100
    )

    # 5. Use the quantized model
    inputs = tokenizer("Hello, world!", return_tensors="pt")
    outputs = quantized_model(**inputs)

For more detailed examples, see the ``examples/quantization_examples.py`` file in the repository.

quantllm/quant/__init__.py

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
"""Quantization functionality for LLMs."""

from .quantization_engine import (
    QuantizationConfig,
    QuantizedLinear,
    QuantizationEngine
)
from .gptq import GPTQQuantizer
from .awq import AWQQuantizer
from .gguf import GGUFQuantizer

__all__ = [
    "QuantizationConfig",
    "QuantizedLinear",
    "QuantizationEngine",
    "GPTQQuantizer",
    "AWQQuantizer",
    "GGUFQuantizer"
]

quantllm/quant/awq.py

Lines changed: 199 additions & 0 deletions
@@ -0,0 +1,199 @@
"""AWQ (Activation-Aware Weight Quantization) implementation for LLM quantization."""

import torch
import torch.nn as nn
import numpy as np
from typing import Optional, Dict, Any, List, Union, Tuple
from transformers import PreTrainedModel
from .quantization_engine import QuantizationConfig, QuantizedLinear

class AWQQuantizer:
    """AWQ quantization implementation."""

    def __init__(
        self,
        model: PreTrainedModel,
        bits: int = 4,
        group_size: int = 128,
        zero_point: bool = True,
        scale_dtype: str = "fp32",
        version: str = "v2",
        enable_mnn_kernel: bool = False
    ):
        self.model = model
        self.bits = bits
        self.group_size = group_size
        self.zero_point = zero_point
        self.scale_dtype = scale_dtype
        self.version = version
        self.enable_mnn_kernel = enable_mnn_kernel

        # Initialize activation statistics dictionaries
        self.act_scales = {}
        self.weight_scales = {}

    def quantize(
        self,
        calibration_data: Optional[torch.Tensor] = None,
        calibration_steps: int = 100
    ) -> PreTrainedModel:
        """
        Quantize model using AWQ algorithm.

        Args:
            calibration_data: Data used for computing activation statistics
            calibration_steps: Number of steps for calibration

        Returns:
            Quantized model
        """
        if calibration_data is None:
            raise ValueError("AWQ requires calibration data for quantization")

        # Prepare model for quantization
        self.model.eval()

        # Collect activation statistics
        self._collect_activation_stats(calibration_data, calibration_steps)

        # Convert linear layers to quantized versions
        for name, module in self.model.named_modules():
            if isinstance(module, nn.Linear):
                # Get activation scale for this layer
                act_scale = self.act_scales.get(name, None)
                if act_scale is None:
                    continue

                # Convert to quantized layer
                quantized = self._quantize_layer(module, act_scale)

                # Replace layer in model
                parent_name = '.'.join(name.split('.')[:-1])
                child_name = name.split('.')[-1]
                if parent_name:
                    parent = self.model.get_submodule(parent_name)
                    setattr(parent, child_name, quantized)
                else:
                    setattr(self.model, name, quantized)

        return self.model

    def _collect_activation_stats(
        self,
        data: torch.Tensor,
        num_steps: int
    ):
        """Collect activation statistics for each layer."""

        # Register hooks for all linear layers
        handles = []
        for name, module in self.model.named_modules():
            if isinstance(module, nn.Linear):
                def hook_fn(name):
                    def fn(module, input, output):
                        if name not in self.act_scales:
                            self.act_scales[name] = []
                        x = input[0].detach()
                        scale = torch.max(torch.abs(x))
                        self.act_scales[name].append(scale)
                    return fn

                handles.append(
                    module.register_forward_hook(hook_fn(name))
                )

        # Run calibration
        with torch.no_grad():
            for _ in range(num_steps):
                self.model(data)

        # Remove hooks
        for handle in handles:
            handle.remove()

        # Process collected statistics
        for name in self.act_scales:
            scales = torch.stack(self.act_scales[name])
            # Use 99.9th percentile for more robust statistics
            self.act_scales[name] = torch.quantile(scales, 0.999)

    def _quantize_layer(
        self,
        layer: nn.Linear,
        act_scale: torch.Tensor
    ) -> QuantizedLinear:
        """Quantize a single layer using AWQ."""
        device = next(layer.parameters()).device

        # Initialize quantized layer
        quantized = QuantizedLinear(
            layer.in_features,
            layer.out_features,
            bias=layer.bias is not None,
            config=QuantizationConfig(
                bits=self.bits,
                scheme="symmetric",
                granularity="per-channel" if self.group_size > 0 else "per-tensor",
                calibration="minmax",
                channel_wise=True,
                dtype=f"int{self.bits}",
                format="awq"
            )
        )

        # Copy bias if exists
        if layer.bias is not None:
            quantized.bias.data.copy_(layer.bias.data)

        # Get weight matrix
        W = layer.weight.data.clone()

        # Scale weights by activation scale
        W = W / act_scale.view(1, -1)

        # Compute quantization scales per group of output rows
        if self.group_size > 0:
            n_groups = W.shape[0] // self.group_size
            W_groups = W.view(n_groups, self.group_size, -1)

            scales = []
            zero_points = [] if self.zero_point else None

            for idx in range(n_groups):
                group = W_groups[idx]
                max_abs = torch.max(torch.abs(group))
                scale = (2 ** (self.bits - 1) - 1) / max_abs
                scales.append(scale)

                if self.zero_point:
                    zero_point = -(torch.max(group) + torch.min(group)) / 2 * scale
                    zero_points.append(zero_point)

            scales = torch.stack(scales)
            if self.zero_point:
                zero_points = torch.stack(zero_points)
            else:
                zero_points = torch.zeros_like(scales)

            # Expand per-group parameters to one value per output row so they
            # broadcast against the full (out_features, in_features) matrix.
            scales = scales.repeat_interleave(self.group_size)
            zero_points = zero_points.repeat_interleave(self.group_size)
        else:
            max_abs = torch.max(torch.abs(W), dim=1)[0]
            scales = (2 ** (self.bits - 1) - 1) / max_abs
            if self.zero_point:
                max_vals = torch.max(W, dim=1)[0]
                min_vals = torch.min(W, dim=1)[0]
                zero_points = -(max_vals + min_vals) / 2 * scales
            else:
                zero_points = torch.zeros_like(scales)

        # Quantize weights
        W_quant = torch.round(W * scales.view(-1, 1) - zero_points.view(-1, 1))

        # Store quantized weights and parameters
        quantized.weight_quantized.copy_(W_quant.to(torch.int8))
        quantized.weight_scale.copy_(1.0 / scales)
        quantized.weight_zero_point.copy_(zero_points)

        # Store additional AWQ-specific information
        if hasattr(quantized, 'act_scale'):
            quantized.act_scale.copy_(act_scale)

        return quantized

0 commit comments
