1- """GPTQ (Goyal-Pham-Tan-Quant) implementation for LLM quantization ."""
1+ """GPTQ (Goyal-Pham-Tan-Quant) implementation."""
 
-import math
 import gc
+import math
 import torch
 import torch.nn as nn
 from typing import Optional, Dict, Any, List, Union
 from transformers import PreTrainedModel
-from .quantization_engine import QuantizationConfig, QuantizedLinear
+from .quantization_engine import QuantizationConfig, QuantizedLinear, DeviceManager
 
 class GPTQQuantizer:
     """GPTQ quantization implementation with memory-efficient processing."""
@@ -22,18 +22,22 @@ def __init__(
         percdamp: float = 0.01,
         sym: bool = True,
         batch_size: int = 4,
-        cpu_offload: bool = False
+        device: Optional[Union[str, torch.device]] = None
     ):
         self.model = model
         self.bits = bits
         self.group_size = group_size
         self.actorder = actorder
         self.allow_mixed_bits = allow_mixed_bits
-        self.use_triton = use_triton
+        self.use_triton = use_triton and torch.cuda.is_available()
         self.percdamp = percdamp
         self.sym = sym
         self.batch_size = batch_size
-        self.cpu_offload = cpu_offload
+
+        # Initialize device manager
+        self.device_manager = DeviceManager(
+            torch.device(device) if device else None
+        )
 
         # Initialize H matrices for each layer
         self.H = {}
@@ -43,6 +47,7 @@ def _clear_memory(self):
         gc.collect()
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
+        self.device_manager.sync()
 
     def quantize(self, calibration_data: Optional[torch.Tensor] = None) -> PreTrainedModel:
         """
@@ -56,24 +61,25 @@ def quantize(self, calibration_data: Optional[torch.Tensor] = None) -> PreTraine
5661 """
5762 if calibration_data is None :
5863 raise ValueError ("GPTQ requires calibration data for quantization" )
59-
60- # Prepare model for quantization
64+
65+ device = self .device_manager .primary_device
66+ self .model .to (device )
6167 self .model .eval ()
6268
63- # Convert all linear layers to quantizable versions
69+ # Convert calibration data to correct device
70+ calibration_data = calibration_data .to (device )
71+
72+ # Process layers
6473 for name , module in self .model .named_modules ():
6574 if isinstance (module , nn .Linear ):
6675 print (f"Processing layer: { name } " )
6776
68- # Compute Hessian approximation for this layer
69- self .H [name ] = self ._compute_hessian (module , calibration_data )
70-
71- # Move Hessian to CPU if offloading is enabled
72- if self .cpu_offload :
73- self .H [name ] = self .H [name ].cpu ()
77+ # Ensure layer is on correct device
78+ module .to (device )
7479
75- # Convert to quantized layer
76- quantized = self ._quantize_layer (module , self .H [name ].to (module .weight .device ) if self .cpu_offload else self .H [name ])
80+ # Compute Hessian approximation
81+ self .H [name ] = self ._compute_hessian (module , calibration_data )
82+ quantized = self ._quantize_layer (module , self .H [name ])
7783
7884 # Replace layer in model
7985 parent_name = '.' .join (name .split ('.' )[:- 1 ])
@@ -87,65 +93,57 @@ def quantize(self, calibration_data: Optional[torch.Tensor] = None) -> PreTraine
                 # Clear memory after processing each layer
                 self._clear_memory()
 
-                # Remove processed Hessian to free memory
+                # Remove processed Hessian
                 del self.H[name]
 
         return self.model
-
+
     def _compute_hessian(self, layer: nn.Linear, data: torch.Tensor) -> torch.Tensor:
-        """Compute Hessian approximation for a layer with memory-efficient batch processing."""
-        device = next(layer.parameters()).device
-
-        # Initialize accumulator on CPU if offloading is enabled
+        """Compute Hessian approximation for a layer with memory-efficient processing."""
+        device = self.device_manager.primary_device
         n = layer.in_features
-        H = torch.zeros((n, n), device='cpu' if self.cpu_offload else device)
+        H = torch.zeros((n, n), device=device)
 
         def hook_fn(module, input, output):
             x = input[0].detach()
-            # Reshape input if needed (batch_size * seq_len, hidden_size)
             if len(x.shape) == 3:
                 x = x.view(-1, x.size(-1))
 
             with torch.no_grad():
-                # Process in smaller chunks to save memory
-                chunk_size = 1024  # Adjust based on available memory
+                chunk_size = 1024
                 num_chunks = math.ceil(x.size(0) / chunk_size)
 
                 for i in range(num_chunks):
                     chunk = x[i * chunk_size:(i + 1) * chunk_size]
-                    # Compute contribution to Hessian
-                    if self.cpu_offload:
-                        chunk_H = torch.matmul(chunk.t(), chunk).cpu()
-                    else:
-                        chunk_H = torch.matmul(chunk.t(), chunk)
+                    chunk_H = torch.matmul(chunk.t(), chunk)
                     H.add_(chunk_H)
 
-                    # Clear intermediate tensors
                     del chunk_H
-                    if i % 10 == 0:  # Periodic memory cleanup
+                    if i % 10 == 0:
                         self._clear_memory()
 
         # Register forward hook
         handle = layer.register_forward_hook(hook_fn)
 
-        # Run calibration data through model in batches
+        # Process calibration data in batches
        with torch.no_grad():
             for i in range(0, len(data), self.batch_size):
-                batch = data[i:i + self.batch_size]
+                batch = data[i:i + self.batch_size].to(device)
                 self.model(batch)
 
-                # Periodic memory cleanup
                 if i % (self.batch_size * 10) == 0:
                     self._clear_memory()
 
-        # Remove hook
         handle.remove()
-
         return H
 
     def _quantize_layer(self, layer: nn.Linear, H: torch.Tensor) -> QuantizedLinear:
-        """Quantize a single layer using GPTQ."""
-        device = next(layer.parameters()).device
+        """Quantize a single layer using GPTQ with memory management."""
+        device = self.device_manager.primary_device
+
+        # Ensure tensors are on the correct device
+        H = H.to(device)
+        W = layer.weight.data.to(device)
 
         # Initialize quantized layer
         quantized = QuantizedLinear(
@@ -155,65 +153,42 @@ def _quantize_layer(self, layer: nn.Linear, H: torch.Tensor) -> QuantizedLinear:
             config=QuantizationConfig(
                 bits=self.bits,
                 scheme="symmetric" if self.sym else "asymmetric",
-                granularity="per-tensor",
-                calibration="minmax",
-                channel_wise=False,
-                dtype=f"{'u' if not self.sym else ''}int{self.bits}",
-                format="gptq"
+                granularity="per-channel",
+                calibration="gptq"
             )
-        )
+        ).to(device)
 
-        # Copy bias if exists
         if layer.bias is not None:
             quantized.bias.data.copy_(layer.bias.data)
 
-        # Get weight matrix
-        W = layer.weight.data.clone()
-
-        # Compute optimal scales and zero points
-        if self.group_size > 0:
-            n_groups = W.shape[0] // self.group_size
-            W_groups = W.view(n_groups, self.group_size, -1)
-            scales = []
-            zero_points = []
+        # Process in chunks to save memory
+        chunk_size = min(1024, layer.out_features)
+        for i in range(0, layer.out_features, chunk_size):
+            chunk_end = min(i + chunk_size, layer.out_features)
+            W_chunk = W[i:chunk_end]
 
-            for idx in range(n_groups):
-                group = W_groups[idx]
-                if self.sym:
-                    scale = (2 ** (self.bits - 1) - 1) / torch.max(torch.abs(group))
-                    zero_point = 0
-                else:
-                    min_val = torch.min(group)
-                    max_val = torch.max(group)
-                    scale = (2 ** self.bits - 1) / (max_val - min_val)
-                    zero_point = -min_val * scale
-
-                scales.append(scale)
-                zero_points.append(zero_point)
-
-            scales = torch.stack(scales)
-            zero_points = torch.stack(zero_points)
-        else:
+            # Compute per-row scale and zero point for this chunk (clamp guards against zero ranges)
             if self.sym:
-                scales = (2 ** (self.bits - 1) - 1) / torch.max(torch.abs(W), dim=1)[0]
-                zero_points = torch.zeros_like(scales)
+                max_val = W_chunk.abs().max(dim=1)[0]
+                scale = (2 ** (self.bits - 1) - 1) / max_val.clamp(min=1e-8)
+                zero_point = torch.zeros_like(scale)
+                qmin, qmax = -(2 ** (self.bits - 1)), 2 ** (self.bits - 1) - 1
             else:
-                min_vals = torch.min(W, dim=1)[0]
-                max_vals = torch.max(W, dim=1)[0]
-                scales = (2 ** self.bits - 1) / (max_vals - min_vals)
-                zero_points = -min_vals * scales
-
-        # Quantize weights
-        W_quant = torch.round(W * scales.view(-1, 1) - zero_points.view(-1, 1))
-
-        # Apply GPTQ optimization
-        recon_loss = torch.sum((W - (W_quant + zero_points.view(-1, 1)) / scales.view(-1, 1)).pow(2))
-        if H is not None:
-            recon_loss = recon_loss * torch.trace(H)
-
-        # Store quantized weights and parameters
-        quantized.weight_quantized.copy_(W_quant.to(torch.int8))
-        quantized.weight_scale.copy_(1.0 / scales)
-        quantized.weight_zero_point.copy_(zero_points)
+                min_val = W_chunk.min(dim=1)[0]
+                max_val = W_chunk.max(dim=1)[0]
+                scale = (2 ** self.bits - 1) / (max_val - min_val).clamp(min=1e-8)
+                zero_point = torch.round(-min_val * scale)
+                qmin, qmax = 0, 2 ** self.bits - 1
+
+            # Quantize chunk (affine mapping: q = round(w * scale) + zero_point)
+            W_quant = torch.round(W_chunk * scale.unsqueeze(1)) + zero_point.unsqueeze(1)
+            W_quant = torch.clamp(W_quant, qmin, qmax)
+
+            # Store quantized weights, per-row scales, and zero points
+            quantized.weight_quantized.data[i:chunk_end] = W_quant.to(torch.int8)
+            quantized.weight_scale.data[i:chunk_end] = 1.0 / scale
+            quantized.weight_zero_point.data[i:chunk_end] = zero_point
+
+            del W_chunk, W_quant
+            self._clear_memory()
 
         return quantized
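
Usage note: with this change, callers pass a device instead of cpu_offload. Below is a minimal sketch, assuming a causal LM from transformers; the model name, sample texts, and bit/group settings are illustrative placeholders, not part of this commit.

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer
    # GPTQQuantizer is assumed to be importable from this package's GPTQ module.

    model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
    tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")

    # Small batch of tokenized calibration text, shape [n_samples, seq_len].
    texts = ["GPTQ calibrates on a few representative sentences."] * 8
    calibration = tokenizer(texts, return_tensors="pt", padding=True).input_ids

    quantizer = GPTQQuantizer(
        model,
        bits=4,
        group_size=128,
        device="cuda" if torch.cuda.is_available() else "cpu",  # replaces cpu_offload
    )
    quantized_model = quantizer.quantize(calibration_data=calibration)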
0 commit comments
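The diff only shows DeviceManager being constructed with an optional torch.device and used through primary_device and sync(); the actual class lives in quantization_engine and is not part of this commit. A minimal compatible sketch, purely as an assumption about that interface:

    from typing import Optional
    import torch

    class DeviceManager:
        """Assumed interface: pick a primary device and synchronize it."""

        def __init__(self, device: Optional[torch.device] = None):
            # Fall back to CUDA when available, otherwise CPU.
            self.primary_device = device or torch.device(
                "cuda" if torch.cuda.is_available() else "cpu"
            )

        def sync(self) -> None:
            # Block until queued kernels on the primary device finish.
            if self.primary_device.type == "cuda":
                torch.cuda.synchronize(self.primary_device)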