
Commit e15d944

Add quantization methods.
1 parent 54b07e7 commit e15d944

File tree: 6 files changed, 468 insertions(+), 13 deletions(-)


.gitignore

Lines changed: 2 additions & 0 deletions
@@ -9,6 +9,8 @@ __pycache__/
 
 upcoming.md
 examples/
+*.ipynb
+
 
 logs
 main.py

quantllm/quant/awq.py

Lines changed: 42 additions & 7 deletions
@@ -77,8 +77,7 @@ def quantize(
         setattr(self.model, name, quantized)
 
         return self.model
-
-    def _collect_activation_stats(
+    def _collect_activation_stats(
         self,
         data: torch.Tensor,
         num_steps: int
@@ -94,23 +93,59 @@ def fn(module, input, output):
                 if name not in self.act_scales:
                     self.act_scales[name] = []
                 x = input[0].detach()
-                scale = torch.max(torch.abs(x))
-                self.act_scales[name].append(scale)
+                # Handle both 2D and 3D inputs
+                if len(x.shape) == 3:
+                    # For 3D input (batch_size, seq_len, hidden_size)
+                    scale = torch.max(torch.abs(x.view(-1, x.size(-1))))
+                else:
+                    scale = torch.max(torch.abs(x))
+                self.act_scales[name].append(scale.cpu())  # Move to CPU to save memory
             return fn
 
             handles.append(
                 module.register_forward_hook(hook_fn(name))
             )
 
-        # Run calibration
+        # Run calibration in smaller batches
         with torch.no_grad():
-            for _ in range(num_steps):
-                self.model(data)
+            batch_size = 2  # Small batch size to prevent OOM
+            for step in range(num_steps):
+                # Clear CUDA cache periodically
+                if step % 10 == 0:
+                    torch.cuda.empty_cache()
+
+                # Process a small batch
+                start_idx = (step * batch_size) % len(data)
+                end_idx = min(start_idx + batch_size, len(data))
+                batch = data[start_idx:end_idx]
+
+                # Move batch to appropriate device
+                device = next(self.model.parameters()).device
+                batch = batch.to(device)
+
+                self.model(batch)
+
+                # Move batch back to CPU to free GPU memory
+                batch = batch.cpu()
 
         # Remove hooks
         for handle in handles:
             handle.remove()
 
+        # Move model to CPU temporarily to free GPU memory
+        self.model = self.model.cpu()
+        torch.cuda.empty_cache()
+
+        # Process collected statistics on CPU
+        for name in self.act_scales:
+            scales = torch.stack(self.act_scales[name])
+            # Use 99.9th percentile for more robust statistics
+            self.act_scales[name] = torch.quantile(scales, 0.999)
+
+        # Move model back to GPU
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.model = self.model.to(device)
+
         # Process collected statistics
         for name in self.act_scales:
            scales = torch.stack(self.act_scales[name])

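The AWQ change above replaces a single max-abs activation scale with per-batch scales that are later reduced with a 99.9th-percentile. Below is a minimal, self-contained sketch of that hook-and-percentile pattern; the helper name collect_scales and its arguments are illustrative only, not part of quantllm's API.

# Illustrative sketch of the calibration pattern used in awq.py above.
import torch
import torch.nn as nn

def collect_scales(model: nn.Module, batches, percentile: float = 0.999):
    """Record a max-abs scale per Linear input on each batch, then reduce with a percentile."""
    raw = {}      # layer name -> list of per-batch max-abs scales (kept on CPU)
    handles = []

    def make_hook(name):
        def hook(module, inputs, output):
            x = inputs[0].detach()
            if x.dim() == 3:                      # (batch, seq_len, hidden)
                x = x.reshape(-1, x.size(-1))
            raw.setdefault(name, []).append(x.abs().max().cpu())
        return hook

    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            handles.append(module.register_forward_hook(make_hook(name)))

    with torch.no_grad():
        for batch in batches:                     # small batches keep peak memory low
            model(batch)

    for h in handles:
        h.remove()

    # A high percentile is more robust to outlier activations than a plain max
    return {n: torch.quantile(torch.stack(s), percentile) for n, s in raw.items()}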
quantllm/quant/gguf.py

Lines changed: 2 additions & 3 deletions
@@ -135,8 +135,7 @@ def fn(module, input, output):
         }
 
         return stats
-
-    def _quantize_layer(
+    def _quantize_layer(
         self,
         layer: nn.Linear,
         stats: Optional[Dict[str, torch.Tensor]] = None
@@ -152,7 +151,7 @@ def _quantize_layer(
             config=QuantizationConfig(
                 bits=self.bits,
                 scheme="symmetric",
-                granularity="per-group" if self.group_size > 0 else "per-tensor",
+                granularity="per-channel" if self.group_size > 0 else "per-tensor",
                 calibration="minmax",
                 channel_wise=self.group_size > 0,
                 dtype=f"int{self.bits}",

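The GGUF change swaps the granularity label from "per-group" to "per-channel" when group_size > 0. The sketch below illustrates what the two granularities mean for a weight matrix under symmetric int-N quantization; symmetric_scales is a hypothetical helper, not the package's QuantizationConfig.

# Hedged sketch of per-channel vs. per-tensor symmetric scales.
import torch

def symmetric_scales(weight: torch.Tensor, bits: int = 4, per_channel: bool = True):
    """Return symmetric scales: one per output channel, or one for the whole tensor."""
    qmax = 2 ** (bits - 1) - 1                       # e.g. 7 for int4
    if per_channel:
        # one scale per row (output channel); shape (out_features, 1)
        max_abs = weight.abs().amax(dim=1, keepdim=True)
    else:
        # a single scale for the whole tensor
        max_abs = weight.abs().amax()
    return max_abs.clamp(min=1e-8) / qmax

w = torch.randn(16, 64)
print(symmetric_scales(w, per_channel=True).shape)   # torch.Size([16, 1])
print(symmetric_scales(w, per_channel=False).shape)  # torch.Size([])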
quantllm/quant/gptq.py

Lines changed: 9 additions & 3 deletions
@@ -68,8 +68,7 @@ def quantize(self, calibration_data: Optional[torch.Tensor] = None) -> PreTrainedModel:
         setattr(self.model, name, quantized)
 
         return self.model
-
-    def _compute_hessian(self, layer: nn.Linear, data: torch.Tensor) -> torch.Tensor:
+    def _compute_hessian(self, layer: nn.Linear, data: torch.Tensor) -> torch.Tensor:
         """Compute Hessian approximation for a layer."""
         device = next(layer.parameters()).device
 
@@ -79,6 +78,9 @@ def _compute_hessian(self, layer: nn.Linear, data: torch.Tensor) -> torch.Tensor:
 
         def hook_fn(module, input, output):
             x = input[0].detach()
+            # Reshape input if needed (batch_size * seq_len, hidden_size)
+            if len(x.shape) == 3:
+                x = x.view(-1, x.size(-1))
             with torch.no_grad():
                 # Accumulate x^T x for Hessian approximation
                 H.add_(torch.matmul(x.t(), x))
@@ -88,7 +90,11 @@ def hook_fn(module, input, output):
 
         # Run calibration data through model
        with torch.no_grad():
-            self.model(data)
+            # Process in smaller batches to save memory
+            batch_size = 4  # Adjust based on available memory
+            for i in range(0, len(data), batch_size):
+                batch = data[i:i+batch_size]
+                self.model(batch)
 
         # Remove hook
         handle.remove()

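The GPTQ change flattens 3D activations and feeds calibration data in small batches while a forward hook accumulates the Hessian approximation H ≈ Σ XᵀX. Below is a standalone sketch of that accumulation, under the assumption that the calibration tensor already matches the layer's input features; approx_hessian is an illustrative name, not quantllm's method.

# Illustrative sketch of the batched Hessian approximation for one Linear layer.
import torch
import torch.nn as nn

def approx_hessian(layer: nn.Linear, calib: torch.Tensor, batch_size: int = 4):
    """Accumulate H = sum_i X_i^T X_i over small calibration batches."""
    device = layer.weight.device
    H = torch.zeros(layer.in_features, layer.in_features, device=device)

    def hook(module, inputs, output):
        x = inputs[0].detach()
        if x.dim() == 3:                          # (batch, seq_len, hidden) -> (batch*seq_len, hidden)
            x = x.reshape(-1, x.size(-1))
        H.add_(x.t() @ x)                         # accumulate X^T X

    handle = layer.register_forward_hook(hook)
    with torch.no_grad():
        for i in range(0, len(calib), batch_size):    # small batches limit peak memory
            layer(calib[i:i + batch_size].to(device))
    handle.remove()
    return H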
quantllm/utils/benchmark.py

Lines changed: 205 additions & 0 deletions
@@ -0,0 +1,205 @@
+"""Benchmarking utilities for quantization methods."""
+
+import time
+import torch
+import pandas as pd
+from typing import Dict, List, Tuple
+from transformers import PreTrainedModel
+from quantllm.quant import (
+    GPTQQuantizer,
+    AWQQuantizer,
+    GGUFQuantizer
+)
+
+class QuantizationBenchmark:
+    """Benchmark different quantization methods."""
+
+    def __init__(
+        self,
+        model: PreTrainedModel,
+        calibration_data: torch.Tensor,
+        input_shape: Tuple[int, ...] = (1, 32),
+        num_inference_steps: int = 100,
+        device: str = "cuda" if torch.cuda.is_available() else "cpu"
+    ):
+        self.model = model
+        self.calibration_data = calibration_data
+        self.input_shape = input_shape
+        self.num_inference_steps = num_inference_steps
+        self.device = device
+        self.results = {}
+
+    def benchmark_quantizer(
+        self,
+        name: str,
+        quantizer_class,
+        quantizer_args: Dict
+    ) -> Dict[str, float]:
+        """Benchmark a specific quantizer."""
+        try:
+            # Initialize quantizer
+            quantizer = quantizer_class(model=self.model.clone(), **quantizer_args)
+
+            # Measure quantization time
+            start_time = time.time()
+            quantized_model = quantizer.quantize(calibration_data=self.calibration_data)
+            quant_time = time.time() - start_time
+
+            # Move to appropriate device
+            quantized_model = quantized_model.to(self.device)
+
+            # Generate test input
+            test_input = torch.randint(
+                0, 1000,
+                self.input_shape,
+                device=self.device
+            )
+
+            # Warmup
+            for _ in range(10):
+                with torch.no_grad():
+                    quantized_model(test_input)
+            torch.cuda.synchronize() if self.device == "cuda" else None
+
+            # Measure inference latency
+            latencies = []
+            for _ in range(self.num_inference_steps):
+                start = time.perf_counter()
+                with torch.no_grad():
+                    quantized_model(test_input)
+                torch.cuda.synchronize() if self.device == "cuda" else None
+                latencies.append((time.perf_counter() - start) * 1000)  # Convert to ms
+
+            latencies = torch.tensor(latencies)
+
+            # Calculate memory usage
+            if self.device == "cuda":
+                memory_allocated = torch.cuda.memory_allocated() / (1024 * 1024)  # MB
+                peak_memory = torch.cuda.max_memory_allocated() / (1024 * 1024)  # MB
+            else:
+                memory_allocated = 0
+                peak_memory = 0
+
+            # Calculate model size
+            model_size = sum(p.numel() * p.element_size() for p in quantized_model.parameters()) / (1024 * 1024)  # MB
+
+            results = {
+                "quantization_time": quant_time,
+                "mean_latency": latencies.mean().item(),
+                "p95_latency": torch.quantile(latencies, 0.95).item(),
+                "min_latency": latencies.min().item(),
+                "max_latency": latencies.max().item(),
+                "memory_allocated": memory_allocated,
+                "peak_memory": peak_memory,
+                "model_size": model_size
+            }
+
+            self.results[name] = results
+            return results
+
+        except Exception as e:
+            print(f"Error benchmarking {name}: {str(e)}")
+            return {}
+
+    def run_all_benchmarks(self) -> pd.DataFrame:
+        """Run benchmarks for all quantization methods."""
+        # Common config
+        config = {
+            "bits": 4,
+            "group_size": 128
+        }
+
+        # GPTQ
+        self.benchmark_quantizer(
+            "GPTQ",
+            GPTQQuantizer,
+            {**config, "actorder": True, "use_triton": False}
+        )
+
+        # AWQ
+        self.benchmark_quantizer(
+            "AWQ",
+            AWQQuantizer,
+            {**config, "zero_point": True}
+        )
+
+        # GGUF
+        self.benchmark_quantizer(
+            "GGUF",
+            GGUFQuantizer,
+            {**config, "use_packed": True}
+        )
+
+        # Convert results to DataFrame
+        df = pd.DataFrame.from_dict(self.results, orient='index')
+
+        # Add compression ratio
+        original_size = sum(p.numel() * p.element_size() for p in self.model.parameters()) / (1024 * 1024)
+        df['compression_ratio'] = original_size / df['model_size']
+
+        return df
+
+    def print_report(self):
+        """Print a formatted benchmark report."""
+        df = self.run_all_benchmarks()
+
+        print("\nQuantization Benchmark Results")
+        print("=" * 80)
+
+        # Format metrics
+        metrics = {
+            'quantization_time': ('Quantization Time (s)', '{:.2f}'),
+            'mean_latency': ('Mean Inference Latency (ms)', '{:.2f}'),
+            'p95_latency': ('P95 Inference Latency (ms)', '{:.2f}'),
+            'memory_allocated': ('Memory Used (MB)', '{:.1f}'),
+            'model_size': ('Model Size (MB)', '{:.1f}'),
+            'compression_ratio': ('Compression Ratio', '{:.1f}x')
+        }
+
+        for method in df.index:
+            print(f"\n{method}")
+            print("-" * 40)
+            for metric, (name, fmt) in metrics.items():
+                value = df.loc[method, metric]
+                print(f"{name:<30} {fmt.format(value)}")
+
+    def plot_comparison(self, save_path: str = None):
+        """Generate comparison plots."""
+        try:
+            import matplotlib.pyplot as plt
+        except ImportError:
+            print("matplotlib is required for plotting")
+            return
+
+        df = pd.DataFrame.from_dict(self.results, orient='index')
+
+        # Create subplots
+        fig, axes = plt.subplots(2, 2, figsize=(12, 10))
+        fig.suptitle('Quantization Method Comparison')
+
+        # Latency comparison
+        axes[0, 0].bar(df.index, df['mean_latency'])
+        axes[0, 0].set_title('Mean Inference Latency (ms)')
+        axes[0, 0].tick_params(axis='x', rotation=45)
+
+        # Memory usage
+        axes[0, 1].bar(df.index, df['memory_allocated'])
+        axes[0, 1].set_title('Memory Usage (MB)')
+        axes[0, 1].tick_params(axis='x', rotation=45)
+
+        # Model size
+        axes[1, 0].bar(df.index, df['model_size'])
+        axes[1, 0].set_title('Model Size (MB)')
+        axes[1, 0].tick_params(axis='x', rotation=45)
+
+        # Quantization time
+        axes[1, 1].bar(df.index, df['quantization_time'])
+        axes[1, 1].set_title('Quantization Time (s)')
+        axes[1, 1].tick_params(axis='x', rotation=45)
+
+        plt.tight_layout()
+
+        if save_path:
+            plt.savefig(save_path)
+        else:
+            plt.show()

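A possible way to drive the new benchmarking utility is sketched below, assuming a small causal LM and dummy calibration token ids; the checkpoint name and calibration tensor are placeholders, only the QuantizationBenchmark API comes from this commit.

# Hypothetical usage of quantllm.utils.benchmark.QuantizationBenchmark.
import torch
from transformers import AutoModelForCausalLM
from quantllm.utils.benchmark import QuantizationBenchmark

model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")  # assumed small test model
calibration_data = torch.randint(0, 1000, (32, 32))                # dummy token ids

bench = QuantizationBenchmark(
    model=model,
    calibration_data=calibration_data,
    input_shape=(1, 32),
    num_inference_steps=50,
)
results = bench.run_all_benchmarks()   # pandas DataFrame, one row per method
bench.print_report()
bench.plot_comparison(save_path="quant_benchmark.png")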