
Commit f630e89

Add the quantization methods.
1 parent 0d2b20a commit f630e89

5 files changed: +259 additions, -174 deletions

quantllm/quant/awq.py

Lines changed: 24 additions & 50 deletions
@@ -1,15 +1,14 @@
-"""AWQ (Activation-Aware Weight Quantization) implementation with memory-efficient processing."""
+"""AWQ (Activation-Aware Weight Quantization) implementation."""
 
 import gc
 import torch
 import torch.nn as nn
 import numpy as np
 from typing import Optional, Dict, Any, List, Union, Tuple
 from transformers import PreTrainedModel
-from .quantization_engine import QuantizationConfig, QuantizedLinear
+from .quantization_engine import BaseQuantizer, QuantizationConfig, QuantizedLinear
 
-class AWQQuantizer:
-    """AWQ quantization implementation with memory-efficient processing."""
+class AWQQuantizer(BaseQuantizer):
     """AWQ quantization implementation with memory-efficient processing."""
 
     def __init__(
@@ -21,24 +20,21 @@ def __init__(
         scale_dtype: str = "fp32",
         version: str = "v2",
         enable_mnn_kernel: bool = False,
-        batch_size: int = 2,  # Small batch size for memory efficiency
-        cpu_offload: bool = True  # Enable CPU offloading
+        batch_size: int = 2,
+        device: Optional[Union[str, torch.device]] = None
     ):
-        self.model = model
-        self.bits = bits
+        super().__init__(model=model, bits=bits, device=device)
         self.group_size = group_size
         self.zero_point = zero_point
         self.scale_dtype = scale_dtype
         self.version = version
         self.enable_mnn_kernel = enable_mnn_kernel
+        self.batch_size = batch_size
 
         # Initialize activation statistics dictionaries
         self.act_scales = {}
         self.weight_scales = {}
 
-        self.batch_size = batch_size
-        self.cpu_offload = cpu_offload
-
     def _clear_memory(self):
         """Clear GPU memory and run garbage collection."""
         if torch.cuda.is_available():
@@ -54,8 +50,8 @@ def quantize(
         if calibration_data is None:
             raise ValueError("AWQ requires calibration data for quantization")
 
-        # Keep model on CPU initially
-        self.model.cpu()
+        # Prepare calibration data
+        calibration_data = self.prepare_calibration_data(calibration_data)
         self.model.eval()
 
         # Process calibration data in batches
@@ -68,21 +64,9 @@
             end_idx = min(step + self.batch_size, total_steps)
             batch = calibration_data[step:end_idx]
 
-            # Move batch to appropriate device
-            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-            batch = batch.to(device)
-
-            # Temporarily move model to device
-            if device.type == "cuda":
-                self.model = self.model.cuda()
-
             # Collect statistics for this batch
             self._collect_activation_stats(batch)
 
-            # Move model back to CPU if offloading enabled
-            if self.cpu_offload and device.type == "cuda":
-                self.model = self.model.cpu()
-
             # Clean up batch
             del batch
             self._clear_memory()
@@ -93,35 +77,25 @@
         # Quantize the model layer by layer
         for name, module in self.model.named_modules():
             if isinstance(module, nn.Linear):
-                # Move layer to device temporarily for quantization
-                if device.type == "cuda":
-                    module = module.cuda()
-
+                self.logger.info(f"Processing layer: {name}")
+
                 # Get activation scale for this layer
                 act_scale = self.act_scales.get(name)
-                if act_scale is not None:
-                    # Quantize layer
-                    quantized = self._quantize_layer(module, act_scale)
-
-                    # Move quantized layer back to CPU if offloading
-                    if self.cpu_offload:
-                        quantized = quantized.cpu()
-
-                    # Replace layer in model
-                    parent_name = '.'.join(name.split('.')[:-1])
-                    child_name = name.split('.')[-1]
-
-                    if parent_name:
-                        parent = self.model.get_submodule(parent_name)
-                        setattr(parent, child_name, quantized)
-                    else:
-                        setattr(self.model, name, quantized)
-
-                    # Clean up
-                    self._clear_memory()
+                quantized = self._quantize_layer(module, act_scale)
 
+                # Replace layer in model
+                parent_name = '.'.join(name.split('.')[:-1])
+                child_name = name.split('.')[-1]
+                if parent_name:
+                    parent = self.model.get_submodule(parent_name)
+                    setattr(parent, child_name, quantized)
+                else:
+                    setattr(self.model, name, quantized)
+
+                self._clear_memory()
+
         return self.model
-    def _collect_activation_stats(
+    def _collect_activation_stats(
         self,
         data: torch.Tensor,
         num_steps: int
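For reference, a minimal usage sketch of the refactored AWQQuantizer. This is not part of the commit: the model name, bit-width, and calibration tensor shape are illustrative assumptions, and device placement is assumed to be handled by BaseQuantizer as the diff above suggests.

import torch
from transformers import AutoModelForCausalLM
from quantllm.quant.awq import AWQQuantizer

# Hypothetical example: any causal LM plus a small token-ID tensor as calibration data.
model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")  # illustrative model
calibration_data = torch.randint(0, model.config.vocab_size, (8, 128))  # illustrative shape

quantizer = AWQQuantizer(
    model=model,
    bits=4,          # target weight bit-width (assumed value)
    batch_size=2,    # calibration batch size, default shown in the diff
    device="cuda" if torch.cuda.is_available() else "cpu",
)
# AWQ raises a ValueError without calibration data (see the quantize() hunk above).
quantized_model = quantizer.quantize(calibration_data=calibration_data)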

quantllm/quant/gguf.py

Lines changed: 13 additions & 36 deletions
@@ -6,16 +6,16 @@
 import torch.nn as nn
 from typing import Optional, Dict, Any, List, Union, Tuple
 from transformers import PreTrainedModel
-from .quantization_engine import QuantizationConfig, QuantizedLinear
+from .quantization_engine import BaseQuantizer, QuantizationConfig, QuantizedLinear
 
 try:
     import ctransformers
     CT_AVAILABLE = True
 except ImportError:
     CT_AVAILABLE = False
 
-class GGUFQuantizer:
-    """GGUF quantization implementation with CTransformers integration and memory-efficient processing."""
+class GGUFQuantizer(BaseQuantizer):
+    """GGUF quantization implementation with CTransformers integration."""
 
     def __init__(
         self,
@@ -27,41 +27,27 @@ def __init__(
         use_packed: bool = True,
         legacy_format: bool = False,
         batch_size: int = 4,
-        cpu_offload: bool = False
+        device: Optional[Union[str, torch.device]] = None
     ):
         if not CT_AVAILABLE:
             raise ImportError("CTransformers is required for GGUF quantization. Install with: pip install ctransformers")
-
-        self.model = model
-        self.bits = bits
+
+        super().__init__(model=model, bits=bits, device=device)
         self.group_size = group_size
         self.desc_act = desc_act
         self.desc_ten = desc_ten
         self.use_packed = use_packed
         self.legacy_format = legacy_format
         self.batch_size = batch_size
-        self.cpu_offload = cpu_offload
 
-    def _clear_memory(self):
-        """Clear CUDA memory and run garbage collection."""
-        gc.collect()
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-
     def quantize(
         self,
         calibration_data: Optional[torch.Tensor] = None
     ) -> PreTrainedModel:
-        """
-        Quantize model using GGUF format with memory-efficient processing.
-
-        Args:
-            calibration_data: Optional tensor for computing quantization statistics
-
-        Returns:
-            Quantized model
-        """
-        # Prepare model for quantization
+        """Quantize model using GGUF format with memory-efficient processing."""
+        # Prepare model and calibration data
+        if calibration_data is not None:
+            calibration_data = self.prepare_calibration_data(calibration_data)
         self.model.eval()
 
         # Collect statistics if provided
@@ -72,19 +58,11 @@ def quantize(
         # Convert linear layers to quantized versions
         for name, module in self.model.named_modules():
             if isinstance(module, nn.Linear):
-                print(f"Processing layer: {name}")
+                self.logger.info(f"Processing layer: {name}")
 
                 # Create quantized layer
                 layer_stats = stats.get(name, None)
-
-                # Move stats to appropriate device
-                if layer_stats is not None and self.cpu_offload:
-                    layer_stats = {k: v.to('cpu') for k, v in layer_stats.items()}
-
-                quantized = self._quantize_layer(
-                    module,
-                    {k: v.to(module.weight.device) for k, v in layer_stats.items()} if self.cpu_offload and layer_stats else layer_stats
-                )
+                quantized = self._quantize_layer(module, layer_stats)
 
                 # Replace layer in model
                 parent_name = '.'.join(name.split('.')[:-1])
@@ -95,7 +73,6 @@ def quantize(
                 else:
                     setattr(self.model, name, quantized)
 
-                # Clear memory after processing each layer
                 self._clear_memory()
 
         return self.model
@@ -325,7 +302,7 @@ def _quantize_layer(
             quantized.input_std.copy_(stats["std"].to(target_device))
 
         return quantized
-    def convert_to_gguf(self, output_path: str):
+    def convert_to_gguf(self, output_path: str):
         """Convert quantized model to GGUF format using CTransformers."""
         if not CT_AVAILABLE:
             raise ImportError("CTransformers is required for GGUF conversion")
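For reference, a minimal usage sketch of the refactored GGUFQuantizer, assuming the ctransformers package is installed (the class raises ImportError otherwise, as shown above). The model name, calibration tensor, bit-width, and output path are illustrative assumptions, not part of this commit.

import torch
from transformers import AutoModelForCausalLM
from quantllm.quant.gguf import GGUFQuantizer

# Hypothetical example: calibration data is optional for GGUF quantization.
model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")  # illustrative model
calibration_data = torch.randint(0, model.config.vocab_size, (8, 128))  # illustrative shape

quantizer = GGUFQuantizer(
    model=model,
    bits=4,          # target weight bit-width (assumed value)
    batch_size=4,    # default shown in the diff
    device="cpu",
)
quantized_model = quantizer.quantize(calibration_data=calibration_data)
quantizer.convert_to_gguf("opt-125m-q4.gguf")  # output path is illustrative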

quantllm/quant/gptq.py

Lines changed: 6 additions & 20 deletions
@@ -6,9 +6,9 @@
 import torch.nn as nn
 from typing import Optional, Dict, Any, List, Union
 from transformers import PreTrainedModel
-from .quantization_engine import QuantizationConfig, QuantizedLinear, DeviceManager
+from .quantization_engine import BaseQuantizer, QuantizationConfig, QuantizedLinear
 
-class GPTQQuantizer:
+class GPTQQuantizer(BaseQuantizer):
     """GPTQ quantization implementation with memory-efficient processing."""
 
     def __init__(
@@ -24,8 +24,7 @@ def __init__(
         batch_size: int = 4,
         device: Optional[Union[str, torch.device]] = None
     ):
-        self.model = model
-        self.bits = bits
+        super().__init__(model=model, bits=bits, device=device)
         self.group_size = group_size
         self.actorder = actorder
         self.allow_mixed_bits = allow_mixed_bits
@@ -34,11 +33,6 @@ def __init__(
         self.sym = sym
         self.batch_size = batch_size
 
-        # Initialize device manager
-        self.device_manager = DeviceManager(
-            torch.device(device) if device else None
-        )
-
         # Initialize H matrices for each layer
         self.H = {}
 
@@ -62,20 +56,14 @@ def quantize(self, calibration_data: Optional[torch.Tensor] = None) -> PreTraine
         if calibration_data is None:
             raise ValueError("GPTQ requires calibration data for quantization")
 
-        device = self.device_manager.primary_device
-        self.model.to(device)
+        # Prepare model and data
+        calibration_data = self.prepare_for_quantization(calibration_data)
         self.model.eval()
 
-        # Convert calibration data to correct device
-        calibration_data = calibration_data.to(device)
-
         # Process layers
         for name, module in self.model.named_modules():
             if isinstance(module, nn.Linear):
-                print(f"Processing layer: {name}")
-
-                # Ensure layer is on correct device
-                module.to(device)
+                self.logger.info(f"Processing layer: {name}")
 
                 # Compute Hessian approximation
                 self.H[name] = self._compute_hessian(module, calibration_data)
@@ -92,8 +80,6 @@ def quantize(self, calibration_data: Optional[torch.Tensor] = None) -> PreTraine
 
                 # Clear memory after processing each layer
                 self._clear_memory()
-
-                # Remove processed Hessian
                 del self.H[name]
 
         return self.model
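For reference, a minimal usage sketch of the refactored GPTQQuantizer. This is not part of the commit: the model name, bit-width, and calibration tensor shape are illustrative assumptions; the constructor and quantize() signatures follow the diff above.

import torch
from transformers import AutoModelForCausalLM
from quantllm.quant.gptq import GPTQQuantizer

# Hypothetical example: GPTQ requires calibration data (see the ValueError above).
model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")  # illustrative model
calibration_data = torch.randint(0, model.config.vocab_size, (8, 128))  # illustrative shape

quantizer = GPTQQuantizer(
    model=model,
    bits=4,          # target weight bit-width (assumed value)
    batch_size=4,    # default shown in the diff
    device="cuda" if torch.cuda.is_available() else "cpu",
)
quantized_model = quantizer.quantize(calibration_data=calibration_data)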
