
Commit 437cb87

Merge pull request #4 from codewithdark-git/quant-optim-docs-tests
Feat: Introduce QuantizerFactory API and Refactor Quantization Workflow
2 parents 31938b5 + 082196c commit 437cb87

File tree

11 files changed (+702, -656 lines)


docs/api_reference/quantization.rst

Lines changed: 201 additions & 97 deletions
Large diffs are not rendered by default.

quantllm/api.py

Lines changed: 124 additions & 0 deletions
@@ -0,0 +1,124 @@
from typing import Optional, Dict, Any, Tuple
from transformers import PreTrainedModel
from .quant.awq import AWQQuantizer
from .quant.gptq import GPTQQuantizer
from .quant.gguf import GGUFQuantizer
from .trainer.logger import TrainingLogger

class QuantizerFactory:
    @staticmethod
    def quantize_from_pretrained(
        model_name_or_path: str,
        method: str,
        quant_config_dict: Optional[Dict[str, Any]] = None,
        calibration_data: Optional[Any] = None,  # Typically torch.Tensor or similar
        calibration_steps: Optional[int] = 100,  # Specific to AWQ's quantize method
        device: Optional[str] = None  # Explicit device control
    ) -> Tuple[PreTrainedModel, Any]:  # Returns (quantized_model, tokenizer)
        """
        Loads a model from Hugging Face, quantizes it using the specified method,
        and returns the quantized model and its tokenizer.

        Args:
            model_name_or_path (str): Hugging Face model ID or local path.
            method (str): Quantization method to use ('awq', 'gptq', 'gguf').
            quant_config_dict (Optional[Dict[str, Any]]): Dictionary with quantization parameters.
                Common keys: 'bits', 'group_size', 'batch_size' (for quantizer init).
                AWQ specific: 'zero_point', 'awq_version' (maps to 'version' in AWQQuantizer).
                GPTQ specific: 'actorder', 'percdamp', 'sym'.
                GGUF specific: 'use_packed', 'cpu_offload', 'desc_act', 'desc_ten', 'legacy_format'.
            calibration_data (Optional[Any]): Calibration data required for quantization.
            calibration_steps (Optional[int]): Number of calibration steps, primarily for AWQ's
                quantize() method. Defaults to 100.
            device (Optional[str]): Device to run quantization on ('cpu', 'cuda', 'cuda:x').
                If None, the default device selection logic in BaseQuantizer is used.

        Returns:
            Tuple[PreTrainedModel, Any]: The quantized model and its associated tokenizer.

        Raises:
            ValueError: If an unsupported quantization method is specified or essential parameters are missing.
            RuntimeError: If quantization fails.
        """
        logger = TrainingLogger()
        if quant_config_dict is None:
            quant_config_dict = {}

        method_lower = method.lower()
        logger.log_info(f"Attempting to quantize model '{model_name_or_path}' using method: {method_lower}")

        bits = quant_config_dict.get('bits', 4)
        group_size = quant_config_dict.get('group_size', 128)
        quantizer_batch_size = quant_config_dict.get('batch_size', 4)

        quantizer = None

        if method_lower == 'awq':
            awq_zero_point = quant_config_dict.get('zero_point', True)
            awq_version = quant_config_dict.get('awq_version', 'v2')

            quantizer = AWQQuantizer(
                model_or_model_name_or_path=model_name_or_path,
                bits=bits,
                group_size=group_size,
                zero_point=awq_zero_point,
                version=awq_version,
                batch_size=quantizer_batch_size,
                device=device
            )
            logger.log_info(f"Quantizing with AWQ... Bits: {bits}, Group Size: {group_size}, Zero Point: {awq_zero_point}, Version: {awq_version}")
            quantizer.quantize(  # Call quantize; the model is updated in place
                calibration_data=calibration_data,
                calibration_steps=calibration_steps
            )

        elif method_lower == 'gptq':
            gptq_actorder = quant_config_dict.get('actorder', True)
            gptq_percdamp = quant_config_dict.get('percdamp', 0.01)
            gptq_sym = quant_config_dict.get('sym', True)

            quantizer = GPTQQuantizer(
                model_or_model_name_or_path=model_name_or_path,
                bits=bits,
                group_size=group_size,
                actorder=gptq_actorder,
                percdamp=gptq_percdamp,
                sym=gptq_sym,
                batch_size=quantizer_batch_size,
                device=device
            )
            logger.log_info(f"Quantizing with GPTQ... Bits: {bits}, Group Size: {group_size}, ActOrder: {gptq_actorder}, Sym: {gptq_sym}")
            quantizer.quantize(calibration_data=calibration_data)  # Model updated in place

        elif method_lower == 'gguf':
            gguf_use_packed = quant_config_dict.get('use_packed', True)
            gguf_cpu_offload = quant_config_dict.get('cpu_offload', False)
            gguf_desc_act = quant_config_dict.get('desc_act', False)
            gguf_desc_ten = quant_config_dict.get('desc_ten', False)
            gguf_legacy_format = quant_config_dict.get('legacy_format', False)

            quantizer = GGUFQuantizer(
                model_or_model_name_or_path=model_name_or_path,
                bits=bits,
                group_size=group_size,
                use_packed=gguf_use_packed,
                cpu_offload=gguf_cpu_offload,
                desc_act=gguf_desc_act,
                desc_ten=gguf_desc_ten,
                legacy_format=gguf_legacy_format,
                batch_size=quantizer_batch_size,
                device=device
            )
            logger.log_info(f"Quantizing with GGUF... Bits: {bits}, Group Size: {group_size}, Packed: {gguf_use_packed}, CPU Offload: {gguf_cpu_offload}")
            quantizer.quantize(calibration_data=calibration_data)  # Model updated in place

        else:
            logger.log_error(f"Unsupported quantization method: {method}")
            raise ValueError(f"Unsupported quantization method: {method}. Supported methods are 'awq', 'gptq', 'gguf'.")

        if quantizer is None or quantizer.model is None:
            logger.log_error(f"Failed to initialize quantizer or obtain quantized model for method: {method}")
            raise RuntimeError(f"Quantization failed for method: {method}. Quantizer or model is None.")

        logger.log_info(f"Successfully quantized model with method: {method_lower}")
        return quantizer.model, quantizer.tokenizer
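
Usage sketch (not part of this commit): a minimal example of calling the new factory API end to end. The model ID, calibration tensor shape, and device are illustrative assumptions, not values defined by the PR.

from quantllm.api import QuantizerFactory
import torch

# Hypothetical calibration batch of token IDs, shape (num_samples, seq_len);
# the real shape and vocabulary size depend on the model being quantized.
calibration_data = torch.randint(0, 32000, (32, 512))

model, tokenizer = QuantizerFactory.quantize_from_pretrained(
    model_name_or_path="facebook/opt-125m",  # illustrative model ID
    method="awq",
    quant_config_dict={
        "bits": 4,
        "group_size": 128,
        "zero_point": True,
        "awq_version": "v2",  # maps to AWQQuantizer's 'version' argument
        "batch_size": 4,
    },
    calibration_data=calibration_data,
    calibration_steps=100,
    device="cuda",
)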

quantllm/quant/awq.py

Lines changed: 18 additions & 4 deletions
@@ -6,14 +6,14 @@
 import numpy as np
 from typing import Optional, Dict, Any, List, Union, Tuple
 from transformers import PreTrainedModel
-from .quantization_engine import BaseQuantizer, QuantizationConfig, QuantizedLinear
+from .quantization_engine import move_to_device, BaseQuantizer, QuantizationConfig, QuantizedLinear

 class AWQQuantizer(BaseQuantizer):
     """AWQ quantization implementation with memory-efficient processing."""

     def __init__(
         self,
-        model: PreTrainedModel,
+        model_or_model_name_or_path: Union[str, PreTrainedModel],  # Changed parameter name
         bits: int = 4,
         group_size: int = 128,
         zero_point: bool = True,
@@ -27,7 +27,8 @@ def __init__(
         Initializes the AWQQuantizer.

         Args:
-            model (PreTrainedModel): The model to be quantized.
+            model_or_model_name_or_path (Union[str, PreTrainedModel]):
+                The Hugging Face model name/path or a PreTrainedModel instance to be quantized.
             bits (int, optional): Number of bits for quantization. Defaults to 4.
             group_size (int, optional): Size of the quantization group. Defaults to 128.
             zero_point (bool, optional): Whether to use zero-point quantization for activations. Defaults to True.
@@ -39,7 +40,9 @@ def __init__(
                 The device for quantization operations ('cpu', 'cuda', etc.).
                 Inherited from BaseQuantizer. Defaults to None (auto-detection).
         """
-        super().__init__(model=model, bits=bits, device=device)
+        # Pass all relevant kwargs to BaseQuantizer;
+        # AWQQuantizer-specific args are handled here.
+        super().__init__(model_or_model_name_or_path=model_or_model_name_or_path, bits=bits, device=device)
         self.group_size = group_size
         self.zero_point = zero_point
         self.scale_dtype = scale_dtype
@@ -101,7 +104,18 @@ def quantize(

         self._clear_memory()

+        # Update model config with quantization parameters
+        awq_specific_params = {
+            "zero_point": self.zero_point,
+            "version": self.version,
+            "scale_dtype": self.scale_dtype,  # Added from __init__
+            "enable_mnn_kernel": self.enable_mnn_kernel  # Added from __init__
+            # batch_size is more of a process param, not a model config param usually
+        }
+        self._update_model_config_with_quant_params("awq", awq_specific_params)
+
         return self.model
+
     def _collect_activation_stats(
         self,
         data: torch.Tensor  # Removed num_steps parameter
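
The signature change above means AWQQuantizer (and likewise GPTQQuantizer and GGUFQuantizer below) can now be constructed either from a Hugging Face model ID/path or from an already-loaded PreTrainedModel. A brief sketch, with a placeholder model ID and placeholder calibration data:

from quantllm.quant.awq import AWQQuantizer
import torch

# Construct the quantizer directly from a model ID; an in-memory
# PreTrainedModel instance is equally valid per the Union type above.
quantizer = AWQQuantizer(
    model_or_model_name_or_path="facebook/opt-125m",  # placeholder model ID
    bits=4,
    group_size=128,
    zero_point=True,
)
quantized_model = quantizer.quantize(
    calibration_data=torch.randint(0, 32000, (8, 256)),  # placeholder calibration batch
    calibration_steps=100,
)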

quantllm/quant/gguf.py

Lines changed: 17 additions & 5 deletions
@@ -6,7 +6,7 @@
 import torch.nn as nn
 from typing import Optional, Dict, Any, List, Union, Tuple
 from transformers import PreTrainedModel
-from .quantization_engine import BaseQuantizer, QuantizationConfig, QuantizedLinear
+from .quantization_engine import move_to_device, BaseQuantizer, QuantizationConfig, QuantizedLinear

 try:
     import ctransformers
@@ -19,7 +19,7 @@ class GGUFQuantizer(BaseQuantizer):

     def __init__(
         self,
-        model: PreTrainedModel,
+        model_or_model_name_or_path: Union[str, PreTrainedModel],  # Changed parameter name
         bits: int = 4,
         group_size: int = 32,
         desc_act: bool = False,
@@ -34,7 +34,8 @@ def __init__(
         Initializes the GGUFQuantizer.

         Args:
-            model (PreTrainedModel): The model to be quantized.
+            model_or_model_name_or_path (Union[str, PreTrainedModel]):
+                The Hugging Face model name/path or a PreTrainedModel instance to be quantized.
             bits (int, optional): Number of bits for quantization. Defaults to 4.
             group_size (int, optional): Size of the quantization group. Defaults to 32.
             desc_act (bool, optional): Whether to describe activations in GGUF metadata. Defaults to False.
@@ -52,7 +53,7 @@ def __init__(
         if not CT_AVAILABLE:
             raise ImportError("CTransformers is required for GGUF quantization. Install with: pip install ctransformers")

-        super().__init__(model=model, bits=bits, device=device)
+        super().__init__(model_or_model_name_or_path=model_or_model_name_or_path, bits=bits, device=device)
         self.group_size = group_size
         self.desc_act = desc_act
         self.desc_ten = desc_ten
@@ -94,9 +95,20 @@ def quantize(
                 setattr(self.model, name, quantized)

         self._clear_memory()
+
+        # Update model config with quantization parameters
+        gguf_specific_params = {
+            "use_packed": self.use_packed,
+            "cpu_offload": self.cpu_offload,
+            "desc_act": self.desc_act,
+            "desc_ten": self.desc_ten,
+            "legacy_format": self.legacy_format
+            # group_size is handled by BaseQuantizer if present as self.group_size
+        }
+        self._update_model_config_with_quant_params("gguf", gguf_specific_params)

         return self.model
-
+
     def _collect_stats(self, data: torch.Tensor) -> Dict[str, Dict[str, torch.Tensor]]:
         """Collect statistics for quantization with memory-efficient batch processing."""
         stats = {}
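
For reference, these are the GGUF-specific keys the factory reads from quant_config_dict (per the docstring in quantllm/api.py); the values shown are only illustrative defaults, not prescribed by this commit:

# Illustrative GGUF settings for QuantizerFactory.quantize_from_pretrained(method="gguf", ...)
gguf_config = {
    "bits": 4,
    "group_size": 32,       # GGUFQuantizer's own default is 32, unlike AWQ/GPTQ's 128
    "use_packed": True,     # pack quantized weights
    "cpu_offload": False,   # keep tensors on the quantization device
    "desc_act": False,      # describe activations in GGUF metadata
    "desc_ten": False,
    "legacy_format": False,
    "batch_size": 4,
}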

quantllm/quant/gptq.py

Lines changed: 16 additions & 5 deletions
@@ -6,14 +6,14 @@
 import torch.nn as nn
 from typing import Optional, Dict, Any, List, Union
 from transformers import PreTrainedModel
-from .quantization_engine import BaseQuantizer, QuantizationConfig, QuantizedLinear
+from .quantization_engine import move_to_device, BaseQuantizer, QuantizationConfig, QuantizedLinear

 class GPTQQuantizer(BaseQuantizer):
     """GPTQ quantization implementation with memory-efficient processing."""

     def __init__(
         self,
-        model: PreTrainedModel,
+        model_or_model_name_or_path: Union[str, PreTrainedModel],  # Changed parameter name
         bits: int = 4,
         group_size: int = 128,
         actorder: bool = False,
@@ -28,7 +28,8 @@ def __init__(
         Initializes the GPTQQuantizer.

         Args:
-            model (PreTrainedModel): The model to be quantized.
+            model_or_model_name_or_path (Union[str, PreTrainedModel]):
+                The Hugging Face model name/path or a PreTrainedModel instance to be quantized.
             bits (int, optional): Number of bits for quantization. Defaults to 4.
             group_size (int, optional): Size of the quantization group. Defaults to 128.
             actorder (bool, optional): Whether to use activation order for columns. Defaults to False.
@@ -43,7 +44,7 @@ def __init__(
                 The device for quantization operations ('cpu', 'cuda', etc.).
                 Inherited from BaseQuantizer. Defaults to None (auto-detection).
         """
-        super().__init__(model=model, bits=bits, device=device)
+        super().__init__(model_or_model_name_or_path=model_or_model_name_or_path, bits=bits, device=device)
         self.group_size = group_size
         self.actorder = actorder
         self.allow_mixed_bits = allow_mixed_bits
@@ -101,8 +102,18 @@ def quantize(self, calibration_data: Optional[torch.Tensor] = None) -> PreTrainedModel:
             self._clear_memory()
             del self.H[name]

+        # Update model config with quantization parameters
+        gptq_specific_params = {
+            "actorder": self.actorder,
+            "sym": self.sym,
+            "percdamp": self.percdamp,
+            "allow_mixed_bits": self.allow_mixed_bits  # Added from __init__
+            # use_triton is more of a runtime/environment flag, might not be essential in model config
+        }
+        self._update_model_config_with_quant_params("gptq", gptq_specific_params)
+
         return self.model
-
+
     def _compute_hessian(self, layer: nn.Linear, data: torch.Tensor) -> torch.Tensor:
         """Compute Hessian approximation for a layer with memory-efficient processing."""
         n = layer.in_features
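
Similarly, a sketch of the GPTQ-specific keys the factory accepts; the values mirror the defaults used in quantllm/api.py and are illustrative only:

# Illustrative GPTQ settings for QuantizerFactory.quantize_from_pretrained(method="gptq", ...)
gptq_config = {
    "bits": 4,
    "group_size": 128,
    "actorder": True,   # quantize columns in activation order
    "percdamp": 0.01,   # dampening added to the Hessian diagonal
    "sym": True,        # symmetric quantization
    "batch_size": 4,
}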
