1+ """Low-level API for QuantLLM - provides detailed control over model loading and quantization."""
2+
3+ import torch
4+ from typing import Optional , Dict , Any , Tuple , Union , List
5+ from transformers import PreTrainedModel , PreTrainedTokenizer , BitsAndBytesConfig
6+ from ..model .model import Model
7+ from ..config .model_config import ModelConfig
8+ from ..quant .quantization_engine import QuantizationEngine
9+ from ..quant .kernels import TritonKernelManager
10+
class LowLevelQuantLLM:
    """Low-level interface providing fine-grained control over model loading and quantization.

    def __init__(self):
        self.quant_engine = QuantizationEngine()
        self.kernel_manager = TritonKernelManager()

    def load_model_advanced(
        self,
        model_name: str,
        *,
        quant_config: Optional[BitsAndBytesConfig] = None,
        device_map: Union[str, Dict[str, str]] = "auto",
        max_memory: Optional[Dict[str, str]] = None,
        use_triton_kernels: bool = False,
        optimize_layers: Optional[List[str]] = None,
        **kwargs
    ) -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
        """
        Load a model with detailed quantization and optimization controls.

        Args:
            model_name: Model name or path
            quant_config: Optional custom BitsAndBytes quantization config
            device_map: Device mapping strategy (e.g. "auto")
            max_memory: Maximum memory per device
            use_triton_kernels: Whether to use optimized Triton kernels
            optimize_layers: List of layer names to optimize with Triton
            **kwargs: Additional arguments for model loading

        Returns:
            Tuple of (model, tokenizer)
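
        Example:
            A minimal sketch (assumes "api = LowLevelQuantLLM()"; the
            checkpoint and layer names are placeholders, and the
            "max_memory" keys follow this signature's Dict[str, str] hint):

                model, tokenizer = api.load_model_advanced(
                    "my-org/my-model",
                    device_map="auto",
                    max_memory={"0": "20GiB", "cpu": "48GiB"},
                    use_triton_kernels=True,
                    optimize_layers=["q_proj", "v_proj"],  # placeholder names
                )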
43+ """
44+ config = ModelConfig (
45+ model_name = model_name ,
46+ device_map = device_map ,
47+ max_memory = max_memory ,
48+ kwargs = kwargs
49+ )
50+
51+ if quant_config :
52+ config .quantization_config = quant_config .to_dict ()
53+
54+ model_loader = Model (config )
55+ model , tokenizer = model_loader .get_model (), model_loader .get_tokenizer ()
56+
57+ if use_triton_kernels :
58+ model = self .kernel_manager .optimize_model (
59+ model ,
60+ target_modules = optimize_layers
61+ )
62+
63+ return model , tokenizer
64+
65+ def quantize_model_weights (
66+ self ,
67+ model : PreTrainedModel ,
68+ bits : int = 4 ,
69+ group_size : int = 128 ,
70+ compute_dtype : torch .dtype = torch .bfloat16 ,
71+ use_double_quant : bool = True
72+ ) -> PreTrainedModel :
73+ """
74+ Apply quantization to an existing model's weights.
75+
76+ Args:
77+ model: Model to quantize
78+ bits: Number of bits for quantization
79+ group_size: Size of quantization groups
80+ compute_dtype: Compute dtype for operations
81+ use_double_quant: Whether to use double quantization
82+
83+ Returns:
84+ Quantized model
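
        Example:
            A minimal sketch (assumes "api = LowLevelQuantLLM()" and an
            already-loaded model); the values shown are this signature's
            defaults:

                model = api.quantize_model_weights(
                    model,
                    bits=4,                        # bits per weight
                    group_size=128,                # quantization group size
                    compute_dtype=torch.bfloat16,  # dtype used for compute
                    use_double_quant=True,         # also quantize the scales
                )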
85+ """
86+ return self .quant_engine .quantize_weights (
87+ model ,
88+ bits = bits ,
89+ group_size = group_size ,
90+ compute_dtype = compute_dtype ,
91+ use_double_quant = use_double_quant
92+ )
93+
94+ def replace_layer_with_triton (
95+ self ,
96+ model : PreTrainedModel ,
97+ layer_name : str ,
98+ kernel_type : str = "auto"
99+ ) -> PreTrainedModel :
100+ """
101+ Replace a specific layer with its optimized Triton version.
102+
103+ Args:
104+ model: Model to modify
105+ layer_name: Name of layer to replace
106+ kernel_type: Type of Triton kernel to use
107+
108+ Returns:
109+ Model with replaced layer
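
        Example:
            A minimal sketch (assumes "api = LowLevelQuantLLM()"; the layer
            name is a placeholder, and the valid kernel_type values depend
            on TritonKernelManager):

                model = api.replace_layer_with_triton(
                    model,
                    layer_name="model.layers.0.mlp",  # placeholder path
                    kernel_type="auto",               # let the manager pick
                )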
110+ """
111+ return self .kernel_manager .replace_layer (
112+ model ,
113+ layer_name = layer_name ,
114+ kernel_type = kernel_type
115+ )
116+
117+ def get_memory_stats (self , model : PreTrainedModel ) -> Dict [str , Any ]:
118+ """Get detailed memory statistics for model."""
119+ return self .quant_engine .get_memory_stats (model )