
Commit 52e1f5e
Fix the mirror issue and update the docs.
1 parent 437cb87

14 files changed (+436 −249 lines)

docs/api_reference/quantization.rst

Lines changed: 98 additions & 7 deletions
@@ -1,16 +1,109 @@
-Easy Quantization with `QuantizerFactory`
-=========================================
+QuantLLM: Advanced Model Quantization
+=====================================
 
-The recommended way to quantize models with QuantLLM is by using the `QuantizerFactory.quantize_from_pretrained` static method. This high-level API simplifies the process of loading a model from Hugging Face, quantizing it using a specified method, and receiving the quantized model along with its tokenizer.
+💫 Introduction
+---------------
+QuantLLM is a powerful library that provides state-of-the-art quantization methods for compressing large language models while maintaining their performance. It supports multiple quantization methods (AWQ, GPTQ, GGUF) and enables efficient model deployment in production environments.
+
+🚀 Getting Started
+------------------
+QuantLLM offers multiple quantization methods, each optimized for different use cases. The high-level `QuantLLM` API provides a simple interface for quantizing models, while the low-level API gives you fine-grained control over the quantization process.
+
+Key Features:
+
+- Multiple quantization methods (AWQ, GPTQ, GGUF)
+- Memory-efficient processing
+- Hardware-specific optimizations
+- Comprehensive metrics and logging
+- Easy model export and deployment
+
+Complete Example
+----------------
 
 .. code-block:: python
 
     import torch
-    from quantllm import QuantizerFactory  # Assuming __init__.py is updated
+    from quantllm import QuantLLM
+    from transformers import AutoTokenizer
+    import time
+
+    # 1. Model and Method Selection
+    model_name = "facebook/opt-125m"  # Any Hugging Face model
+    method = "awq"  # Choose: 'awq', 'gptq', or 'gguf'
+
+    # 2. Configure Quantization
+    quant_config = {
+        "bits": 4,              # Quantization bits (2-8)
+        "group_size": 128,      # Size of quantization groups
+        "zero_point": True,     # Zero-point quantization (AWQ)
+        "version": "v2",        # AWQ algorithm version
+        "scale_dtype": "fp32",  # Scale factor data type
+        "batch_size": 4         # Processing batch size
+    }
+
+    # 3. Prepare Calibration Data
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+    # Representative text samples for calibration
+    calibration_texts = [
+        "Translate English to French: Hello, how are you?",
+        "Summarize this text: The quick brown fox jumps over the lazy dog",
+        "What is the capital of France?",
+        "Write a short story about a robot learning to paint.",
+        "Explain quantum computing in simple terms."
+    ]
+
+    # Tokenize with proper padding and attention masks
+    inputs = tokenizer(
+        calibration_texts,
+        padding=True,
+        truncation=True,
+        max_length=512,
+        return_tensors="pt"
+    )
 
-    # Example: Quantizing facebook/opt-125m using AWQ
-    model_name = "facebook/opt-125m"
-    method = "awq"  # Can be 'awq', 'gptq', or 'gguf'
+    # 4. Model Quantization with Error Handling
+    try:
+        print("Starting quantization process...")
+        start_time = time.time()
+
+        # Perform quantization
+        quantized_model, tokenizer = QuantLLM.quantize_from_pretrained(
+            model_name=model_name,
+            method=method,
+            quant_config_dict=quant_config,
+            calibration_data=inputs["input_ids"],
+            calibration_steps=50,
+            device="cuda" if torch.cuda.is_available() else "cpu"
+        )
+
+        print(f"Quantization completed in {time.time() - start_time:.2f} seconds")
+
+        # 5. Model Validation
+        test_input = "Translate this to French: The weather is beautiful today."
+        test_inputs = tokenizer(test_input, return_tensors="pt").to(quantized_model.device)
+
+        with torch.no_grad():
+            outputs = quantized_model.generate(
+                **test_inputs,
+                max_length=50,
+                num_return_sequences=1,
+                do_sample=True,
+                temperature=0.7
+            )
+
+        result = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        print(f"Test Output: {result}")
+
+        # 6. Save Quantized Model (Optional)
+        save_path = "./quantized_model"
+        quantized_model.save_pretrained(save_path)
+        tokenizer.save_pretrained(save_path)
+        print(f"Model saved to {save_path}")
+
+    except Exception as e:
+        print(f"Error during quantization: {str(e)}")
+        raise
 
     # Define quantization configuration
    quant_config = {
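The `bits` and `group_size` settings in this example are what drive the memory savings described in the docs: group-wise quantization stores packed low-bit weights plus one scale (and, with `zero_point=True`, one zero point) per group. A back-of-the-envelope sketch of that arithmetic, assuming fp32 scales and zero points per group (`quantized_size_bytes` is a hypothetical helper, not QuantLLM's actual storage layout):

    # Illustrative estimate only; real quantizer layouts differ.
    def quantized_size_bytes(n_params, bits=4, group_size=128,
                             scale_bytes=4, zero_point_bytes=4):
        packed = n_params * bits / 8        # packed low-bit weights
        groups = n_params / group_size      # one scale + zero point per group
        return packed + groups * (scale_bytes + zero_point_bytes)

    n = 125_000_000  # facebook/opt-125m, roughly
    fp16_bytes = 2 * n
    q4_bytes = quantized_size_bytes(n)
    print(f"fp16 ~{fp16_bytes / 1e6:.0f} MB, int4 ~{q4_bytes / 1e6:.0f} MB, "
          f"{100 * (1 - q4_bytes / fp16_bytes):.0f}% smaller")

A larger `group_size` reduces the per-group overhead but shares one scale across more weights, which is the usual accuracy trade-off.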

docs/api_reference/trainer.rst

Lines changed: 62 additions & 0 deletions
@@ -1,6 +1,9 @@
 Trainer API
 ==========
 
+QuantLLM provides a comprehensive training API with built-in support for quantization,
+efficient fine-tuning, and progress tracking.
+
 Fine-Tuning Trainer
 -----------------
 
@@ -28,6 +31,66 @@ Training Logger
 Example Usage
 -----------
 
+Complete Training Pipeline
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+    from quantllm import (
+        Model, ModelConfig,
+        FineTuningTrainer, TrainingConfig,
+        TrainingLogger, CheckpointManager
+    )
+
+    # Initialize logger for beautiful progress display
+    logger = TrainingLogger()
+
+    # Configure model with advanced optimizations
+    model_config = ModelConfig(
+        model_name="facebook/opt-125m",
+        load_in_4bit=True,           # Memory efficient!
+        use_lora=True,               # Parameter efficient!
+        gradient_checkpointing=True  # Training efficient!
+    )
+    model = Model(model_config).get_model()
+
+    # Initialize training with rich features
+    training_config = TrainingConfig(
+        learning_rate=2e-4,
+        num_epochs=3,
+        batch_size=8,
+        gradient_accumulation_steps=4,
+        # Advanced features
+        warmup_ratio=0.1,
+        evaluation_strategy="steps",
+        eval_steps=100,
+        save_strategy="epoch",
+        logging_steps=10,
+        # Mixed precision training
+        fp16=True,
+        # Multi-GPU support
+        ddp_find_unused_parameters=False
+    )
+
+    # Set up checkpointing
+    checkpoint_manager = CheckpointManager(
+        checkpoint_dir="./checkpoints",
+        save_total_limit=3
+    )
+
+    # Initialize and train
+    trainer = FineTuningTrainer(
+        model=model,
+        training_config=training_config,
+        train_dataloader=train_loader,
+        eval_dataloader=val_loader,
+        logger=logger,
+        checkpoint_manager=checkpoint_manager
+    )
+
+    # Start training with full monitoring
+    trainer.train()
+
 Basic Training
 ~~~~~~~~~~~~
 
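The pipeline above assumes `train_loader` and `val_loader` already exist. A minimal sketch of how they might be prepared with standard `datasets`, `transformers`, and PyTorch APIs (the dataset, column name, and batch size are illustrative, not mandated by the trainer):

    from datasets import load_dataset
    from torch.utils.data import DataLoader
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")

    def tokenize(batch):
        # Fixed-length padding so the default collator can stack tensors
        return tokenizer(batch["text"], padding="max_length",
                         truncation=True, max_length=512)

    data = load_dataset("imdb").map(tokenize, batched=True)
    data.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

    train_loader = DataLoader(data["train"], batch_size=8, shuffle=True)
    val_loader = DataLoader(data["test"], batch_size=8)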

docs/getting_started.rst

Lines changed: 51 additions & 14 deletions
@@ -4,48 +4,90 @@ Getting Started
 Quick Start
 ----------
 
-This guide will help you get started with QuantLLM quickly. Here's a simple example of fine-tuning a language model:
+QuantLLM is designed to make working with large language models more accessible and efficient. Here's a complete example showcasing its key features:
 
 .. code-block:: python
 
     from quantllm import (
         Model, ModelConfig,
         LoadDataset, DatasetConfig,
-        FineTuningTrainer, TrainingConfig
+        FineTuningTrainer, TrainingConfig,
+        TrainingLogger
     )
 
-    # 1. Load and configure model
+    # Initialize logger for rich progress tracking
+    logger = TrainingLogger()  # This will display the ASCII art logo!
+
+    # 1. Load and configure model with best practices
     model_config = ModelConfig(
         model_name="facebook/opt-125m",
-        load_in_4bit=True  # Enable 4-bit quantization
+        load_in_4bit=True,           # Enable memory-efficient 4-bit quantization
+        use_lora=True,               # Enable parameter-efficient fine-tuning
+        gradient_checkpointing=True  # Reduce memory usage during training
     )
     model = Model(model_config).get_model()
 
-    # 2. Load and prepare dataset
+    # 2. Load and prepare dataset with automatic preprocessing
     dataset = LoadDataset().load_hf_dataset("imdb")
+    dataset_config = DatasetConfig(
+        text_column="text",
+        label_column="label",
+        max_length=512
+    )
 
-    # 3. Configure training
+    # 3. Configure training with optimized defaults
    training_config = TrainingConfig(
         learning_rate=2e-4,
         num_epochs=3,
-        batch_size=8
+        batch_size=8,
+        gradient_accumulation_steps=4,  # For larger effective batch sizes
+        warmup_ratio=0.1,               # Gradual learning rate warmup
+        evaluation_strategy="steps",    # Regular evaluation during training
+        eval_steps=100
     )
 
-    # 4. Train model
+    # 4. Initialize trainer with progress tracking
     trainer = FineTuningTrainer(
         model=model,
-        training_config=training_config
+        training_config=training_config,
+        logger=logger  # Enable rich progress tracking
     )
+
+    # 5. Start training with automatic hardware optimization
     trainer.train()
 
 Core Features
 ------------
 
-* **Efficient Quantization**: 4-bit and 8-bit quantization support
-* **Hardware Optimization**: Automatic hardware detection and optimization
-* **LoRA Integration**: Parameter-efficient fine-tuning
-* **Progress Tracking**: Rich logging and visualization
-* **Easy Deployment**: Simple export and deployment options
+* **Advanced Quantization**
+
+  * 4-bit and 8-bit quantization for up to 75% memory reduction
+  * Automatic format selection based on your hardware
+  * Zero-shot quantization with minimal accuracy loss
+
+* **Efficient Fine-tuning**
+
+  * LoRA support for parameter-efficient training
+  * Gradient checkpointing for reduced memory usage
+  * Automatic mixed precision training
+
+* **Hardware Optimization**
+
+  * Automatic hardware detection (CUDA, MPS, CPU)
+  * Optimal settings for your specific GPU
+  * CPU offloading for large models
+
+* **Rich Progress Tracking**
+
+  * Beautiful terminal-based progress display
+  * Detailed training metrics and logs
+  * Integration with WandB and TensorBoard
+
+* **Production Ready**
+
+  * Simple export to ONNX and TorchScript
+  * Quantized model deployment
+  * GPU and CPU inference optimization
 
 Key Concepts
 -----------
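The "Automatic hardware detection (CUDA, MPS, CPU)" bullet corresponds to a standard PyTorch pattern; a sketch of the kind of logic such a feature wraps (not QuantLLM's actual implementation):

    import torch

    def resolve_device() -> torch.device:
        """Prefer CUDA, then Apple's MPS backend, then CPU."""
        if torch.cuda.is_available():
            return torch.device("cuda")
        if torch.backends.mps.is_available():
            return torch.device("mps")
        return torch.device("cpu")

    print(resolve_device())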

quantllm/__init__.py

Lines changed: 31 additions & 7 deletions
@@ -11,8 +11,23 @@
     TrainingLogger
 )
 from .hub import HubManager, CheckpointManager
-from .utils.optimizations import get_optimal_training_settings
-from .utils.log_config import configure_logging, enable_logging
+from .utils import (
+    get_optimal_training_settings,
+    configure_logging,
+    enable_logging,
+    QuantizationBenchmark
+)
+from .api import QuantLLM
+
+from .quant import (
+    QuantizationConfig,
+    QuantizationEngine,
+    QuantizedLinear,
+    GGUFQuantizer,
+    GPTQQuantizer,
+    AWQQuantizer
+)
+
 
 from .config import (
     ModelConfig,
@@ -53,17 +68,26 @@
     "ModelConfig",
     "DatasetConfig",
     "TrainingConfig",
+    "QuantizationBenchmark",
 
     # Utilities
     "get_optimal_training_settings",
     "configure_logging",
     "enable_logging",
+
+    # Quantization
+    "QuantizationConfig",
+    "QuantizationEngine",
+    "QuantizedLinear",
+    "GGUFQuantizer",
+    "GPTQQuantizer",
+    "AWQQuantizer",
+
+    # API
+    "QuantLLM"
 ]
 
+
 # Initialize package-level logger with fancy welcome message
 logger = TrainingLogger()
-logger.log_success(f"""
-✨ QuantLLM v{__version__} initialized successfully ✨
-🚀 Efficient Quantized Language Model Fine-Tuning
-📚 Documentation: https://github.com/codewithdark-git/QuantLLM
-""")
+logger.log_welcome_message()
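Since this hunk widens the package's public surface, here is a quick sanity sketch for verifying the new re-exports (assumes an installed build containing this commit):

    import quantllm

    # Names this commit adds to the top-level namespace and __all__
    expected = {"QuantLLM", "QuantizationConfig", "QuantizationEngine",
                "QuantizedLinear", "GGUFQuantizer", "GPTQQuantizer",
                "AWQQuantizer", "QuantizationBenchmark"}
    missing = expected - set(quantllm.__all__)
    print("missing from __all__:", missing or "none")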
