diff --git a/llm-training/README.md b/llm-training/README.md new file mode 100644 index 000000000..3b987a3b0 --- /dev/null +++ b/llm-training/README.md @@ -0,0 +1,404 @@ +# Trainable LLM AI - Qwen2.5 1B Model Training + +Complete setup for training 1 billion parameter models using **Unsloth** - the fastest and most memory-efficient LLM training framework. + +## Features + +- **2x faster** training than Hugging Face +- **70% less VRAM** usage +- **Single GPU** training (6-8GB VRAM minimum) +- **QLoRA, LoRA, and full fine-tuning** support +- **Multi-capability**: Code, math, reasoning, multilingual + +## Quick Start + +### 1. System Requirements + +**Minimum:** +- GPU: 6GB VRAM (e.g., RTX 2060, GTX 1660 Ti) +- Python: 3.9 - 3.13 +- CUDA: 11.8 or 12.1+ + +**Recommended:** +- GPU: 12GB+ VRAM (e.g., RTX 3060, RTX 4060 Ti) +- Python: 3.11 +- CUDA: 12.1 + +**Check your system:** +```bash +python --version +nvidia-smi +nvcc --version # Check CUDA version +``` + +### 2. Installation + +#### Step 1: Install PyTorch (if not installed) + +```bash +# For CUDA 12.1 (recommended) +pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 + +# For CUDA 11.8 +pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 +``` + +#### Step 2: Install Unsloth + +```bash +pip install unsloth +``` + +**Or install from requirements file:** +```bash +cd llm-training +pip install -r requirements.txt +``` + +#### Step 3: Verify Installation + +```bash +python -c "from unsloth import FastLanguageModel; print('Unsloth installed successfully!')" +``` + +### 3. Train Your First Model + +```bash +cd llm-training +python train_qwen_1b.py +``` + +**Training will:** +1. Download Qwen2.5-0.5B model (smallest 1B-class model) +2. Apply QLoRA for efficient training +3. Fine-tune on Alpaca dataset +4. Save LoRA adapter and merged model +5. 
Run inference test
+
+**Expected VRAM usage:**
+- Qwen2.5-0.5B: ~4-6GB
+- Qwen2.5-1.5B: ~6-8GB
+
+## Configuration
+
+Edit `train_qwen_1b.py` to customize:
+
+### Model Selection
+
+```python
+# Smallest (0.5B parameters) - Fastest training
+MODEL_NAME = "unsloth/Qwen2.5-0.5B-bnb-4bit"
+
+# Medium (1.5B parameters) - Better quality
+MODEL_NAME = "unsloth/Qwen2.5-1.5B-bnb-4bit"
+
+# Or use base models (non-quantized)
+MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
+MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
+```
+
+### Training Mode
+
+```python
+# QLoRA (recommended - most efficient)
+LOAD_IN_4BIT = True
+FULL_FINETUNING = False
+
+# 8-bit training (alternative)
+LOAD_IN_8BIT = True
+LOAD_IN_4BIT = False
+FULL_FINETUNING = False
+
+# Full fine-tuning (requires 16GB+ VRAM)
+LOAD_IN_4BIT = False
+FULL_FINETUNING = True
+```
+
+### LoRA Parameters
+
+```python
+LORA_R = 16        # Rank: 8, 16, 32, 64 (higher = more capacity, more VRAM)
+LORA_ALPHA = 16    # Alpha: typically same as rank
+TARGET_MODULES = [ # Which layers to train
+    "q_proj", "k_proj", "v_proj", "o_proj",
+    "gate_proj", "up_proj", "down_proj",
+]
+```
+
+### Dataset Options
+
+#### Use Built-in Datasets
+
+```python
+# Alpaca (general instruction following)
+DATASET_NAME = "yahma/alpaca-cleaned"
+
+# Code training
+DATASET_NAME = "iamtarun/python_code_instructions_18k_alpaca"
+
+# Chat/conversation
+DATASET_NAME = "OpenAssistant/oasst1"
+
+# Medical
+DATASET_NAME = "medalpaca/medical_meadow_medical_flashcards"
+```
+
+#### Use Custom Dataset
+
+```python
+from datasets import load_dataset
+
+# From local JSON/CSV
+dataset = load_dataset("json", data_files="your_data.json")
+dataset = load_dataset("csv", data_files="your_data.csv")
+
+# From Hugging Face Hub
+dataset = load_dataset("your-username/your-dataset")
+```
+
+**Your dataset should have this format:**
+```json
+[
+  {
+    "instruction": "Write a function to...",
+    "input": "",
+    "output": "def function():\n    ..."
+  }
+]
+```
+
+Records in this format still need to be rendered into a single `text` field before training; see `formatting_prompts_func` in `train_qwen_1b.py` and the custom-dataset formatting example near the end of this README.
+
+## Advanced Usage
+
+### 1. Multi-GPU Training
+
+```python
+# In train_qwen_1b.py, add to SFTConfig:
+args = SFTConfig(
+    # ... other args ...
+    ddp_find_unused_parameters=False,  # For multi-GPU
+)
+```
+
+```bash
+# Run with torchrun
+torchrun --nproc_per_node=2 train_qwen_1b.py
+```
+
+### 2. Save to GGUF (for llama.cpp)
+
+```python
+# Add after training
+model.save_pretrained_gguf(
+    "outputs/qwen-gguf",
+    tokenizer,
+    quantization_method="q4_k_m",  # or "q8_0", "q5_k_m"
+)
+```
+
+### 3. Export Merged 16-bit Weights (for vLLM)
+
+```python
+# vLLM can load a standard 16-bit merged checkpoint directly
+model.save_pretrained_merged(
+    "outputs/qwen-vllm",
+    tokenizer,
+    save_method="merged_16bit",
+)
+```
+
+### 4. Inference Only
+
+```python
+from unsloth import FastLanguageModel
+
+# Load your trained model
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name="./outputs/qwen-finetuned/merged_model",
+    max_seq_length=2048,
+    dtype=None,
+    load_in_4bit=True,
+)
+
+# Enable inference mode
+FastLanguageModel.for_inference(model)
+
+# Generate
+inputs = tokenizer("Your prompt here", return_tensors="pt").to("cuda")
+outputs = model.generate(**inputs, max_new_tokens=256)
+print(tokenizer.decode(outputs[0]))
+```
+
+### 5. Continue Training from Checkpoint
+
+```python
+# In train_qwen_1b.py, pass the checkpoint path to trainer.train()
+# (setting resume_from_checkpoint inside SFTConfig alone does not resume training):
+trainer.train(resume_from_checkpoint="./outputs/qwen-finetuned/checkpoint-500")
+```
+
+### 6. 
Experiment Tracking + +**Weights & Biases:** +```bash +pip install wandb +wandb login +``` + +```python +# In train_qwen_1b.py, add to SFTConfig: +args = SFTConfig( + # ... other args ... + report_to=["wandb"], +) +``` + +**TensorBoard:** +```bash +pip install tensorboard +tensorboard --logdir=./outputs/qwen-finetuned +``` + +```python +args = SFTConfig( + # ... other args ... + report_to=["tensorboard"], +) +``` + +## Model Comparison + +| Model | Params | VRAM (4-bit) | Speed | Quality | Best For | +|-------|--------|--------------|-------|---------|----------| +| Qwen2.5-0.5B | 0.5B | 4-6GB | Fastest | Good | Testing, simple tasks | +| Qwen2.5-1.5B | 1.5B | 6-8GB | Fast | Better | General use, code | +| Qwen2.5-3B | 3B | 10-12GB | Medium | Great | Complex reasoning | +| Qwen2.5-7B | 7B | 16-20GB | Slower | Excellent | Production | + +## Troubleshooting + +### Out of Memory (OOM) + +1. **Reduce batch size:** + ```python + PER_DEVICE_BATCH_SIZE = 1 + GRADIENT_ACCUMULATION_STEPS = 8 + ``` + +2. **Reduce sequence length:** + ```python + MAX_SEQ_LENGTH = 1024 # or 512 + ``` + +3. **Use smaller LoRA rank:** + ```python + LORA_R = 8 + ``` + +4. **Enable gradient checkpointing:** + ```python + use_gradient_checkpointing = "unsloth" # Already enabled + ``` + +### Slow Training + +1. **Check GPU utilization:** + ```bash + watch -n 1 nvidia-smi + ``` + +2. **Enable packing (for short sequences):** + ```python + packing = True # In SFTConfig + ``` + +3. **Increase batch size:** + ```python + PER_DEVICE_BATCH_SIZE = 4 # If you have VRAM + ``` + +### Installation Issues + +**Windows users:** +- Install Visual Studio C++ +- Install CUDA Toolkit +- Use `pip install unsloth` (requires PyTorch pre-installed) + +**Linux/Mac:** +- Use `pip install unsloth` directly + +**CUDA version mismatch:** +```bash +# Check your CUDA version +nvcc --version + +# Install matching PyTorch +pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 +``` + +## Resources + +- **Unsloth Docs**: https://docs.unsloth.ai +- **Unsloth GitHub**: https://github.com/unslothai/unsloth +- **Qwen Models**: https://huggingface.co/Qwen +- **Example Notebooks**: https://github.com/unslothai/notebooks +- **Discord Support**: https://discord.com/invite/unsloth + +## Performance Benchmarks + +**Training Speed (vs Hugging Face + Flash Attention):** +- Qwen2.5-1.5B: **2x faster**, 70% less VRAM +- Context length: **12x longer** on same GPU + +**Single GPU Training Capacity:** + +| GPU | VRAM | Max Model | Max Context | +|-----|------|-----------|-------------| +| RTX 2060 | 6GB | 1.5B (4-bit) | 4K | +| RTX 3060 | 12GB | 3B (4-bit) | 16K | +| RTX 4090 | 24GB | 7B (4-bit) | 64K | +| A100 | 80GB | 70B (4-bit) | 128K | + +## Example: Train a Code Assistant + +```python +# In train_qwen_1b.py, modify: + +# Use code dataset +DATASET_NAME = "iamtarun/python_code_instructions_18k_alpaca" + +# Adjust for code generation +MAX_SEQ_LENGTH = 4096 # Longer for code +MAX_STEPS = 1000 # More steps + +# Run training +# python train_qwen_1b.py +``` + +## License + +This training setup uses: +- **Unsloth**: Apache 2.0 License +- **Qwen2.5 Models**: Apache 2.0 License (check specific model cards) + +## Citation + +If you use this in your research: + +```bibtex +@software{unsloth, + author = {Daniel Han, Michael Han and Unsloth team}, + title = {Unsloth}, + url = {http://github.com/unslothai/unsloth}, + year = {2023} +} +``` + +## Contributing + +Found a bug or want to improve the training script? Open an issue or PR! 
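+
+## Example: Formatting a Custom Dataset
+
+The trainer reads a single `text` column, so instruction/input/output records (the JSON format shown under "Use Custom Dataset") must be rendered into prompt strings first. Below is a minimal sketch, assuming your file is named `your_data.json` and follows that format; it mirrors `formatting_prompts_func` from `train_qwen_1b.py`, and `tokenizer` is the one returned by `FastLanguageModel.from_pretrained`.
+
+```python
+from datasets import load_dataset
+
+dataset = load_dataset("json", data_files="your_data.json", split="train")
+
+prompt_template = """### Instruction:
+{}
+
+### Input:
+{}
+
+### Response:
+{}"""
+
+def to_text(examples):
+    # Render each record into one training string and append EOS so the model learns to stop
+    texts = [
+        prompt_template.format(ins, inp, out) + tokenizer.eos_token
+        for ins, inp, out in zip(examples["instruction"], examples["input"], examples["output"])
+    ]
+    return {"text": texts}
+
+dataset = dataset.map(to_text, batched=True)
+```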
+ +--- + +**Happy Training! 🦄** diff --git a/llm-training/config.py b/llm-training/config.py new file mode 100755 index 000000000..3a4325663 --- /dev/null +++ b/llm-training/config.py @@ -0,0 +1,192 @@ +""" +Training Configuration Template +================================ + +Copy this file and modify for your use case: + cp config.py my_config.py + +Then import in your training script: + from my_config import TrainingConfig +""" + +from dataclasses import dataclass +from typing import List, Optional + + +@dataclass +class TrainingConfig: + """Configuration for LLM fine-tuning with Unsloth""" + + # ========================================================================= + # Model Configuration + # ========================================================================= + + # Model to fine-tune + model_name: str = "unsloth/Qwen2.5-0.5B-bnb-4bit" + + # Available Qwen models: + # - "unsloth/Qwen2.5-0.5B-bnb-4bit" (0.5B, 4-bit quantized) + # - "unsloth/Qwen2.5-1.5B-bnb-4bit" (1.5B, 4-bit quantized) + # - "Qwen/Qwen2.5-0.5B-Instruct" (0.5B, base) + # - "Qwen/Qwen2.5-1.5B-Instruct" (1.5B, base) + # - "Qwen/Qwen2.5-3B-Instruct" (3B, base) + # - "Qwen/Qwen2.5-7B-Instruct" (7B, base) + + max_seq_length: int = 2048 # Max context window (up to 32K for Qwen) + dtype: Optional[str] = None # Auto-detect optimal dtype + load_in_4bit: bool = True # 4-bit quantization (recommended) + load_in_8bit: bool = False # 8-bit quantization + full_finetuning: bool = False # Full parameter fine-tuning + + # ========================================================================= + # LoRA Configuration + # ========================================================================= + + lora_r: int = 16 # LoRA rank (8, 16, 32, 64) + lora_alpha: int = 16 # LoRA alpha (typically = lora_r) + lora_dropout: float = 0.0 # Dropout (0 is optimized) + + target_modules: List[str] = None # Will be set in __post_init__ + + def __post_init__(self): + if self.target_modules is None: + self.target_modules = [ + "q_proj", "k_proj", "v_proj", "o_proj", + "gate_proj", "up_proj", "down_proj", + ] + + # ========================================================================= + # Training Hyperparameters + # ========================================================================= + + # Batch size and gradient accumulation + per_device_train_batch_size: int = 2 + gradient_accumulation_steps: int = 4 + # Effective batch size = per_device_train_batch_size * gradient_accumulation_steps + # = 2 * 4 = 8 + + # Optimization + learning_rate: float = 2e-4 + weight_decay: float = 0.01 + warmup_steps: int = 10 + max_steps: int = 500 # -1 for full dataset + lr_scheduler_type: str = "linear" # or "cosine", "constant" + optim: str = "adamw_8bit" # or "adamw_torch", "sgd" + + # Precision + # fp16/bf16 will be auto-detected based on GPU capability + + # ========================================================================= + # Dataset Configuration + # ========================================================================= + + dataset_name: str = "yahma/alpaca-cleaned" + + # Popular datasets: + # - "yahma/alpaca-cleaned" (General instruction) + # - "iamtarun/python_code_instructions_18k_alpaca" (Python code) + # - "OpenAssistant/oasst1" (Chat) + # - "timdettmers/openassistant-guanaco" (Chat) + # - "medalpaca/medical_meadow_medical_flashcards" (Medical) + # - "databricks/databricks-dolly-15k" (General) + + dataset_split: str = "train" + dataset_text_field: str = "text" + + # ========================================================================= + 
# Logging and Checkpointing
+    # =========================================================================
+
+    output_dir: str = "./outputs/qwen-finetuned"
+    logging_steps: int = 1
+    save_steps: int = 100
+    save_total_limit: int = 3  # Keep only last 3 checkpoints
+
+    # Experiment tracking
+    report_to: List[str] = None  # ["tensorboard"] or ["wandb"]
+
+    # =========================================================================
+    # Other Settings
+    # =========================================================================
+
+    seed: int = 3407
+    packing: bool = False  # Pack short sequences for efficiency
+
+    # Gradient checkpointing (memory saving)
+    use_gradient_checkpointing: str = "unsloth"  # or True, False
+
+
+# ============================================================================
+# Preset Configurations
+# ============================================================================
+# Note: each preset is decorated with @dataclass so that its field overrides
+# actually take effect when the preset is instantiated.
+
+@dataclass
+class FastTrainingConfig(TrainingConfig):
+    """Fast training for testing/prototyping"""
+    model_name: str = "unsloth/Qwen2.5-0.5B-bnb-4bit"
+    max_steps: int = 100
+    save_steps: int = 50
+    lora_r: int = 8
+
+
+@dataclass
+class BalancedConfig(TrainingConfig):
+    """Balanced speed/quality for general use"""
+    model_name: str = "unsloth/Qwen2.5-1.5B-bnb-4bit"
+    max_steps: int = 500
+    lora_r: int = 16
+    max_seq_length: int = 2048
+
+
+@dataclass
+class QualityConfig(TrainingConfig):
+    """Higher quality training (slower, more VRAM)"""
+    model_name: str = "unsloth/Qwen2.5-1.5B-bnb-4bit"
+    max_steps: int = 1000
+    lora_r: int = 32
+    max_seq_length: int = 4096
+    per_device_train_batch_size: int = 1
+    gradient_accumulation_steps: int = 8
+
+
+@dataclass
+class CodeTrainingConfig(TrainingConfig):
+    """Optimized for code generation"""
+    model_name: str = "unsloth/Qwen2.5-1.5B-bnb-4bit"
+    dataset_name: str = "iamtarun/python_code_instructions_18k_alpaca"
+    max_seq_length: int = 4096
+    max_steps: int = 1000
+    lora_r: int = 32
+
+
+@dataclass
+class ChatConfig(TrainingConfig):
+    """Optimized for conversational AI"""
+    model_name: str = "unsloth/Qwen2.5-1.5B-bnb-4bit"
+    dataset_name: str = "OpenAssistant/oasst1"
+    max_seq_length: int = 2048
+    max_steps: int = 1000
+
+
+# ============================================================================
+# Usage Example
+# ============================================================================
+
+if __name__ == "__main__":
+    # Default config
+    config = TrainingConfig()
+    print("Default Config:")
+    print(f"  Model: {config.model_name}")
+    print(f"  Steps: {config.max_steps}")
+    print(f"  LoRA rank: {config.lora_r}")
+    print()
+
+    # Fast training config
+    fast_config = FastTrainingConfig()
+    print("Fast Config:")
+    print(f"  Model: {fast_config.model_name}")
+    print(f"  Steps: {fast_config.max_steps}")
+    print(f"  LoRA rank: {fast_config.lora_r}")
+    print()
+
+    # Code training config
+    code_config = CodeTrainingConfig()
+    print("Code Config:")
+    print(f"  Model: {code_config.model_name}")
+    print(f"  Dataset: {code_config.dataset_name}")
+    print(f"  Context: {code_config.max_seq_length}")
diff --git a/llm-training/inference.py b/llm-training/inference.py
new file mode 100755
index 000000000..407f3aa51
--- /dev/null
+++ b/llm-training/inference.py
@@ -0,0 +1,253 @@
+#!/usr/bin/env python3
+"""
+Inference with Trained Qwen Models
+===================================
+
+Load and run inference with your fine-tuned models.
+ +Usage: + python inference.py --model ./outputs/qwen-finetuned/merged_model + python inference.py --model unsloth/Qwen2.5-0.5B-bnb-4bit --interactive +""" + +import argparse +import torch +from unsloth import FastLanguageModel + + +def load_model(model_path, max_seq_length=2048, load_in_4bit=True): + """Load model for inference""" + print(f"Loading model from: {model_path}") + + model, tokenizer = FastLanguageModel.from_pretrained( + model_name=model_path, + max_seq_length=max_seq_length, + dtype=None, + load_in_4bit=load_in_4bit, + ) + + # Enable fast inference + FastLanguageModel.for_inference(model) + + print(f"āœ“ Model loaded successfully") + return model, tokenizer + + +def generate_response( + model, + tokenizer, + prompt, + max_new_tokens=256, + temperature=0.7, + top_p=0.9, + do_sample=True, +): + """Generate response for a prompt""" + inputs = tokenizer([prompt], return_tensors="pt").to("cuda") + + outputs = model.generate( + **inputs, + max_new_tokens=max_new_tokens, + temperature=temperature, + top_p=top_p, + do_sample=do_sample, + pad_token_id=tokenizer.pad_token_id, + eos_token_id=tokenizer.eos_token_id, + ) + + response = tokenizer.decode(outputs[0], skip_special_tokens=True) + return response + + +def interactive_mode(model, tokenizer): + """Interactive chat with the model""" + print("\n" + "="*80) + print("INTERACTIVE MODE") + print("="*80) + print("Type your prompts below. Commands:") + print(" /exit - Exit interactive mode") + print(" /clear - Clear conversation") + print(" /params - Change generation parameters") + print("="*80 + "\n") + + # Generation parameters + params = { + "max_new_tokens": 256, + "temperature": 0.7, + "top_p": 0.9, + } + + conversation_history = [] + + while True: + try: + user_input = input("You: ").strip() + + if not user_input: + continue + + # Commands + if user_input == "/exit": + print("Goodbye!") + break + + elif user_input == "/clear": + conversation_history = [] + print("Conversation cleared.") + continue + + elif user_input == "/params": + print(f"\nCurrent parameters:") + print(f" max_new_tokens: {params['max_new_tokens']}") + print(f" temperature: {params['temperature']}") + print(f" top_p: {params['top_p']}") + + try: + params['max_new_tokens'] = int(input("max_new_tokens (64-512): ") or params['max_new_tokens']) + params['temperature'] = float(input("temperature (0.1-2.0): ") or params['temperature']) + params['top_p'] = float(input("top_p (0.1-1.0): ") or params['top_p']) + print("āœ“ Parameters updated") + except: + print("Invalid input, keeping current parameters") + continue + + # Generate response + conversation_history.append(f"User: {user_input}") + prompt = "\n".join(conversation_history) + "\nAssistant:" + + response = generate_response( + model, tokenizer, prompt, + max_new_tokens=params['max_new_tokens'], + temperature=params['temperature'], + top_p=params['top_p'], + ) + + # Extract assistant's response + assistant_response = response[len(prompt):].strip() + conversation_history.append(f"Assistant: {assistant_response}") + + print(f"Assistant: {assistant_response}\n") + + except KeyboardInterrupt: + print("\nGoodbye!") + break + + +def batch_mode(model, tokenizer, prompts_file): + """Process multiple prompts from a file""" + print(f"Processing prompts from: {prompts_file}") + + with open(prompts_file, 'r') as f: + prompts = [line.strip() for line in f if line.strip()] + + results = [] + for i, prompt in enumerate(prompts, 1): + print(f"\n[{i}/{len(prompts)}] Processing: {prompt[:50]}...") + response = 
generate_response(model, tokenizer, prompt) + results.append({"prompt": prompt, "response": response}) + + # Save results + output_file = prompts_file.replace(".txt", "_responses.txt") + with open(output_file, 'w') as f: + for result in results: + f.write(f"PROMPT: {result['prompt']}\n") + f.write(f"RESPONSE: {result['response']}\n") + f.write("-" * 80 + "\n") + + print(f"\nāœ“ Results saved to: {output_file}") + + +def main(): + parser = argparse.ArgumentParser(description="Inference with Qwen models") + parser.add_argument( + "--model", + type=str, + default="./outputs/qwen-finetuned/merged_model", + help="Path to model or Hugging Face model name" + ) + parser.add_argument( + "--prompt", + type=str, + help="Single prompt to generate response for" + ) + parser.add_argument( + "--interactive", + action="store_true", + help="Launch interactive chat mode" + ) + parser.add_argument( + "--batch", + type=str, + help="Path to file with prompts (one per line)" + ) + parser.add_argument( + "--max-tokens", + type=int, + default=256, + help="Maximum tokens to generate" + ) + parser.add_argument( + "--temperature", + type=float, + default=0.7, + help="Sampling temperature" + ) + parser.add_argument( + "--top-p", + type=float, + default=0.9, + help="Top-p sampling" + ) + parser.add_argument( + "--4bit", + dest="load_in_4bit", + action="store_true", + default=True, + help="Load in 4-bit mode (default)" + ) + + args = parser.parse_args() + + # Check GPU + if not torch.cuda.is_available(): + print("WARNING: No GPU detected. Inference will be very slow.") + + # Load model + model, tokenizer = load_model( + args.model, + load_in_4bit=args.load_in_4bit + ) + + # Run appropriate mode + if args.interactive: + interactive_mode(model, tokenizer) + + elif args.batch: + batch_mode(model, tokenizer, args.batch) + + elif args.prompt: + print(f"\nPrompt: {args.prompt}\n") + response = generate_response( + model, tokenizer, args.prompt, + max_new_tokens=args.max_tokens, + temperature=args.temperature, + top_p=args.top_p, + ) + print(f"Response:\n{response}\n") + + else: + # Default: single example + example_prompt = "Write a Python function to calculate Fibonacci numbers." + print(f"\nExample prompt: {example_prompt}\n") + response = generate_response( + model, tokenizer, example_prompt, + max_new_tokens=args.max_tokens, + temperature=args.temperature, + top_p=args.top_p, + ) + print(f"Response:\n{response}\n") + print("Tip: Use --interactive for chat mode or --prompt for custom prompts") + + +if __name__ == "__main__": + main() diff --git a/llm-training/quickstart.py b/llm-training/quickstart.py new file mode 100755 index 000000000..b3445b9e3 --- /dev/null +++ b/llm-training/quickstart.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python3 +""" +Quickstart: Train Qwen2.5 in 5 Minutes +======================================= + +Minimal example to get started with Unsloth training. + +Run: python quickstart.py +""" + +from unsloth import FastLanguageModel +import torch + +print("="*80) +print("QWEN 2.5 QUICKSTART TRAINING") +print("="*80) + +# Step 1: Check GPU +print("\n[1/6] Checking GPU...") +if not torch.cuda.is_available(): + print("ERROR: No GPU detected. 
This script requires a CUDA-capable GPU.") + exit(1) +print(f"āœ“ GPU: {torch.cuda.get_device_name(0)}") +print(f"āœ“ VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB") + +# Step 2: Load Model +print("\n[2/6] Loading model (this may take a few minutes)...") +model, tokenizer = FastLanguageModel.from_pretrained( + model_name="unsloth/Qwen2.5-0.5B-bnb-4bit", # Smallest model + max_seq_length=512, # Short for fast training + dtype=None, + load_in_4bit=True, +) +print("āœ“ Model loaded") + +# Step 3: Apply LoRA +print("\n[3/6] Applying LoRA...") +model = FastLanguageModel.get_peft_model( + model, + r=8, # Small rank for speed + target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], + lora_alpha=8, + lora_dropout=0, + bias="none", + use_gradient_checkpointing="unsloth", + random_state=3407, +) +print("āœ“ LoRA applied") + +# Step 4: Prepare Tiny Dataset +print("\n[4/6] Preparing dataset...") +from datasets import Dataset + +# Create a tiny demo dataset +data = { + "text": [ + "Explain AI in simple terms.\nAI is artificial intelligence, the simulation of human intelligence by machines.", + "What is Python?\nPython is a high-level programming language known for its simplicity.", + "How does machine learning work?\nMachine learning uses data to train models that can make predictions.", + ] +} +dataset = Dataset.from_dict(data) +print(f"āœ“ Created dataset with {len(dataset)} examples") + +# Step 5: Train +print("\n[5/6] Training (this will be quick with only 10 steps)...") +from trl import SFTTrainer, SFTConfig + +trainer = SFTTrainer( + model=model, + train_dataset=dataset, + tokenizer=tokenizer, + args=SFTConfig( + per_device_train_batch_size=1, + gradient_accumulation_steps=1, + warmup_steps=2, + max_steps=10, # Very short for demo + learning_rate=2e-4, + logging_steps=1, + output_dir="./outputs/quickstart", + optim="adamw_8bit", + fp16=not torch.cuda.is_bf16_supported(), + bf16=torch.cuda.is_bf16_supported(), + max_seq_length=512, + dataset_text_field="text", + ), +) + +trainer.train() +print("āœ“ Training complete") + +# Step 6: Test +print("\n[6/6] Testing inference...") +FastLanguageModel.for_inference(model) + +prompt = "What is machine learning?" +inputs = tokenizer([prompt], return_tensors="pt").to("cuda") +outputs = model.generate( + **inputs, + max_new_tokens=64, + temperature=0.7, + do_sample=True, +) + +result = tokenizer.decode(outputs[0], skip_special_tokens=True) + +print("\n" + "="*80) +print("TEST OUTPUT:") +print("="*80) +print(f"Prompt: {prompt}") +print(f"Response: {result[len(prompt):]}") +print("="*80) + +print("\nāœ“ SUCCESS! Your model is trained and working.") +print("\nNext steps:") +print("1. Run full training: python train_qwen_1b.py") +print("2. Customize config: edit config.py") +print("3. 
Read docs: cat README.md") diff --git a/llm-training/requirements.txt b/llm-training/requirements.txt new file mode 100644 index 000000000..55eb5133b --- /dev/null +++ b/llm-training/requirements.txt @@ -0,0 +1,18 @@ +# Unsloth - Efficient LLM Training +unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git + +# Core dependencies (installed with unsloth) +# torch>=2.1.0 +# transformers>=4.51.3 +# trl>=0.18.2 +# peft>=0.7.1 +# datasets>=3.4.1 +# accelerate>=0.34.1 + +# Optional: For experiment tracking +# wandb +# tensorboard + +# Optional: For better performance +# flash-attn # Requires specific CUDA version +# xformers # Installed automatically with unsloth diff --git a/llm-training/setup.sh b/llm-training/setup.sh new file mode 100755 index 000000000..2479aeb35 --- /dev/null +++ b/llm-training/setup.sh @@ -0,0 +1,165 @@ +#!/bin/bash +# +# Unsloth + Qwen2.5 Training Setup Script +# ======================================== +# +# This script sets up the environment for training 1B parameter models +# +# Usage: bash setup.sh +# + +set -e # Exit on error + +echo "========================================================================" +echo "UNSLOTH + QWEN2.5 TRAINING SETUP" +echo "========================================================================" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +# Check Python version +echo -e "\n${YELLOW}[1/6] Checking Python version...${NC}" +PYTHON_VERSION=$(python --version 2>&1 | awk '{print $2}') +PYTHON_MAJOR=$(echo $PYTHON_VERSION | cut -d. -f1) +PYTHON_MINOR=$(echo $PYTHON_VERSION | cut -d. -f2) + +if [ "$PYTHON_MAJOR" -eq 3 ] && [ "$PYTHON_MINOR" -ge 9 ] && [ "$PYTHON_MINOR" -lt 14 ]; then + echo -e "${GREEN}āœ“ Python $PYTHON_VERSION (supported)${NC}" +else + echo -e "${RED}āœ— Python $PYTHON_VERSION (not supported)${NC}" + echo "Please use Python 3.9 to 3.13" + exit 1 +fi + +# Check CUDA +echo -e "\n${YELLOW}[2/6] Checking CUDA...${NC}" +if command -v nvidia-smi &> /dev/null; then + CUDA_VERSION=$(nvcc --version 2>/dev/null | grep "release" | awk '{print $5}' | cut -d',' -f1) + GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader | head -n 1) + GPU_MEMORY=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader | head -n 1) + + echo -e "${GREEN}āœ“ GPU: $GPU_NAME${NC}" + echo -e "${GREEN}āœ“ VRAM: $GPU_MEMORY${NC}" + + if [ -n "$CUDA_VERSION" ]; then + echo -e "${GREEN}āœ“ CUDA: $CUDA_VERSION${NC}" + else + echo -e "${YELLOW}⚠ CUDA toolkit not found (nvcc)${NC}" + echo " You may need to install it for compilation" + fi +else + echo -e "${RED}āœ— No NVIDIA GPU detected${NC}" + echo "This setup requires a CUDA-capable GPU" + exit 1 +fi + +# Check PyTorch +echo -e "\n${YELLOW}[3/6] Checking PyTorch...${NC}" +if python -c "import torch; print(torch.__version__)" &> /dev/null; then + TORCH_VERSION=$(python -c "import torch; print(torch.__version__)") + TORCH_CUDA=$(python -c "import torch; print(torch.version.cuda)") + echo -e "${GREEN}āœ“ PyTorch $TORCH_VERSION (CUDA $TORCH_CUDA)${NC}" + + # Test CUDA + if python -c "import torch; assert torch.cuda.is_available()" &> /dev/null; then + echo -e "${GREEN}āœ“ PyTorch can access GPU${NC}" + else + echo -e "${RED}āœ— PyTorch cannot access GPU${NC}" + echo "Reinstalling PyTorch..." + + # Detect CUDA version and install appropriate PyTorch + if [[ "$CUDA_VERSION" == 12.* ]]; then + echo "Installing PyTorch for CUDA 12.1..." 
+ pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 + elif [[ "$CUDA_VERSION" == 11.8* ]]; then + echo "Installing PyTorch for CUDA 11.8..." + pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 + else + echo -e "${YELLOW}⚠ Unknown CUDA version, installing default PyTorch${NC}" + pip install torch torchvision torchaudio + fi + fi +else + echo -e "${YELLOW}⚠ PyTorch not found, installing...${NC}" + + # Install PyTorch based on CUDA version + if [[ "$CUDA_VERSION" == 12.* ]]; then + echo "Installing PyTorch for CUDA 12.1..." + pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 + elif [[ "$CUDA_VERSION" == 11.8* ]]; then + echo "Installing PyTorch for CUDA 11.8..." + pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 + else + echo "Installing default PyTorch..." + pip install torch torchvision torchaudio + fi +fi + +# Install Unsloth +echo -e "\n${YELLOW}[4/6] Installing Unsloth...${NC}" +if python -c "from unsloth import FastLanguageModel" &> /dev/null; then + UNSLOTH_VERSION=$(python -c "from unsloth.models._utils import __version__; print(__version__)" 2>/dev/null || echo "unknown") + echo -e "${GREEN}āœ“ Unsloth $UNSLOTH_VERSION already installed${NC}" + read -p "Reinstall/update? (y/N): " -n 1 -r + echo + if [[ $REPLY =~ ^[Yy]$ ]]; then + pip install --upgrade --force-reinstall --no-cache-dir unsloth unsloth_zoo + fi +else + echo "Installing Unsloth..." + pip install unsloth +fi + +# Install additional dependencies +echo -e "\n${YELLOW}[5/6] Installing additional dependencies...${NC}" +pip install datasets trl transformers accelerate peft -q + +# Verify installation +echo -e "\n${YELLOW}[6/6] Verifying installation...${NC}" + +# Test Unsloth +if python -c "from unsloth import FastLanguageModel; print('OK')" &> /dev/null; then + echo -e "${GREEN}āœ“ Unsloth${NC}" +else + echo -e "${RED}āœ— Unsloth${NC}" + exit 1 +fi + +# Test datasets +if python -c "import datasets; print('OK')" &> /dev/null; then + echo -e "${GREEN}āœ“ Datasets${NC}" +else + echo -e "${RED}āœ— Datasets${NC}" + exit 1 +fi + +# Test TRL +if python -c "from trl import SFTTrainer; print('OK')" &> /dev/null; then + echo -e "${GREEN}āœ“ TRL${NC}" +else + echo -e "${RED}āœ— TRL${NC}" + exit 1 +fi + +echo "" +echo "========================================================================" +echo -e "${GREEN}āœ“ SETUP COMPLETE!${NC}" +echo "========================================================================" +echo "" +echo "Next steps:" +echo " 1. Quick test: python quickstart.py" +echo " 2. Full train: python train_qwen_1b.py" +echo " 3. Read docs: cat README.md" +echo "" +echo "System summary:" +echo " GPU: $GPU_NAME" +echo " VRAM: $GPU_MEMORY" +echo " Python: $PYTHON_VERSION" +echo " PyTorch: $TORCH_VERSION" +echo " CUDA: $CUDA_VERSION" +echo "" +echo "Happy training! 
🦄" +echo "" diff --git a/llm-training/train_qwen_1b.py b/llm-training/train_qwen_1b.py new file mode 100755 index 000000000..3405fd633 --- /dev/null +++ b/llm-training/train_qwen_1b.py @@ -0,0 +1,240 @@ +#!/usr/bin/env python3 +""" +Qwen2.5-0.5B/1B Training Script using Unsloth +============================================== + +This script fine-tunes Qwen2.5 1B parameter models using Unsloth for: +- 2x faster training +- 70% less VRAM usage +- Support for QLoRA, LoRA, and full fine-tuning + +Requirements: +- GPU with at least 6GB VRAM (8GB+ recommended) +- Python 3.9+ +- CUDA toolkit + +Model options: +- unsloth/Qwen2.5-0.5B-bnb-4bit (smallest, 0.5B params) +- unsloth/Qwen2.5-1.5B-bnb-4bit (1.5B params) +- Qwen/Qwen2.5-0.5B-Instruct (base 0.5B) +- Qwen/Qwen2.5-1.5B-Instruct (base 1.5B) +""" + +from unsloth import FastLanguageModel +import torch +from datasets import load_dataset +from trl import SFTTrainer, SFTConfig + +# ============================================================================ +# Configuration +# ============================================================================ + +# Model Configuration +MODEL_NAME = "unsloth/Qwen2.5-0.5B-bnb-4bit" # Change to Qwen2.5-1.5B for larger model +MAX_SEQ_LENGTH = 2048 # Context window (Qwen supports up to 32K) +LOAD_IN_4BIT = True # Use 4-bit quantization (recommended for efficiency) +LOAD_IN_8BIT = False # Alternative: 8-bit quantization +FULL_FINETUNING = False # Set True for full parameter fine-tuning (requires more VRAM) + +# LoRA Configuration (for efficient fine-tuning) +LORA_R = 16 # LoRA rank (8, 16, 32, 64) +LORA_ALPHA = 16 # LoRA alpha (typically same as rank) +LORA_DROPOUT = 0 # LoRA dropout (0 is optimized for Unsloth) +TARGET_MODULES = [ + "q_proj", "k_proj", "v_proj", "o_proj", + "gate_proj", "up_proj", "down_proj", +] + +# Training Configuration +OUTPUT_DIR = "./outputs/qwen-finetuned" +LOGGING_STEPS = 1 +SAVE_STEPS = 100 +MAX_STEPS = 500 # Total training steps (-1 for full dataset) +PER_DEVICE_BATCH_SIZE = 2 +GRADIENT_ACCUMULATION_STEPS = 4 # Effective batch size = 2 * 4 = 8 +LEARNING_RATE = 2e-4 +WARMUP_STEPS = 10 +WEIGHT_DECAY = 0.01 +LR_SCHEDULER_TYPE = "linear" +OPTIM = "adamw_8bit" # 8-bit Adam optimizer for efficiency + +# Dataset Configuration +DATASET_NAME = "yahma/alpaca-cleaned" # Change to your dataset +DATASET_TEXT_FIELD = "text" # Field containing training text + +# ============================================================================ +# Load Model and Tokenizer +# ============================================================================ + +print(f"Loading model: {MODEL_NAME}") +print(f"CUDA available: {torch.cuda.is_available()}") +if torch.cuda.is_available(): + print(f"GPU: {torch.cuda.get_device_name(0)}") + print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB") + +model, tokenizer = FastLanguageModel.from_pretrained( + model_name=MODEL_NAME, + max_seq_length=MAX_SEQ_LENGTH, + dtype=None, # Auto-detect (Float16 for Tesla T4, V100, Bfloat16 for Ampere+) + load_in_4bit=LOAD_IN_4BIT, + load_in_8bit=LOAD_IN_8BIT, + # token="hf_...", # Uncomment if using gated models +) + +# ============================================================================ +# Apply LoRA/PEFT +# ============================================================================ + +if not FULL_FINETUNING: + print("Applying LoRA for efficient fine-tuning...") + model = FastLanguageModel.get_peft_model( + model, + r=LORA_R, + target_modules=TARGET_MODULES, + lora_alpha=LORA_ALPHA, + 
lora_dropout=LORA_DROPOUT, + bias="none", # "none" is optimized + use_gradient_checkpointing="unsloth", # Use Unsloth's optimized checkpointing + random_state=3407, + max_seq_length=MAX_SEQ_LENGTH, + use_rslora=False, # Rank stabilized LoRA + loftq_config=None, # LoftQ quantization + ) + +# ============================================================================ +# Prepare Dataset +# ============================================================================ + +print(f"Loading dataset: {DATASET_NAME}") + +# Load dataset +dataset = load_dataset(DATASET_NAME, split="train") + +# Format dataset for Alpaca-style prompts +alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. + +### Instruction: +{} + +### Input: +{} + +### Response: +{}""" + +def formatting_prompts_func(examples): + """Format examples into Alpaca prompt template""" + instructions = examples.get("instruction", [""] * len(examples["instruction"])) + inputs = examples.get("input", [""] * len(examples["instruction"])) + outputs = examples.get("output", [""] * len(examples["instruction"])) + + texts = [] + for instruction, input_text, output in zip(instructions, inputs, outputs): + text = alpaca_prompt.format(instruction, input_text, output) + tokenizer.eos_token + texts.append(text) + return {"text": texts} + +# Apply formatting +dataset = dataset.map(formatting_prompts_func, batched=True) + +# ============================================================================ +# Training +# ============================================================================ + +print("Starting training...") + +trainer = SFTTrainer( + model=model, + train_dataset=dataset, + tokenizer=tokenizer, + args=SFTConfig( + # Training parameters + per_device_train_batch_size=PER_DEVICE_BATCH_SIZE, + gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS, + warmup_steps=WARMUP_STEPS, + max_steps=MAX_STEPS, + learning_rate=LEARNING_RATE, + weight_decay=WEIGHT_DECAY, + lr_scheduler_type=LR_SCHEDULER_TYPE, + + # Logging and saving + logging_steps=LOGGING_STEPS, + save_steps=SAVE_STEPS, + output_dir=OUTPUT_DIR, + + # Optimization + optim=OPTIM, + fp16=not torch.cuda.is_bf16_supported(), + bf16=torch.cuda.is_bf16_supported(), + seed=3407, + + # Dataset configuration + max_seq_length=MAX_SEQ_LENGTH, + dataset_text_field=DATASET_TEXT_FIELD, + packing=False, # Can pack short sequences for efficiency + + # Report to (optional) + # report_to=["tensorboard"], # or "wandb" + ), +) + +# Train the model +trainer.train() + +# ============================================================================ +# Save Model +# ============================================================================ + +print("Saving model...") + +# Save LoRA adapter +model.save_pretrained(f"{OUTPUT_DIR}/lora_model") +tokenizer.save_pretrained(f"{OUTPUT_DIR}/lora_model") + +# Save merged model (LoRA + base model) +model.save_pretrained_merged( + f"{OUTPUT_DIR}/merged_model", + tokenizer, + save_method="merged_16bit", # or "merged_4bit", "lora" +) + +print(f"Model saved to {OUTPUT_DIR}") + +# ============================================================================ +# Inference Example +# ============================================================================ + +print("\nTesting inference...") + +# Enable inference mode +FastLanguageModel.for_inference(model) + +# Test prompt +test_prompt = alpaca_prompt.format( + "Write a Python function to calculate factorial", 
# instruction + "", # input + "", # output (to be generated) +) + +inputs = tokenizer([test_prompt], return_tensors="pt").to("cuda") + +# Generate +outputs = model.generate( + **inputs, + max_new_tokens=256, + temperature=0.7, + top_p=0.9, + do_sample=True, +) + +result = tokenizer.decode(outputs[0], skip_special_tokens=True) +print("\n" + "="*80) +print("GENERATED OUTPUT:") +print("="*80) +print(result) +print("="*80) + +print("\nTraining complete!") +print(f"\nTo use your model:") +print(f"1. LoRA adapter: {OUTPUT_DIR}/lora_model") +print(f"2. Merged model: {OUTPUT_DIR}/merged_model")
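+
+# Optional follow-up (see the GGUF section in README.md): export a llama.cpp-compatible
+# copy of the merged model with Unsloth's GGUF saver. Sketch only; uncomment to use.
+# model.save_pretrained_gguf(f"{OUTPUT_DIR}/gguf_model", tokenizer, quantization_method="q4_k_m")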