feat: added model downloads chapter and new available GGUF and quantized bitsandbytes models

jasperan · jasperan · commit 1e8b9dcee4b3 · 2025-03-07T22:26:18.000+01:00
diff --git a/agentic_rag/gradio_app.py b/agentic_rag/gradio_app.py
@@ -5,6 +5,8 @@
 import tempfile
 from dotenv import load_dotenv
 import yaml
+import torch
+import time
 
 from pdf_processor import PDFProcessor
 from web_processor import WebProcessor
@@ -89,19 +91,49 @@ def chat(message: str, history: List[List[str]], agent_type: str, use_cot: bool,
         # Skip analysis for General Knowledge or when using standard chat interface (not CoT)
         skip_analysis = collection == "General Knowledge" or not use_cot
         
+        # Parse agent type to determine model and quantization
+        quantization = None
+        model_name = None
+        
+        if "4-bit" in agent_type:
+            quantization = "4bit"
+            model_type = "Local (Mistral)"
+        elif "8-bit" in agent_type:
+            quantization = "8bit"
+            model_type = "Local (Mistral)"
+        elif "GGUF" in agent_type:
+            model_type = "GGUF"
+            # Extract model name from agent_type
+            if "Phi-4-mini" in agent_type:
+                model_name = "unsloth/Phi-4-mini-instruct-GGUF"
+            elif "Qwen_QwQ-32B" in agent_type:
+                model_name = "bartowski/Qwen_QwQ-32B-GGUF"
+            elif "TinyR1-32B" in agent_type:
+                model_name = "bartowski/qihoo360_TinyR1-32B-Preview-GGUF"
+        else:
+            model_type = agent_type
+        
         # Select appropriate agent and reinitialize with correct settings
-        if agent_type == "Local (Mistral)":
+        if "Local" in model_type or model_type == "GGUF":
             if not hf_token:
                 response_text = "Local agent not available. Please check your HuggingFace token configuration."
                 print(f"Error: {response_text}")
                 return history + [[message, response_text]]
-            agent = LocalRAGAgent(vector_store, use_cot=use_cot, collection=collection, skip_analysis=skip_analysis)
+            
+            # Use specified model_name for GGUF models, otherwise use default
+            if model_type == "GGUF" and model_name:
+                agent = LocalRAGAgent(vector_store, model_name=model_name, use_cot=use_cot, 
+                                     collection=collection, skip_analysis=skip_analysis)
+            else:
+                agent = LocalRAGAgent(vector_store, use_cot=use_cot, collection=collection, 
+                                     skip_analysis=skip_analysis, quantization=quantization)
         else:
             if not openai_key:
                 response_text = "OpenAI agent not available. Please check your OpenAI API key configuration."
                 print(f"Error: {response_text}")
                 return history + [[message, response_text]]
-            agent = RAGAgent(vector_store, openai_api_key=openai_key, use_cot=use_cot, collection=collection, skip_analysis=skip_analysis)
+            agent = RAGAgent(vector_store, openai_api_key=openai_key, use_cot=use_cot, 
+                            collection=collection, skip_analysis=skip_analysis)
         
         # Process query and get response
         print("Processing query...")
@@ -183,6 +215,83 @@ def create_interface():
         > **Note on Performance**: When using the Local (Mistral) model, initial loading can take 1-5 minutes, and each query may take 30-60 seconds to process depending on your hardware. OpenAI queries are typically much faster.
         """)
         
+        # Create model choices list for reuse
+        model_choices = []
+        if hf_token:
+            model_choices.extend([
+                "Local (Mistral)", 
+                "Local (Mistral) - 4-bit Quantized",
+                "Local (Mistral) - 8-bit Quantized",
+                "GGUF - Phi-4-mini-instruct",
+                "GGUF - Qwen_QwQ-32B",
+                "GGUF - TinyR1-32B-Preview"
+            ])
+        if openai_key:
+            model_choices.append("OpenAI")
+        
+        # Model Management Tab (First Tab)
+        with gr.Tab("Model Management"):
+            gr.Markdown("""
+            ## Model Management
+            
+            Download models in advance to prepare them for use in the chat interface.
+            This can help avoid delays when first using a model and ensure all models are properly downloaded.
+            
+            > **Note**: Some models may require accepting terms on the Hugging Face website before downloading.
+            > If you encounter a 401 error, please follow the link provided to accept the model terms.
+            """)
+            
+            with gr.Row():
+                with gr.Column():
+                    model_dropdown = gr.Dropdown(
+                        choices=model_choices,
+                        value=model_choices[0] if model_choices else None,
+                        label="Select Model to Download",
+                        interactive=True
+                    )
+                    download_button = gr.Button("Download Selected Model")
+                    model_status = gr.Textbox(
+                        label="Download Status",
+                        placeholder="Select a model and click Download to begin...",
+                        interactive=False
+                    )
+                
+                with gr.Column():
+                    gr.Markdown("""
+                    ### Model Information
+                    
+                    **Local (Mistral)**: The default Mistral-7B-Instruct-v0.2 model.
+                    - Size: ~14GB
+                    - VRAM Required: ~8GB
+                    - Good balance of quality and speed
+                    
+                    **Local (Mistral) - 4-bit Quantized**: 4-bit quantized version of Mistral-7B.
+                    - Size: ~4GB
+                    - VRAM Required: ~4GB
+                    - Faster inference with minimal quality loss
+                    
+                    **Local (Mistral) - 8-bit Quantized**: 8-bit quantized version of Mistral-7B.
+                    - Size: ~7GB
+                    - VRAM Required: ~6GB
+                    - Balance between quality and memory usage
+                    
+                    **GGUF - Phi-4-mini-instruct**: Microsoft's Phi-4-mini model in GGUF format.
+                    - Size: ~2-4GB
+                    - VRAM Required: Scales based on available VRAM
+                    - Efficient small model with good performance
+                    
+                    **GGUF - Qwen_QwQ-32B**: Qwen 32B model in GGUF format.
+                    - Size: ~20GB
+                    - VRAM Required: Scales based on available VRAM
+                    - High-quality large model
+                    
+                    **GGUF - TinyR1-32B-Preview**: Qihoo 360's TinyR1 32B model in GGUF format.
+                    - Size: ~20GB
+                    - VRAM Required: Scales based on available VRAM
+                    - High-quality large model
+                    """)
+        
+        # Document Processing Tab
         with gr.Tab("Document Processing"):
             with gr.Row():
                 with gr.Column():
@@ -203,9 +312,10 @@ def create_interface():
         with gr.Tab("Standard Chat Interface"):
             with gr.Row():
                 with gr.Column(scale=1):
+                    # Create model choices with quantization options
                     standard_agent_dropdown = gr.Dropdown(
-                        choices=["Local (Mistral)", "OpenAI"] if openai_key else ["Local (Mistral)"],
-                        value="Local (Mistral)",
+                        choices=model_choices,
+                        value=model_choices[0] if model_choices else None,
                         label="Select Agent"
                     )
                 with gr.Column(scale=1):
@@ -230,9 +340,10 @@ def create_interface():
         with gr.Tab("Chain of Thought Chat Interface"):
             with gr.Row():
                 with gr.Column(scale=1):
+                    # Create model choices with quantization options
                     cot_agent_dropdown = gr.Dropdown(
-                        choices=["Local (Mistral)", "OpenAI"] if openai_key else ["Local (Mistral)"],
-                        value="Local (Mistral)",
+                        choices=model_choices,
+                        value=model_choices[0] if model_choices else None,
                         label="Select Agent"
                     )
                 with gr.Column(scale=1):
@@ -260,6 +371,9 @@ def create_interface():
         url_button.click(process_url, inputs=[url_input], outputs=[url_output])
         repo_button.click(process_repo, inputs=[repo_input], outputs=[repo_output])
         
+        # Model download event handler
+        download_button.click(download_model, inputs=[model_dropdown], outputs=[model_status])
+        
         # Standard chat handlers
         standard_msg.submit(
             chat,
@@ -359,5 +473,123 @@ def main():
         inbrowser=True
     )
 
+def download_model(model_type: str) -> str:
+    """Download a model and return status message"""
+    try:
+        print(f"Downloading model: {model_type}")
+        
+        # Parse model type to determine model and quantization
+        quantization = None
+        model_name = "mistralai/Mistral-7B-Instruct-v0.2"  # Default model
+        
+        if "4-bit" in model_type:
+            quantization = "4bit"
+        elif "8-bit" in model_type:
+            quantization = "8bit"
+        elif "GGUF" in model_type:
+            # Extract model name from model_type
+            if "Phi-4-mini" in model_type:
+                model_name = "unsloth/Phi-4-mini-instruct-GGUF"
+            elif "Qwen_QwQ-32B" in model_type:
+                model_name = "bartowski/Qwen_QwQ-32B-GGUF"
+            elif "TinyR1-32B" in model_type:
+                model_name = "bartowski/qihoo360_TinyR1-32B-Preview-GGUF"
+        
+        # Check if HuggingFace token is available
+        if not hf_token:
+            return "❌ Error: HuggingFace token not found in config.yaml. Please add your token first."
+        
+        # Start download timer
+        start_time = time.time()
+        
+        # For GGUF models, use the GGUFModelHandler to download
+        if "GGUF" in model_type:
+            try:
+                from local_rag_agent import GGUFModelHandler
+                from huggingface_hub import list_repo_files
+                
+                # Extract repo_id
+                parts = model_name.split('/')
+                repo_id = '/'.join(parts[:2])
+                
+                # Check if model is gated
+                try:
+                    files = list_repo_files(repo_id, token=hf_token)
+                    gguf_files = [f for f in files if f.endswith('.gguf')]
+                    
+                    if not gguf_files:
+                        return f"❌ Error: No GGUF files found in repo: {repo_id}"
+                    
+                    # Download the model
+                    handler = GGUFModelHandler(model_name)
+                    
+                    # Calculate download time
+                    download_time = time.time() - start_time
+                    return f"✅ Successfully downloaded {model_type} in {download_time:.1f} seconds."
+                    
+                except Exception as e:
+                    if "401" in str(e):
+                        return f"❌ Error: This model is gated. Please accept the terms on the Hugging Face website: https://huggingface.co/{repo_id}"
+                    else:
+                        return f"❌ Error downloading model: {str(e)}"
+            
+            except ImportError:
+                return "❌ Error: llama-cpp-python not installed. Please install with: pip install llama-cpp-python"
+        
+        # For Transformers models, use the Transformers library
+        else:
+            try:
+                from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+                
+                # Download tokenizer first (smaller download to check access)
+                try:
+                    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
+                except Exception as e:
+                    if "401" in str(e):
+                        return f"❌ Error: This model is gated. Please accept the terms on the Hugging Face website: https://huggingface.co/{model_name}"
+                    else:
+                        return f"❌ Error downloading tokenizer: {str(e)}"
+                
+                # Set up model loading parameters
+                model_kwargs = {
+                    "token": hf_token,
+                    "device_map": None,  # Don't load on GPU for download only
+                }
+                
+                # Apply quantization if specified
+                if quantization == '4bit':
+                    try:
+                        quantization_config = BitsAndBytesConfig(
+                            load_in_4bit=True,
+                            bnb_4bit_compute_dtype=torch.float16,
+                            bnb_4bit_use_double_quant=True,
+                            bnb_4bit_quant_type="nf4"
+                        )
+                        model_kwargs["quantization_config"] = quantization_config
+                    except ImportError:
+                        return "❌ Error: bitsandbytes not installed. Please install with: pip install bitsandbytes>=0.41.0"
+                elif quantization == '8bit':
+                    try:
+                        quantization_config = BitsAndBytesConfig(load_in_8bit=True)
+                        model_kwargs["quantization_config"] = quantization_config
+                    except ImportError:
+                        return "❌ Error: bitsandbytes not installed. Please install with: pip install bitsandbytes>=0.41.0"
+                
+                # Download model (but don't load it fully to save memory)
+                AutoModelForCausalLM.from_pretrained(
+                    model_name,
+                    **model_kwargs
+                )
+                
+                # Calculate download time
+                download_time = time.time() - start_time
+                return f"✅ Successfully downloaded {model_type} in {download_time:.1f} seconds."
+                
+            except Exception as e:
+                return f"❌ Error downloading model: {str(e)}"
+    
+    except Exception as e:
+        return f"❌ Error: {str(e)}"
+
 if __name__ == "__main__":
     main()