Commit aab4bf4
Set ollama:qwen2 as default model throughout the application
1 parent af37695
2 files changed: 67 additions, 40 deletions

agentic_rag/gradio_app.py (56 additions, 28 deletions)

```diff
@@ -59,7 +59,15 @@ def load_config():
 openai_key = os.getenv("OPENAI_API_KEY")
 
 # Initialize agents with use_cot=True to ensure CoT is available
-local_agent = LocalRAGAgent(vector_store, use_cot=True) if hf_token else None
+# Default to Ollama qwen2, fall back to Mistral if available
+try:
+    local_agent = LocalRAGAgent(vector_store, model_name="ollama:qwen2", use_cot=True)
+    print("Using Ollama qwen2 as default model")
+except Exception as e:
+    print(f"Could not initialize Ollama qwen2: {str(e)}")
+    local_agent = LocalRAGAgent(vector_store, use_cot=True) if hf_token else None
+    print("Falling back to Local Mistral model" if hf_token else "No local model available")
+
 openai_agent = RAGAgent(vector_store, openai_api_key=openai_key, use_cot=True) if openai_key else None
 
 def process_pdf(file: tempfile._TemporaryFileWrapper) -> str:
@@ -313,20 +321,23 @@ def create_interface():
     if openai_key:
         model_choices.append("OpenAI")
 
+    # Set default model to Ollama - qwen2
+    default_model = "Ollama - qwen2"
+
     # Model Management Tab (First Tab)
     with gr.Tab("Model Management"):
         gr.Markdown("""
         ## Model Management
 
         Download models in advance to prepare them for use in the chat interface.
 
-        ### Hugging Face Models (Default)
+        ### Hugging Face Models
 
-        The system uses Mistral-7B by default. For Hugging Face models (Mistral), you'll need a Hugging Face token in your config.yaml file.
+        For Hugging Face models (Mistral), you'll need a Hugging Face token in your config.yaml file.
 
-        ### Ollama Models (Alternative)
+        ### Ollama Models (Default)
 
-        Ollama models are available as alternatives. For Ollama models, this will pull the model using the Ollama client.
+        Ollama models are used by default. For Ollama models, this will pull the model using the Ollama client.
         Make sure Ollama is installed and running on your system.
         You can download Ollama from [ollama.com/download](https://ollama.com/download)
         """)
@@ -335,7 +346,7 @@ def create_interface():
         with gr.Column():
             model_dropdown = gr.Dropdown(
                 choices=model_choices,
-                value=model_choices[0] if model_choices else None,
+                value=default_model if default_model in model_choices else model_choices[0] if model_choices else None,
                 label="Select Model to Download",
                 interactive=True
             )
@@ -350,6 +361,21 @@ def create_interface():
         gr.Markdown("""
         ### Model Information
 
+        **Ollama - qwen2** (DEFAULT): Alibaba's Qwen2 model via Ollama.
+        - Size: ~4GB
+        - Requires Ollama to be installed and running
+        - High-quality model with good performance
+
+        **Ollama - llama3**: Meta's Llama 3 model via Ollama.
+        - Size: ~4GB
+        - Requires Ollama to be installed and running
+        - Excellent performance and quality
+
+        **Ollama - phi-3**: Microsoft's Phi-3 model via Ollama.
+        - Size: ~4GB
+        - Requires Ollama to be installed and running
+        - Efficient small model with good performance
+
         **Local (Mistral)**: The default Mistral-7B-Instruct-v0.2 model.
         - Size: ~14GB
         - VRAM Required: ~8GB
@@ -364,21 +390,6 @@ def create_interface():
         - Size: ~7GB
         - VRAM Required: ~6GB
         - Balance between quality and memory usage
-
-        **Ollama - llama3**: Meta's Llama 3 model via Ollama.
-        - Size: ~4GB
-        - Requires Ollama to be installed and running
-        - Excellent performance and quality
-
-        **Ollama - phi-3**: Microsoft's Phi-3 model via Ollama.
-        - Size: ~4GB
-        - Requires Ollama to be installed and running
-        - Efficient small model with good performance
-
-        **Ollama - qwen2**: Alibaba's Qwen2 model via Ollama.
-        - Size: ~4GB
-        - Requires Ollama to be installed and running
-        - High-quality model with good performance
         """)
 
     # Document Processing Tab
@@ -412,7 +423,7 @@ def create_interface():
         with gr.Column(scale=1):
             standard_agent_dropdown = gr.Dropdown(
                 choices=model_choices,
-                value=model_choices[0] if model_choices else None,
+                value=default_model if default_model in model_choices else model_choices[0] if model_choices else None,
                 label="Select Agent"
             )
         with gr.Column(scale=1):
@@ -441,7 +452,7 @@ def create_interface():
         with gr.Column(scale=1):
             cot_agent_dropdown = gr.Dropdown(
                 choices=model_choices,
-                value=model_choices[0] if model_choices else None,
+                value=default_model if default_model in model_choices else model_choices[0] if model_choices else None,
                 label="Select Agent"
             )
         with gr.Column(scale=1):
@@ -536,7 +547,7 @@ def create_interface():
 
         2. **Standard Chat Interface**:
            - Quick responses without detailed reasoning steps
-           - Select your preferred agent (Local Mistral or OpenAI)
+           - Select your preferred agent (Ollama qwen2 by default)
            - Select which knowledge collection to query:
              - **PDF Collection**: Always searches PDF documents
              - **Repository Collection**: Always searches code repositories
@@ -551,19 +562,36 @@ def create_interface():
            - Same collection selection options as the Standard Chat Interface
 
         4. **Performance Expectations**:
+           - **Ollama models**: Typically faster inference, default is qwen2
            - **Local (Mistral) model**: Initial loading takes 1-5 minutes, each query takes 30-60 seconds
-           - **OpenAI model**: Much faster responses, typically a few seconds per query
-           - Chain of Thought reasoning takes longer for both models
+           - **OpenAI model**: Fast responses, typically a few seconds per query
+           - Chain of Thought reasoning takes longer for all models
 
-        Note: OpenAI agent requires an API key in `.env` file
+        Note: The interface will automatically detect available models based on your configuration:
+        - Ollama models are the default option (requires Ollama to be installed and running)
+        - Local Mistral model requires HuggingFace token in `config.yaml` (fallback option)
+        - OpenAI model requires API key in `.env` file
         """)
 
     return interface
 
 def main():
     # Check configuration
+    try:
+        import ollama
+        try:
+            # Check if Ollama is running and qwen2 is available
+            models = ollama.list().models
+            available_models = [model.model for model in models]
+            if "qwen2" not in available_models and "qwen2:latest" not in available_models:
+                print("⚠️ Warning: Ollama is running but qwen2 model is not available. Please run 'ollama pull qwen2' or download through the interface.")
+        except Exception:
+            print("⚠️ Warning: Ollama is installed but not running or encountered an error. The default model may not work.")
+    except ImportError:
+        print("⚠️ Warning: Ollama package not installed. Please install with: pip install ollama")
+
     if not hf_token and not openai_key:
-        print("⚠️ Warning: Neither HuggingFace token nor OpenAI key found. Please configure at least one.")
+        print("⚠️ Warning: Neither HuggingFace token nor OpenAI key found. Using Ollama only.")
 
     # Launch interface
     interface = create_interface()
```
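Because the new default silently falls back to Mistral when Ollama is unreachable, it can be useful to pre-flight the qwen2 model before launching the app. Below is a minimal, hypothetical sketch built on the same `ollama` client calls the commit uses (`ollama.list()` and its `pull` counterpart); the helper name `ensure_qwen2` is illustrative, not part of the commit:

```python
# Hypothetical pre-flight check; assumes the ollama package (pip install ollama)
# and mirrors the availability check this commit adds to main().
import ollama

def ensure_qwen2() -> bool:
    """Return True if qwen2 is usable locally, pulling it first if necessary."""
    try:
        # ollama.list() raises if the Ollama server is not running.
        available = [m.model for m in ollama.list().models]
        if "qwen2" not in available and "qwen2:latest" not in available:
            print("qwen2 not found locally; pulling (~4GB)...")
            ollama.pull("qwen2")  # equivalent to running 'ollama pull qwen2'
        return True
    except Exception as e:
        # Server down or client misconfigured; gradio_app.py would fall back
        # to the Mistral model here if an HF token is configured.
        print(f"Ollama unavailable: {e}")
        return False

if __name__ == "__main__":
    ensure_qwen2()
```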

agentic_rag/local_rag_agent.py (11 additions, 12 deletions)

```diff
@@ -500,19 +500,18 @@ def _generate_general_response(self, query: str) -> Dict[str, Any]:
         }
 
 def main():
-    parser = argparse.ArgumentParser(description="Query documents using local Mistral model")
-    parser.add_argument("--query", required=True, help="Query to process")
-    parser.add_argument("--store-path", default="embeddings", help="Path to the vector store")
-    parser.add_argument("--model", default="mistralai/Mistral-7B-Instruct-v0.2", help="Model to use")
-    parser.add_argument("--quiet", action="store_true", help="Disable verbose logging")
-    parser.add_argument("--use-cot", action="store_true", help="Enable Chain of Thought reasoning")
-    parser.add_argument("--collection", choices=["PDF Collection", "Repository Collection", "General Knowledge", "Web Knowledge Base"],
-                        help="Specify which collection to query")
-    parser.add_argument("--skip-analysis", action="store_true", help="Skip query analysis step")
+    parser = argparse.ArgumentParser(description="Query documents using local LLM")
+    parser.add_argument("--query", required=True, help="Query to search for")
+    parser.add_argument("--embeddings", default="oracle", choices=["oracle", "chromadb"], help="Embeddings backend to use")
+    parser.add_argument("--model", default="ollama:qwen2", help="Model to use (default: ollama:qwen2)")
+    parser.add_argument("--collection", help="Collection to search (PDF, Repository, General Knowledge)")
+    parser.add_argument("--use-cot", action="store_true", help="Use Chain of Thought reasoning")
+    parser.add_argument("--store-path", default="embeddings", help="Path to ChromaDB store")
+    parser.add_argument("--skip-analysis", action="store_true", help="Skip query analysis (not recommended)")
     parser.add_argument("--verbose", action="store_true", help="Show full content of sources")
-    parser.add_argument("--embeddings", choices=["oracle", "chromadb"], default="oracle",
-                        help="Select embeddings backend (default: oracle)")
-
+    parser.add_argument("--quiet", action="store_true", help="Disable verbose logging")
+    parser.add_argument("--quantization", choices=["4bit", "8bit"], help="Quantization method (4bit or 8bit)")
+
     args = parser.parse_args()
 
     # Set logging level based on quiet flag
```