diff --git a/agentic_rag/gradio_app.py b/agentic_rag/gradio_app.py index 50dd22e..7c39376 100644 --- a/agentic_rag/gradio_app.py +++ b/agentic_rag/gradio_app.py @@ -140,20 +140,22 @@ def chat(message: str, history: List[List[str]], agent_type: str, use_cot: bool, elif "8-bit" in agent_type: quantization = "8bit" model_type = "Local (Mistral)" - elif "Ollama" in agent_type: - model_type = "Ollama" - # Extract model name from agent_type and use correct Ollama model names - if "llama3" in agent_type.lower(): - model_name = "ollama:llama3" - elif "phi-3" in agent_type.lower(): - model_name = "ollama:phi3" - elif "qwen2" in agent_type.lower(): - model_name = "ollama:qwen2" + elif agent_type == "openai": + model_type = "OpenAI" else: - model_type = agent_type + # All other models are treated as Ollama models + model_type = "Ollama" + model_name = agent_type # Select appropriate agent and reinitialize with correct settings - if "Local" in model_type: + if model_type == "OpenAI": + if not openai_key: + response_text = "OpenAI key not found. Please check your config." + print(f"Error: {response_text}") + return history + [[message, response_text]] + agent = RAGAgent(vector_store, openai_api_key=openai_key, use_cot=use_cot, + collection=collection, skip_analysis=skip_analysis) + elif model_type == "Local (Mistral)": # For HF models, we need the token if not hf_token: response_text = "Local agent not available. Please check your HuggingFace token configuration." @@ -161,32 +163,14 @@ def chat(message: str, history: List[List[str]], agent_type: str, use_cot: bool, return history + [[message, response_text]] agent = LocalRAGAgent(vector_store, use_cot=use_cot, collection=collection, skip_analysis=skip_analysis, quantization=quantization) - elif model_type == "Ollama": - # For Ollama models - if model_name: - try: - agent = LocalRAGAgent(vector_store, model_name=model_name, use_cot=use_cot, - collection=collection, skip_analysis=skip_analysis) - except Exception as e: - response_text = f"Error initializing Ollama model: {str(e)}. Falling back to Local Mistral." - print(f"Error: {response_text}") - # Fall back to Mistral if Ollama fails - if hf_token: - agent = LocalRAGAgent(vector_store, use_cot=use_cot, collection=collection, - skip_analysis=skip_analysis) - else: - return history + [[message, "Local Mistral agent not available for fallback. Please check your HuggingFace token configuration."]] - else: - response_text = "Ollama model not specified correctly." - print(f"Error: {response_text}") - return history + [[message, response_text]] - else: - if not openai_key: - response_text = "OpenAI agent not available. Please check your OpenAI API key configuration." 
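The net effect of this hunk is a flatter dispatch in `chat()`: `"openai"` maps to `RAGAgent`, the `"Local (Mistral)"` variants map to the Hugging Face path, and every other dropdown value is passed to `LocalRAGAgent` verbatim as an Ollama model name. A minimal standalone sketch of that routing, using only the constructor signatures visible in this diff (the helper name `build_agent` and the import paths are illustrative, not part of the codebase):

```python
# Illustrative sketch of the new routing in chat(); module paths are assumed.
from local_rag_agent import LocalRAGAgent
from rag_agent import RAGAgent  # hypothetical module name for the OpenAI-backed agent


def build_agent(agent_type, vector_store, openai_key, use_cot, collection, skip_analysis):
    # Quantization only applies to the local Hugging Face (Mistral) variants.
    quantization = "4bit" if "4-bit" in agent_type else "8bit" if "8-bit" in agent_type else None

    if agent_type == "openai":
        if not openai_key:
            # The real handler returns this message into the chat history instead of raising.
            raise ValueError("OpenAI key not found. Please check your config.")
        return RAGAgent(vector_store, openai_api_key=openai_key, use_cot=use_cot,
                        collection=collection, skip_analysis=skip_analysis)

    if "Local" in agent_type:
        # Hugging Face Mistral path; requires HUGGING_FACE_HUB_TOKEN in config.yaml.
        return LocalRAGAgent(vector_store, use_cot=use_cot, collection=collection,
                             skip_analysis=skip_analysis, quantization=quantization)

    # Anything else ("qwq", "phi4", "deepseek-r1", ...) is treated as an Ollama model name.
    return LocalRAGAgent(vector_store, model_name=agent_type, use_cot=use_cot,
                         collection=collection, skip_analysis=skip_analysis)
```

Note that Ollama initialization errors are no longer swallowed by a Mistral fallback; the new code simply reports them back into the chat history.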
+ else: # Ollama models + try: + agent = LocalRAGAgent(vector_store, model_name=model_name, use_cot=use_cot, + collection=collection, skip_analysis=skip_analysis) + except Exception as e: + response_text = f"Error initializing Ollama model: {str(e)}" print(f"Error: {response_text}") return history + [[message, response_text]] - agent = RAGAgent(vector_store, openai_api_key=openai_key, use_cot=use_cot, - collection=collection, skip_analysis=skip_analysis) # Process query and get response print("Processing query...") @@ -305,50 +289,37 @@ def create_interface(): # Create model choices list for reuse model_choices = [] - # HF models first if token is available - if hf_token: - model_choices.extend([ - "Local (Mistral)", - "Local (Mistral) - 4-bit Quantized", - "Local (Mistral) - 8-bit Quantized", - ]) - # Then Ollama models (don't require HF token) + # Only Ollama models (no more local Mistral deployments) model_choices.extend([ - "Ollama - llama3", - "Ollama - phi-3", - "Ollama - qwen2" + "qwq", + "gemma3", + "llama3.3", + "phi4", + "mistral", + "llava", + "phi3", + "deepseek-r1" ]) if openai_key: - model_choices.append("OpenAI") + model_choices.append("openai") - # Set default model to Ollama - qwen2 - default_model = "Ollama - qwen2" + # Set default model to qwq + default_model = "qwq" # Model Management Tab (First Tab) with gr.Tab("Model Management"): gr.Markdown(""" - ## Model Management - - Download models in advance to prepare them for use in the chat interface. - - ### Hugging Face Models - - For Hugging Face models (Mistral), you'll need a Hugging Face token in your config.yaml file. - - ### Ollama Models (Default) - - Ollama models are used by default. For Ollama models, this will pull the model using the Ollama client. - Make sure Ollama is installed and running on your system. - You can download Ollama from [ollama.com/download](https://ollama.com/download) + ## Model Selection + Choose your preferred model for the conversation. """) with gr.Row(): with gr.Column(): model_dropdown = gr.Dropdown( choices=model_choices, - value=default_model if default_model in model_choices else model_choices[0] if model_choices else None, - label="Select Model to Download", - interactive=True + value=default_model, + label="Select Model", + info="Choose the model to use for the conversation" ) download_button = gr.Button("Download Selected Model") model_status = gr.Textbox( @@ -356,41 +327,24 @@ def create_interface(): placeholder="Select a model and click Download to begin...", interactive=False ) - - with gr.Column(): - gr.Markdown(""" - ### Model Information - - **Ollama - qwen2** (DEFAULT): Alibaba's Qwen2 model via Ollama. - - Size: ~4GB - - Requires Ollama to be installed and running - - High-quality model with good performance - - **Ollama - llama3**: Meta's Llama 3 model via Ollama. - - Size: ~4GB - - Requires Ollama to be installed and running - - Excellent performance and quality - - **Ollama - phi-3**: Microsoft's Phi-3 model via Ollama. - - Size: ~4GB - - Requires Ollama to be installed and running - - Efficient small model with good performance - - **Local (Mistral)**: The default Mistral-7B-Instruct-v0.2 model. - - Size: ~14GB - - VRAM Required: ~8GB - - Good balance of quality and speed - - **Local (Mistral) - 4-bit Quantized**: 4-bit quantized version of Mistral-7B. - - Size: ~4GB - - VRAM Required: ~4GB - - Faster inference with minimal quality loss - - **Local (Mistral) - 8-bit Quantized**: 8-bit quantized version of Mistral-7B. 
- - Size: ~7GB - - VRAM Required: ~6GB - - Balance between quality and memory usage - """) + + # Add model FAQ section + gr.Markdown(""" + ## Model FAQ + + | Model | Parameters | Size | Download Command | + |-------|------------|------|------------------| + | qwq | 32B | 20GB | qwq:latest | + | gemma3 | 4B | 3.3GB | gemma3:latest | + | llama3.3 | 70B | 43GB | llama3.3:latest | + | phi4 | 14B | 9.1GB | phi4:latest | + | mistral | 7B | 4.1GB | mistral:latest | + | llava | 7B | 4.5GB | llava:latest | + | phi3 | 4B | 4.0GB | phi3:latest | + | deepseek-r1 | 7B | 4.7GB | deepseek-r1:latest | + + Note: All models are available through Ollama. Make sure Ollama is running on your system. + """) # Document Processing Tab with gr.Tab("Document Processing"): @@ -580,13 +534,30 @@ def main(): try: import ollama try: - # Check if Ollama is running and qwen2 is available + # Check if Ollama is running and list available models models = ollama.list().models available_models = [model.model for model in models] - if "qwen2" not in available_models and "qwen2:latest" not in available_models: - print("⚠️ Warning: Ollama is running but qwen2 model is not available. Please run 'ollama pull qwen2' or download through the interface.") - except Exception: - print("⚠️ Warning: Ollama is installed but not running or encountered an error. The default model may not work.") + + # Check if any default models are available + if "qwen2" not in available_models and "qwen2:latest" not in available_models and \ + "llama3" not in available_models and "llama3:latest" not in available_models and \ + "phi3" not in available_models and "phi3:latest" not in available_models: + print("⚠️ Warning: Ollama is running but no default models (qwen2, llama3, phi3) are available.") + print("Please download a model through the Model Management tab or run:") + print(" ollama pull qwen2") + print(" ollama pull llama3") + print(" ollama pull phi3") + else: + available_default_models = [] + for model in ["qwen2", "llama3", "phi3"]: + if model in available_models or f"{model}:latest" in available_models: + available_default_models.append(model) + + print(f"✅ Ollama is running with available default models: {', '.join(available_default_models)}") + print(f"All available models: {', '.join(available_models)}") + except Exception as e: + print(f"⚠️ Warning: Ollama is installed but not running or encountered an error: {str(e)}") + print("Please start Ollama before using the interface.") except ImportError: print("⚠️ Warning: Ollama package not installed. Please install with: pip install ollama") @@ -674,17 +645,11 @@ def download_model(model_type: str) -> str: except Exception as e: return f"❌ Error downloading model: {str(e)}" - - elif "Ollama" in model_type: + # all ollama models + else: # Extract model name from model_type - if "llama3" in model_type.lower(): - model_name = "llama3" - elif "phi-3" in model_type.lower(): - model_name = "phi3" - elif "qwen2" in model_type.lower(): - model_name = "qwen2" - else: - return "❌ Error: Unknown Ollama model type" + # Remove the 'Ollama - ' prefix and any leading/trailing whitespace + model_name = model_type.replace("Ollama - ", "").strip() # Use Ollama to pull the model try: @@ -732,8 +697,6 @@ def download_model(model_type: str) -> str: return "❌ Error: Could not connect to Ollama. Please make sure Ollama is installed and running." 
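Both the startup check in `main()` and the Model Management download path now drive the `ollama` Python client directly. A hedged sketch of that check-and-pull pattern, assuming the same `ollama.list()` / `ollama.pull()` calls already used elsewhere in this diff (the helper name `ensure_ollama_model` is illustrative, not project code):

```python
# Sketch only: mirrors the availability check and pull performed by the app.
import ollama


def ensure_ollama_model(model_name: str) -> str:
    """Return a usable model tag, pulling the model through Ollama if it is missing."""
    available = [m.model for m in ollama.list().models]
    # Ollama lists pulled models with an explicit tag, e.g. "qwq:latest".
    for candidate in (model_name, f"{model_name}:latest"):
        if candidate in available:
            return candidate
    print(f"Model '{model_name}' not found locally, pulling it now...")
    ollama.pull(model_name)  # blocks until the download finishes
    return model_name if ":" in model_name else f"{model_name}:latest"


# Example: make sure the default dropdown choice is ready before the UI starts.
if __name__ == "__main__":
    print(ensure_ollama_model("qwq"))
```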
except Exception as e: return f"❌ Error pulling Ollama model: {str(e)}" - else: - return "❌ Error: Unknown model type" except Exception as e: return f"❌ Error: {str(e)}" diff --git a/agentic_rag/k8s/MINIKUBE.md b/agentic_rag/k8s/MINIKUBE.md new file mode 100644 index 0000000..cd48157 --- /dev/null +++ b/agentic_rag/k8s/MINIKUBE.md @@ -0,0 +1,210 @@ +# Quick Start with Minikube + +This guide provides instructions for deploying the Agentic RAG system on Minikube for local testing. + +## Prerequisites + +1. [Minikube](https://minikube.sigs.k8s.io/docs/start/) installed +2. [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/) installed +3. Docker or another container runtime installed +4. NVIDIA GPU with appropriate drivers installed +5. [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) installed + +## Step 1: Start Minikube with GPU Support + +Start Minikube with sufficient resources and GPU support: + +```bash +# For Linux +minikube start --cpus 4 --memory 16384 --disk-size 50g --driver=kvm2 --gpu + +# For Windows +minikube start --cpus 4 --memory 16384 --disk-size 50g --driver=hyperv --gpu + +# For macOS (Note: GPU passthrough is limited on macOS) +minikube start --cpus 4 --memory 16384 --disk-size 50g --driver=hyperkit +``` + +Verify that Minikube is running: + +```bash +minikube status +``` + +## Step 2: Install NVIDIA Device Plugin + +Install the NVIDIA device plugin to enable GPU support in Kubernetes: + +```bash +# Apply the NVIDIA device plugin +kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.14.0/nvidia-device-plugin.yml +``` + +Verify that the GPU is available in the cluster: + +```bash +kubectl get nodes "-o=custom-columns=NAME:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu" +``` + +## Step 3: Clone the Repository + +Clone the repository containing the Kubernetes manifests: + +```bash +git clone https://github.com/devrel/devrel-labs.git +cd devrel-labs/agentic_rag/k8s +``` + +## Step 4: Deploy the Application + +The deployment includes both Hugging Face models and Ollama for inference. The Hugging Face token is optional but recommended for using Mistral models. + +### Option 1: Deploy without a Hugging Face token (Ollama models only) + +```bash +# Create a namespace +kubectl create namespace agentic-rag + +# Create an empty ConfigMap +cat <`. + +## Troubleshooting + +### Pod Stuck in Pending State + +If the pod is stuck in Pending state, check the events: + +```bash +kubectl describe pod -l app=agentic-rag -n agentic-rag +``` + +Common issues include: + +1. **Insufficient resources**: Ensure your node pool has enough resources +2. **GPU not available**: Ensure your node pool has GPU-enabled nodes +3. **Image pull issues**: Check if the image can be pulled from the registry + +### GPU-Related Issues + +If you encounter GPU-related issues: + +1. **Check GPU availability in OKE**: + ```bash + kubectl get nodes "-o=custom-columns=NAME:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu" + ``` + +2. **Verify NVIDIA device plugin is running**: + ```bash + kubectl get pods -n kube-system | grep nvidia-device-plugin + ``` + +3. **Check if GPU is available to the pod**: + ```bash + kubectl describe pod -l app=agentic-rag -n agentic-rag | grep -A5 'Allocated resources' + ``` + +4. 
**Check NVIDIA driver installation on the node**: + ```bash + # Get the node name + NODE_NAME=$(kubectl get pod -l app=agentic-rag -n agentic-rag -o jsonpath='{.items[0].spec.nodeName}') + + # Create a debug pod on the node + kubectl debug node/$NODE_NAME -it --image=ubuntu + + # Inside the debug pod + chroot /host + nvidia-smi + ``` + +### Load Balancer Issues + +If the load balancer is not provisioning or not accessible: + +1. Check the service status: + ```bash + kubectl get service agentic-rag -n agentic-rag + ``` + +2. Check OCI Console for load balancer status and configuration + +3. Ensure your VCN security lists allow traffic to the load balancer + +## Scaling + +To scale the deployment: + +```bash +kubectl scale deployment agentic-rag -n agentic-rag --replicas=2 +``` + +Note: Each replica will require its own GPU. + +## Cleanup + +To remove all resources: + +```bash +kubectl delete namespace agentic-rag +``` + +To delete the OCI Load Balancer (if it's not automatically deleted): + +1. Navigate to the Load Balancers page in the OCI Console +2. Find the load balancer created for your service +3. Click "Delete" and confirm \ No newline at end of file diff --git a/agentic_rag/k8s/README_k8s.md b/agentic_rag/k8s/README_k8s.md new file mode 100644 index 0000000..a6bc5fd --- /dev/null +++ b/agentic_rag/k8s/README_k8s.md @@ -0,0 +1,95 @@ +# Kubernetes Deployment for Agentic RAG + +This directory contains Kubernetes manifests for deploying the Agentic RAG system. + +## Prerequisites + +- Kubernetes cluster (e.g., Oracle Kubernetes Engine, Minikube, or any other Kubernetes cluster) +- `kubectl` configured to access your cluster +- At least 8GB of RAM and 4 CPU cores available for the deployment + +## Deployment + +This deployment includes both Hugging Face models and Ollama for inference. The Hugging Face token is optional but recommended for using Mistral models. + +1. **Update the ConfigMap with your Hugging Face token** (optional but recommended): + + ```bash + # Edit the configmap.yaml file + nano local-deployment/configmap.yaml + + # Replace "your-huggingface-token" with your actual token + ``` + +2. **Deploy the application**: + + ```bash + kubectl apply -f local-deployment/configmap.yaml + kubectl apply -f local-deployment/deployment.yaml + kubectl apply -f local-deployment/service.yaml + ``` + +3. **Access the application**: + + If using LoadBalancer: + ```bash + kubectl get service agentic-rag + ``` + + If using NodePort: + ```bash + # Get the NodePort + kubectl get service agentic-rag + + # Access the application at http://: + ``` + +## Model Selection + +The deployment includes both Hugging Face models and Ollama models: + +- **Hugging Face Models**: Mistral-7B models (requires token in config.yaml) +- **Ollama Models**: llama3, phi3, and qwen2 (automatically downloaded during deployment) + +You can select which model to use from the Gradio interface after deployment. + +## Monitoring and Troubleshooting + +### Check pod status: + +```bash +kubectl get pods +``` + +### View logs: + +```bash +kubectl logs -f deployment/agentic-rag +``` + +### Shell into the pod: + +```bash +kubectl exec -it deployment/agentic-rag -- /bin/bash +``` + +## Scaling + +For production deployments, consider: + +1. Using persistent volumes for data storage +2. Adjusting resource requests and limits based on your workload +3. Setting up proper monitoring and logging +4. 
Implementing horizontal pod autoscaling + +## Cleanup + +To remove the deployment: + +```bash +kubectl delete -f local-deployment/ +``` + +## Future Work + +A distributed system deployment that separates the LLM inference system into its own service is planned for future releases. This will allow for better resource allocation and scaling in production environments. \ No newline at end of file diff --git a/agentic_rag/local_rag_agent.py b/agentic_rag/local_rag_agent.py index c26f99e..a64f4c7 100644 --- a/agentic_rag/local_rag_agent.py +++ b/agentic_rag/local_rag_agent.py @@ -57,8 +57,11 @@ def __init__(self, model_name: str): Args: model_name: Name of the Ollama model to use """ - # Remove the 'ollama:' prefix if present - self.model_name = model_name.replace("ollama:", "") if model_name.startswith("ollama:") else model_name + # Remove 'ollama:' prefix if present + if model_name and model_name.startswith("ollama:"): + model_name = model_name.replace("ollama:", "") + + self.model_name = model_name self._check_ollama_running() def _check_ollama_running(self): @@ -74,13 +77,11 @@ def _check_ollama_running(self): # Check if the requested model is available if self.model_name not in available_models: - # Try with :latest suffix - if f"{self.model_name}:latest" in available_models: - self.model_name = f"{self.model_name}:latest" - print(f"Using model with :latest suffix: {self.model_name}") - else: - print(f"Model '{self.model_name}' not found in Ollama. Available models: {', '.join(available_models)}") - print(f"You can pull it with: ollama pull {self.model_name}") + print(f"Model '{self.model_name}' not found in Ollama. Available models: {', '.join(available_models)}") + print(f"You can pull it with: ollama pull {self.model_name}") + raise ValueError(f"Model '{self.model_name}' not found in Ollama") + else: + print(f"Using Ollama model: {self.model_name}") except Exception as e: raise ConnectionError(f"Failed to connect to Ollama. Please make sure Ollama is running. 
Error: {str(e)}") @@ -92,6 +93,9 @@ def __call__(self, prompt, max_new_tokens=512, temperature=0.1, top_p=0.95, **kw try: import ollama + print(f"\nGenerating response with Ollama model: {self.model_name}") + print(f"Prompt: {prompt[:100]}...") # Print first 100 chars of prompt + # Generate text response = ollama.generate( model=self.model_name, @@ -103,6 +107,8 @@ def __call__(self, prompt, max_new_tokens=512, temperature=0.1, top_p=0.95, **kw } ) + print(f"Response generated successfully with {self.model_name}") + # Format result to match transformers pipeline output formatted_result = [{ "generated_text": response["response"] @@ -114,7 +120,7 @@ def __call__(self, prompt, max_new_tokens=512, temperature=0.1, top_p=0.95, **kw raise Exception(f"Failed to generate text with Ollama: {str(e)}") class LocalRAGAgent: - def __init__(self, vector_store: VectorStore = None, model_name: str = "mistralai/Mistral-7B-Instruct-v0.2", + def __init__(self, vector_store: VectorStore = None, model_name: str = None, use_cot: bool = False, collection: str = None, skip_analysis: bool = False, quantization: str = None, use_oracle_db: bool = True): """Initialize local RAG agent with vector store and local LLM @@ -128,6 +134,13 @@ def __init__(self, vector_store: VectorStore = None, model_name: str = "mistrala quantization: Quantization method to use (None, '4bit', '8bit') use_oracle_db: Whether to use Oracle DB for vector storage (if False, uses ChromaDB) """ + print(f"LocalRAGAgent init - model_name: {model_name}") + + # Set default model if none provided + if model_name is None: + model_name = "qwen2" + print(f"Using default model: {model_name}") + # Initialize vector store if not provided self.use_oracle_db = use_oracle_db and ORACLE_DB_AVAILABLE @@ -162,106 +175,71 @@ def __init__(self, vector_store: VectorStore = None, model_name: str = "mistrala self.collection = collection self.quantization = quantization self.model_name = model_name + print('Model Name after assignment:', self.model_name) # skip_analysis parameter kept for backward compatibility but no longer used - # Check if this is an Ollama model - self.is_ollama = model_name.startswith("ollama:") + # Check if this is an Ollama model (anything not Mistral is considered Ollama) + self.is_ollama = not (model_name and "mistral" in model_name.lower()) if self.is_ollama: - # Extract the actual model name from the prefix - ollama_model_name = model_name.replace("ollama:", "") + # Remove 'ollama:' prefix if present + if model_name and model_name.startswith("ollama:"): + model_name = model_name.replace("ollama:", "") + + # Always append :latest to Ollama model names + if not model_name.endswith(":latest"): + model_name = f"{model_name}:latest" # Load Ollama model print("\nLoading Ollama model...") - print(f"Model: {ollama_model_name}") + print(f"Model: {model_name}") print("Note: Make sure Ollama is running on your system.") # Initialize Ollama model handler - self.ollama_handler = OllamaModelHandler(ollama_model_name) + self.ollama_handler = OllamaModelHandler(model_name) # Create pipeline-like interface self.pipeline = self.ollama_handler - + print(f"Using Ollama model: {model_name}") else: - # Load HuggingFace token from config - try: - with open('config.yaml', 'r') as f: - config = yaml.safe_load(f) - token = config.get('HUGGING_FACE_HUB_TOKEN') - if not token: - raise ValueError("HUGGING_FACE_HUB_TOKEN not found in config.yaml") - except Exception as e: - raise Exception(f"Failed to load HuggingFace token from config.yaml: {str(e)}") - - # Load model 
and tokenizer - print("\nLoading model and tokenizer...") - print(f"Model: {model_name}") - if quantization: - print(f"Quantization: {quantization}") - print("Note: Initial loading and inference can take 1-5 minutes depending on your hardware.") - print("Subsequent queries will be faster but may still take 30-60 seconds per response.") - - # Check if CUDA is available and set appropriate dtype - if torch.cuda.is_available(): - print("CUDA is available. Using GPU acceleration.") - dtype = torch.float16 + # Only initialize Mistral if no model is specified + if not model_name: + print("\nLoading default model and tokenizer...") + print("Model: mistralai/Mistral-7B-Instruct-v0.2") + self.model_name = "mistralai/Mistral-7B-Instruct-v0.2" + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) + self.model = AutoModelForCausalLM.from_pretrained( + self.model_name, + device_map="auto", + torch_dtype=torch.float16, + load_in_8bit=quantization == "8bit", + load_in_4bit=quantization == "4bit" + ) + self.pipeline = pipeline( + "text-generation", + model=self.model, + tokenizer=self.tokenizer, + device_map="auto" + ) + print(f"Using default model: {self.model_name}") else: - print("CUDA is not available. Using CPU only (this will be slow).") - dtype = torch.float32 - - # Set up model loading parameters - model_kwargs = { - "torch_dtype": dtype, - "device_map": "auto", - "token": token, - "low_cpu_mem_usage": True, - "offload_folder": "offload" - } - - # Apply quantization if specified - if quantization == '4bit': - try: - from transformers import BitsAndBytesConfig - quantization_config = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_compute_dtype=torch.float16, - bnb_4bit_use_double_quant=True, - bnb_4bit_quant_type="nf4" - ) - model_kwargs["quantization_config"] = quantization_config - print("Using 4-bit quantization with bitsandbytes") - except ImportError: - print("Warning: bitsandbytes not installed. Falling back to standard loading.") - print("To use 4-bit quantization, install bitsandbytes: pip install bitsandbytes") - elif quantization == '8bit': - try: - from transformers import BitsAndBytesConfig - quantization_config = BitsAndBytesConfig(load_in_8bit=True) - model_kwargs["quantization_config"] = quantization_config - print("Using 8-bit quantization with bitsandbytes") - except ImportError: - print("Warning: bitsandbytes not installed. 
Falling back to standard loading.") - print("To use 8-bit quantization, install bitsandbytes: pip install bitsandbytes") - - # Load model with appropriate settings - self.model = AutoModelForCausalLM.from_pretrained( - model_name, - **model_kwargs - ) - self.tokenizer = AutoTokenizer.from_pretrained(model_name, token=token) - - # Create text generation pipeline with optimized settings - self.pipeline = pipeline( - "text-generation", - model=self.model, - tokenizer=self.tokenizer, - max_new_tokens=512, - do_sample=True, - temperature=0.1, - top_p=0.95, - device_map="auto" - ) - print("✓ Model loaded successfully") + print(f"\nUsing specified model: {model_name}") + self.model_name = model_name + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) + self.model = AutoModelForCausalLM.from_pretrained( + self.model_name, + device_map="auto", + torch_dtype=torch.float16, + load_in_8bit=quantization == "8bit", + load_in_4bit=quantization == "4bit" + ) + self.pipeline = pipeline( + "text-generation", + model=self.model, + tokenizer=self.tokenizer, + device_map="auto" + ) + print(f"Using specified model: {self.model_name}") # Create LLM wrapper self.llm = LocalLLM(self.pipeline) @@ -515,7 +493,7 @@ def main(): parser = argparse.ArgumentParser(description="Query documents using local LLM") parser.add_argument("--query", required=True, help="Query to search for") parser.add_argument("--embeddings", default="oracle", choices=["oracle", "chromadb"], help="Embeddings backend to use") - parser.add_argument("--model", default="ollama:qwen2", help="Model to use (default: ollama:qwen2)") + parser.add_argument("--model", default="qwen2", help="Model to use (default: qwen2)") parser.add_argument("--collection", help="Collection to search (PDF, Repository, General Knowledge)") parser.add_argument("--use-cot", action="store_true", help="Use Chain of Thought reasoning") parser.add_argument("--store-path", default="embeddings", help="Path to ChromaDB store") @@ -534,6 +512,7 @@ def main(): print("\nInitializing RAG agent...") print("=" * 50) + print(f"Using model: {args.model}") try: # Determine which vector store to use based on args.embeddings @@ -560,6 +539,7 @@ def main(): # Set use_oracle_db based on the actual store type use_oracle_db = args.embeddings == "oracle" and isinstance(store, OraDBVectorStore) + print(f"Creating LocalRAGAgent with model: {args.model}") agent = LocalRAGAgent( store, model_name=args.model,
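With these changes the model-name handling in `LocalRAGAgent` reduces to a small resolution rule: no name means `qwen2`, a name containing `mistral` takes the Hugging Face path, and everything else is normalized to an Ollama tag ending in `:latest`. A compact sketch of that rule as it stands in this diff (`resolve_model` is an illustrative helper, not part of the module):

```python
# Illustrative condensation of the model-name handling introduced above.
from typing import Optional, Tuple


def resolve_model(model_name: Optional[str]) -> Tuple[str, str]:
    """Return (backend, resolved_name) following the logic in LocalRAGAgent.__init__."""
    if model_name is None:
        model_name = "qwen2"                      # new default
    if "mistral" in model_name.lower():
        return "huggingface", model_name          # e.g. mistralai/Mistral-7B-Instruct-v0.2
    name = model_name.removeprefix("ollama:")     # legacy prefix is still accepted
    if not name.endswith(":latest"):
        name = f"{name}:latest"                   # Ollama tags are always pinned to :latest
    return "ollama", name


assert resolve_model(None) == ("ollama", "qwen2:latest")
assert resolve_model("phi4") == ("ollama", "phi4:latest")
assert resolve_model("mistralai/Mistral-7B-Instruct-v0.2")[0] == "huggingface"
```

From the command line this means something like `python local_rag_agent.py --query "..." --model qwen2` now works without the old `ollama:` prefix, since the `--model` default and the in-code default both resolve through the same rule.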