diff --git a/agentic_rag/gradio_app.py b/agentic_rag/gradio_app.py index 50dd22e..7c39376 100644 --- a/agentic_rag/gradio_app.py +++ b/agentic_rag/gradio_app.py @@ -140,20 +140,22 @@ def chat(message: str, history: List[List[str]], agent_type: str, use_cot: bool, elif "8-bit" in agent_type: quantization = "8bit" model_type = "Local (Mistral)" - elif "Ollama" in agent_type: - model_type = "Ollama" - # Extract model name from agent_type and use correct Ollama model names - if "llama3" in agent_type.lower(): - model_name = "ollama:llama3" - elif "phi-3" in agent_type.lower(): - model_name = "ollama:phi3" - elif "qwen2" in agent_type.lower(): - model_name = "ollama:qwen2" + elif agent_type == "openai": + model_type = "OpenAI" else: - model_type = agent_type + # All other models are treated as Ollama models + model_type = "Ollama" + model_name = agent_type # Select appropriate agent and reinitialize with correct settings - if "Local" in model_type: + if model_type == "OpenAI": + if not openai_key: + response_text = "OpenAI key not found. Please check your config." + print(f"Error: {response_text}") + return history + [[message, response_text]] + agent = RAGAgent(vector_store, openai_api_key=openai_key, use_cot=use_cot, + collection=collection, skip_analysis=skip_analysis) + elif model_type == "Local (Mistral)": # For HF models, we need the token if not hf_token: response_text = "Local agent not available. Please check your HuggingFace token configuration." @@ -161,32 +163,14 @@ def chat(message: str, history: List[List[str]], agent_type: str, use_cot: bool, return history + [[message, response_text]] agent = LocalRAGAgent(vector_store, use_cot=use_cot, collection=collection, skip_analysis=skip_analysis, quantization=quantization) - elif model_type == "Ollama": - # For Ollama models - if model_name: - try: - agent = LocalRAGAgent(vector_store, model_name=model_name, use_cot=use_cot, - collection=collection, skip_analysis=skip_analysis) - except Exception as e: - response_text = f"Error initializing Ollama model: {str(e)}. Falling back to Local Mistral." - print(f"Error: {response_text}") - # Fall back to Mistral if Ollama fails - if hf_token: - agent = LocalRAGAgent(vector_store, use_cot=use_cot, collection=collection, - skip_analysis=skip_analysis) - else: - return history + [[message, "Local Mistral agent not available for fallback. Please check your HuggingFace token configuration."]] - else: - response_text = "Ollama model not specified correctly." - print(f"Error: {response_text}") - return history + [[message, response_text]] - else: - if not openai_key: - response_text = "OpenAI agent not available. Please check your OpenAI API key configuration." 
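The net effect of this hunk is a flatter dispatch in `chat()`: `"openai"` maps to `RAGAgent`, the `"Local (Mistral)"` variants map to the Hugging Face path, and every other dropdown value is passed to `LocalRAGAgent` verbatim as an Ollama model name. A minimal standalone sketch of that routing, using only the constructor signatures visible in this diff (the helper name `build_agent` and the import paths are illustrative, not part of the codebase):

```python
# Illustrative sketch of the new routing in chat(); module paths are assumed.
from local_rag_agent import LocalRAGAgent
from rag_agent import RAGAgent  # hypothetical module name for the OpenAI-backed agent


def build_agent(agent_type, vector_store, openai_key, use_cot, collection, skip_analysis):
    # Quantization only applies to the local Hugging Face (Mistral) variants.
    quantization = "4bit" if "4-bit" in agent_type else "8bit" if "8-bit" in agent_type else None

    if agent_type == "openai":
        if not openai_key:
            # The real handler returns this message into the chat history instead of raising.
            raise ValueError("OpenAI key not found. Please check your config.")
        return RAGAgent(vector_store, openai_api_key=openai_key, use_cot=use_cot,
                        collection=collection, skip_analysis=skip_analysis)

    if "Local" in agent_type:
        # Hugging Face Mistral path; requires HUGGING_FACE_HUB_TOKEN in config.yaml.
        return LocalRAGAgent(vector_store, use_cot=use_cot, collection=collection,
                             skip_analysis=skip_analysis, quantization=quantization)

    # Anything else ("qwq", "phi4", "deepseek-r1", ...) is treated as an Ollama model name.
    return LocalRAGAgent(vector_store, model_name=agent_type, use_cot=use_cot,
                         collection=collection, skip_analysis=skip_analysis)
```

Note that Ollama initialization errors are no longer swallowed by a Mistral fallback; the new code simply reports them back into the chat history.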
+ else: # Ollama models + try: + agent = LocalRAGAgent(vector_store, model_name=model_name, use_cot=use_cot, + collection=collection, skip_analysis=skip_analysis) + except Exception as e: + response_text = f"Error initializing Ollama model: {str(e)}" print(f"Error: {response_text}") return history + [[message, response_text]] - agent = RAGAgent(vector_store, openai_api_key=openai_key, use_cot=use_cot, - collection=collection, skip_analysis=skip_analysis) # Process query and get response print("Processing query...") @@ -305,50 +289,37 @@ def create_interface(): # Create model choices list for reuse model_choices = [] - # HF models first if token is available - if hf_token: - model_choices.extend([ - "Local (Mistral)", - "Local (Mistral) - 4-bit Quantized", - "Local (Mistral) - 8-bit Quantized", - ]) - # Then Ollama models (don't require HF token) + # Only Ollama models (no more local Mistral deployments) model_choices.extend([ - "Ollama - llama3", - "Ollama - phi-3", - "Ollama - qwen2" + "qwq", + "gemma3", + "llama3.3", + "phi4", + "mistral", + "llava", + "phi3", + "deepseek-r1" ]) if openai_key: - model_choices.append("OpenAI") + model_choices.append("openai") - # Set default model to Ollama - qwen2 - default_model = "Ollama - qwen2" + # Set default model to qwq + default_model = "qwq" # Model Management Tab (First Tab) with gr.Tab("Model Management"): gr.Markdown(""" - ## Model Management - - Download models in advance to prepare them for use in the chat interface. - - ### Hugging Face Models - - For Hugging Face models (Mistral), you'll need a Hugging Face token in your config.yaml file. - - ### Ollama Models (Default) - - Ollama models are used by default. For Ollama models, this will pull the model using the Ollama client. - Make sure Ollama is installed and running on your system. - You can download Ollama from [ollama.com/download](https://ollama.com/download) + ## Model Selection + Choose your preferred model for the conversation. """) with gr.Row(): with gr.Column(): model_dropdown = gr.Dropdown( choices=model_choices, - value=default_model if default_model in model_choices else model_choices[0] if model_choices else None, - label="Select Model to Download", - interactive=True + value=default_model, + label="Select Model", + info="Choose the model to use for the conversation" ) download_button = gr.Button("Download Selected Model") model_status = gr.Textbox( @@ -356,41 +327,24 @@ def create_interface(): placeholder="Select a model and click Download to begin...", interactive=False ) - - with gr.Column(): - gr.Markdown(""" - ### Model Information - - **Ollama - qwen2** (DEFAULT): Alibaba's Qwen2 model via Ollama. - - Size: ~4GB - - Requires Ollama to be installed and running - - High-quality model with good performance - - **Ollama - llama3**: Meta's Llama 3 model via Ollama. - - Size: ~4GB - - Requires Ollama to be installed and running - - Excellent performance and quality - - **Ollama - phi-3**: Microsoft's Phi-3 model via Ollama. - - Size: ~4GB - - Requires Ollama to be installed and running - - Efficient small model with good performance - - **Local (Mistral)**: The default Mistral-7B-Instruct-v0.2 model. - - Size: ~14GB - - VRAM Required: ~8GB - - Good balance of quality and speed - - **Local (Mistral) - 4-bit Quantized**: 4-bit quantized version of Mistral-7B. - - Size: ~4GB - - VRAM Required: ~4GB - - Faster inference with minimal quality loss - - **Local (Mistral) - 8-bit Quantized**: 8-bit quantized version of Mistral-7B. 
- - Size: ~7GB - - VRAM Required: ~6GB - - Balance between quality and memory usage - """) + + # Add model FAQ section + gr.Markdown(""" + ## Model FAQ + + | Model | Parameters | Size | Download Command | + |-------|------------|------|------------------| + | qwq | 32B | 20GB | qwq:latest | + | gemma3 | 4B | 3.3GB | gemma3:latest | + | llama3.3 | 70B | 43GB | llama3.3:latest | + | phi4 | 14B | 9.1GB | phi4:latest | + | mistral | 7B | 4.1GB | mistral:latest | + | llava | 7B | 4.5GB | llava:latest | + | phi3 | 4B | 4.0GB | phi3:latest | + | deepseek-r1 | 7B | 4.7GB | deepseek-r1:latest | + + Note: All models are available through Ollama. Make sure Ollama is running on your system. + """) # Document Processing Tab with gr.Tab("Document Processing"): @@ -580,13 +534,30 @@ def main(): try: import ollama try: - # Check if Ollama is running and qwen2 is available + # Check if Ollama is running and list available models models = ollama.list().models available_models = [model.model for model in models] - if "qwen2" not in available_models and "qwen2:latest" not in available_models: - print("⚠️ Warning: Ollama is running but qwen2 model is not available. Please run 'ollama pull qwen2' or download through the interface.") - except Exception: - print("⚠️ Warning: Ollama is installed but not running or encountered an error. The default model may not work.") + + # Check if any default models are available + if "qwen2" not in available_models and "qwen2:latest" not in available_models and \ + "llama3" not in available_models and "llama3:latest" not in available_models and \ + "phi3" not in available_models and "phi3:latest" not in available_models: + print("⚠️ Warning: Ollama is running but no default models (qwen2, llama3, phi3) are available.") + print("Please download a model through the Model Management tab or run:") + print(" ollama pull qwen2") + print(" ollama pull llama3") + print(" ollama pull phi3") + else: + available_default_models = [] + for model in ["qwen2", "llama3", "phi3"]: + if model in available_models or f"{model}:latest" in available_models: + available_default_models.append(model) + + print(f"✅ Ollama is running with available default models: {', '.join(available_default_models)}") + print(f"All available models: {', '.join(available_models)}") + except Exception as e: + print(f"⚠️ Warning: Ollama is installed but not running or encountered an error: {str(e)}") + print("Please start Ollama before using the interface.") except ImportError: print("⚠️ Warning: Ollama package not installed. Please install with: pip install ollama") @@ -674,17 +645,11 @@ def download_model(model_type: str) -> str: except Exception as e: return f"❌ Error downloading model: {str(e)}" - - elif "Ollama" in model_type: + # all ollama models + else: # Extract model name from model_type - if "llama3" in model_type.lower(): - model_name = "llama3" - elif "phi-3" in model_type.lower(): - model_name = "phi3" - elif "qwen2" in model_type.lower(): - model_name = "qwen2" - else: - return "❌ Error: Unknown Ollama model type" + # Remove the 'Ollama - ' prefix and any leading/trailing whitespace + model_name = model_type.replace("Ollama - ", "").strip() # Use Ollama to pull the model try: @@ -732,8 +697,6 @@ def download_model(model_type: str) -> str: return "❌ Error: Could not connect to Ollama. Please make sure Ollama is installed and running." 
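Both the startup check in `main()` and the Model Management download path now drive the `ollama` Python client directly. A hedged sketch of that check-and-pull pattern, assuming the same `ollama.list()` / `ollama.pull()` calls already used elsewhere in this diff (the helper name `ensure_ollama_model` is illustrative, not project code):

```python
# Sketch only: mirrors the availability check and pull performed by the app.
import ollama


def ensure_ollama_model(model_name: str) -> str:
    """Return a usable model tag, pulling the model through Ollama if it is missing."""
    available = [m.model for m in ollama.list().models]
    # Ollama lists pulled models with an explicit tag, e.g. "qwq:latest".
    for candidate in (model_name, f"{model_name}:latest"):
        if candidate in available:
            return candidate
    print(f"Model '{model_name}' not found locally, pulling it now...")
    ollama.pull(model_name)  # blocks until the download finishes
    return model_name if ":" in model_name else f"{model_name}:latest"


# Example: make sure the default dropdown choice is ready before the UI starts.
if __name__ == "__main__":
    print(ensure_ollama_model("qwq"))
```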
except Exception as e: return f"❌ Error pulling Ollama model: {str(e)}" - else: - return "❌ Error: Unknown model type" except Exception as e: return f"❌ Error: {str(e)}" diff --git a/agentic_rag/k8s/MINIKUBE.md b/agentic_rag/k8s/MINIKUBE.md new file mode 100644 index 0000000..cd48157 --- /dev/null +++ b/agentic_rag/k8s/MINIKUBE.md @@ -0,0 +1,210 @@ +# Quick Start with Minikube + +This guide provides instructions for deploying the Agentic RAG system on Minikube for local testing. + +## Prerequisites + +1. [Minikube](https://minikube.sigs.k8s.io/docs/start/) installed +2. [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/) installed +3. Docker or another container runtime installed +4. NVIDIA GPU with appropriate drivers installed +5. [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) installed + +## Step 1: Start Minikube with GPU Support + +Start Minikube with sufficient resources and GPU support: + +```bash +# For Linux +minikube start --cpus 4 --memory 16384 --disk-size 50g --driver=kvm2 --gpu + +# For Windows +minikube start --cpus 4 --memory 16384 --disk-size 50g --driver=hyperv --gpu + +# For macOS (Note: GPU passthrough is limited on macOS) +minikube start --cpus 4 --memory 16384 --disk-size 50g --driver=hyperkit +``` + +Verify that Minikube is running: + +```bash +minikube status +``` + +## Step 2: Install NVIDIA Device Plugin + +Install the NVIDIA device plugin to enable GPU support in Kubernetes: + +```bash +# Apply the NVIDIA device plugin +kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.14.0/nvidia-device-plugin.yml +``` + +Verify that the GPU is available in the cluster: + +```bash +kubectl get nodes "-o=custom-columns=NAME:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu" +``` + +## Step 3: Clone the Repository + +Clone the repository containing the Kubernetes manifests: + +```bash +git clone https://github.com/devrel/devrel-labs.git +cd devrel-labs/agentic_rag/k8s +``` + +## Step 4: Deploy the Application + +The deployment includes both Hugging Face models and Ollama for inference. The Hugging Face token is optional but recommended for using Mistral models. + +### Option 1: Deploy without a Hugging Face token (Ollama models only) + +```bash +# Create a namespace +kubectl create namespace agentic-rag + +# Create an empty ConfigMap +cat <`. + +## Troubleshooting + +### Pod Stuck in Pending State + +If the pod is stuck in Pending state, check the events: + +```bash +kubectl describe pod -l app=agentic-rag -n agentic-rag +``` + +Common issues include: + +1. **Insufficient resources**: Ensure your node pool has enough resources +2. **GPU not available**: Ensure your node pool has GPU-enabled nodes +3. **Image pull issues**: Check if the image can be pulled from the registry + +### GPU-Related Issues + +If you encounter GPU-related issues: + +1. **Check GPU availability in OKE**: + ```bash + kubectl get nodes "-o=custom-columns=NAME:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu" + ``` + +2. **Verify NVIDIA device plugin is running**: + ```bash + kubectl get pods -n kube-system | grep nvidia-device-plugin + ``` + +3. **Check if GPU is available to the pod**: + ```bash + kubectl describe pod -l app=agentic-rag -n agentic-rag | grep -A5 'Allocated resources' + ``` + +4. 
**Check NVIDIA driver installation on the node**: + ```bash + # Get the node name + NODE_NAME=$(kubectl get pod -l app=agentic-rag -n agentic-rag -o jsonpath='{.items[0].spec.nodeName}') + + # Create a debug pod on the node + kubectl debug node/$NODE_NAME -it --image=ubuntu + + # Inside the debug pod + chroot /host + nvidia-smi + ``` + +### Load Balancer Issues + +If the load balancer is not provisioning or not accessible: + +1. Check the service status: + ```bash + kubectl get service agentic-rag -n agentic-rag + ``` + +2. Check OCI Console for load balancer status and configuration + +3. Ensure your VCN security lists allow traffic to the load balancer + +## Scaling + +To scale the deployment: + +```bash +kubectl scale deployment agentic-rag -n agentic-rag --replicas=2 +``` + +Note: Each replica will require its own GPU. + +## Cleanup + +To remove all resources: + +```bash +kubectl delete namespace agentic-rag +``` + +To delete the OCI Load Balancer (if it's not automatically deleted): + +1. Navigate to the Load Balancers page in the OCI Console +2. Find the load balancer created for your service +3. Click "Delete" and confirm \ No newline at end of file diff --git a/agentic_rag/k8s/README_k8s.md b/agentic_rag/k8s/README_k8s.md new file mode 100644 index 0000000..a6bc5fd --- /dev/null +++ b/agentic_rag/k8s/README_k8s.md @@ -0,0 +1,95 @@ +# Kubernetes Deployment for Agentic RAG + +This directory contains Kubernetes manifests for deploying the Agentic RAG system. + +## Prerequisites + +- Kubernetes cluster (e.g., Oracle Kubernetes Engine, Minikube, or any other Kubernetes cluster) +- `kubectl` configured to access your cluster +- At least 8GB of RAM and 4 CPU cores available for the deployment + +## Deployment + +This deployment includes both Hugging Face models and Ollama for inference. The Hugging Face token is optional but recommended for using Mistral models. + +1. **Update the ConfigMap with your Hugging Face token** (optional but recommended): + + ```bash + # Edit the configmap.yaml file + nano local-deployment/configmap.yaml + + # Replace "your-huggingface-token" with your actual token + ``` + +2. **Deploy the application**: + + ```bash + kubectl apply -f local-deployment/configmap.yaml + kubectl apply -f local-deployment/deployment.yaml + kubectl apply -f local-deployment/service.yaml + ``` + +3. **Access the application**: + + If using LoadBalancer: + ```bash + kubectl get service agentic-rag + ``` + + If using NodePort: + ```bash + # Get the NodePort + kubectl get service agentic-rag + + # Access the application at http://: + ``` + +## Model Selection + +The deployment includes both Hugging Face models and Ollama models: + +- **Hugging Face Models**: Mistral-7B models (requires token in config.yaml) +- **Ollama Models**: llama3, phi3, and qwen2 (automatically downloaded during deployment) + +You can select which model to use from the Gradio interface after deployment. + +## Monitoring and Troubleshooting + +### Check pod status: + +```bash +kubectl get pods +``` + +### View logs: + +```bash +kubectl logs -f deployment/agentic-rag +``` + +### Shell into the pod: + +```bash +kubectl exec -it deployment/agentic-rag -- /bin/bash +``` + +## Scaling + +For production deployments, consider: + +1. Using persistent volumes for data storage +2. Adjusting resource requests and limits based on your workload +3. Setting up proper monitoring and logging +4. 
Implementing horizontal pod autoscaling + +## Cleanup + +To remove the deployment: + +```bash +kubectl delete -f local-deployment/ +``` + +## Future Work + +A distributed system deployment that separates the LLM inference system into its own service is planned for future releases. This will allow for better resource allocation and scaling in production environments. \ No newline at end of file diff --git a/agentic_rag/local_rag_agent.py b/agentic_rag/local_rag_agent.py index c26f99e..a64f4c7 100644 --- a/agentic_rag/local_rag_agent.py +++ b/agentic_rag/local_rag_agent.py @@ -57,8 +57,11 @@ def __init__(self, model_name: str): Args: model_name: Name of the Ollama model to use """ - # Remove the 'ollama:' prefix if present - self.model_name = model_name.replace("ollama:", "") if model_name.startswith("ollama:") else model_name + # Remove 'ollama:' prefix if present + if model_name and model_name.startswith("ollama:"): + model_name = model_name.replace("ollama:", "") + + self.model_name = model_name self._check_ollama_running() def _check_ollama_running(self): @@ -74,13 +77,11 @@ def _check_ollama_running(self): # Check if the requested model is available if self.model_name not in available_models: - # Try with :latest suffix - if f"{self.model_name}:latest" in available_models: - self.model_name = f"{self.model_name}:latest" - print(f"Using model with :latest suffix: {self.model_name}") - else: - print(f"Model '{self.model_name}' not found in Ollama. Available models: {', '.join(available_models)}") - print(f"You can pull it with: ollama pull {self.model_name}") + print(f"Model '{self.model_name}' not found in Ollama. Available models: {', '.join(available_models)}") + print(f"You can pull it with: ollama pull {self.model_name}") + raise ValueError(f"Model '{self.model_name}' not found in Ollama") + else: + print(f"Using Ollama model: {self.model_name}") except Exception as e: raise ConnectionError(f"Failed to connect to Ollama. Please make sure Ollama is running. 
Error: {str(e)}") @@ -92,6 +93,9 @@ def __call__(self, prompt, max_new_tokens=512, temperature=0.1, top_p=0.95, **kw try: import ollama + print(f"\nGenerating response with Ollama model: {self.model_name}") + print(f"Prompt: {prompt[:100]}...") # Print first 100 chars of prompt + # Generate text response = ollama.generate( model=self.model_name, @@ -103,6 +107,8 @@ def __call__(self, prompt, max_new_tokens=512, temperature=0.1, top_p=0.95, **kw } ) + print(f"Response generated successfully with {self.model_name}") + # Format result to match transformers pipeline output formatted_result = [{ "generated_text": response["response"] @@ -114,7 +120,7 @@ def __call__(self, prompt, max_new_tokens=512, temperature=0.1, top_p=0.95, **kw raise Exception(f"Failed to generate text with Ollama: {str(e)}") class LocalRAGAgent: - def __init__(self, vector_store: VectorStore = None, model_name: str = "mistralai/Mistral-7B-Instruct-v0.2", + def __init__(self, vector_store: VectorStore = None, model_name: str = None, use_cot: bool = False, collection: str = None, skip_analysis: bool = False, quantization: str = None, use_oracle_db: bool = True): """Initialize local RAG agent with vector store and local LLM @@ -128,6 +134,13 @@ def __init__(self, vector_store: VectorStore = None, model_name: str = "mistrala quantization: Quantization method to use (None, '4bit', '8bit') use_oracle_db: Whether to use Oracle DB for vector storage (if False, uses ChromaDB) """ + print(f"LocalRAGAgent init - model_name: {model_name}") + + # Set default model if none provided + if model_name is None: + model_name = "qwen2" + print(f"Using default model: {model_name}") + # Initialize vector store if not provided self.use_oracle_db = use_oracle_db and ORACLE_DB_AVAILABLE @@ -162,106 +175,71 @@ def __init__(self, vector_store: VectorStore = None, model_name: str = "mistrala self.collection = collection self.quantization = quantization self.model_name = model_name + print('Model Name after assignment:', self.model_name) # skip_analysis parameter kept for backward compatibility but no longer used - # Check if this is an Ollama model - self.is_ollama = model_name.startswith("ollama:") + # Check if this is an Ollama model (anything not Mistral is considered Ollama) + self.is_ollama = not (model_name and "mistral" in model_name.lower()) if self.is_ollama: - # Extract the actual model name from the prefix - ollama_model_name = model_name.replace("ollama:", "") + # Remove 'ollama:' prefix if present + if model_name and model_name.startswith("ollama:"): + model_name = model_name.replace("ollama:", "") + + # Always append :latest to Ollama model names + if not model_name.endswith(":latest"): + model_name = f"{model_name}:latest" # Load Ollama model print("\nLoading Ollama model...") - print(f"Model: {ollama_model_name}") + print(f"Model: {model_name}") print("Note: Make sure Ollama is running on your system.") # Initialize Ollama model handler - self.ollama_handler = OllamaModelHandler(ollama_model_name) + self.ollama_handler = OllamaModelHandler(model_name) # Create pipeline-like interface self.pipeline = self.ollama_handler - + print(f"Using Ollama model: {model_name}") else: - # Load HuggingFace token from config - try: - with open('config.yaml', 'r') as f: - config = yaml.safe_load(f) - token = config.get('HUGGING_FACE_HUB_TOKEN') - if not token: - raise ValueError("HUGGING_FACE_HUB_TOKEN not found in config.yaml") - except Exception as e: - raise Exception(f"Failed to load HuggingFace token from config.yaml: {str(e)}") - - # Load model 
and tokenizer - print("\nLoading model and tokenizer...") - print(f"Model: {model_name}") - if quantization: - print(f"Quantization: {quantization}") - print("Note: Initial loading and inference can take 1-5 minutes depending on your hardware.") - print("Subsequent queries will be faster but may still take 30-60 seconds per response.") - - # Check if CUDA is available and set appropriate dtype - if torch.cuda.is_available(): - print("CUDA is available. Using GPU acceleration.") - dtype = torch.float16 + # Only initialize Mistral if no model is specified + if not model_name: + print("\nLoading default model and tokenizer...") + print("Model: mistralai/Mistral-7B-Instruct-v0.2") + self.model_name = "mistralai/Mistral-7B-Instruct-v0.2" + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) + self.model = AutoModelForCausalLM.from_pretrained( + self.model_name, + device_map="auto", + torch_dtype=torch.float16, + load_in_8bit=quantization == "8bit", + load_in_4bit=quantization == "4bit" + ) + self.pipeline = pipeline( + "text-generation", + model=self.model, + tokenizer=self.tokenizer, + device_map="auto" + ) + print(f"Using default model: {self.model_name}") else: - print("CUDA is not available. Using CPU only (this will be slow).") - dtype = torch.float32 - - # Set up model loading parameters - model_kwargs = { - "torch_dtype": dtype, - "device_map": "auto", - "token": token, - "low_cpu_mem_usage": True, - "offload_folder": "offload" - } - - # Apply quantization if specified - if quantization == '4bit': - try: - from transformers import BitsAndBytesConfig - quantization_config = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_compute_dtype=torch.float16, - bnb_4bit_use_double_quant=True, - bnb_4bit_quant_type="nf4" - ) - model_kwargs["quantization_config"] = quantization_config - print("Using 4-bit quantization with bitsandbytes") - except ImportError: - print("Warning: bitsandbytes not installed. Falling back to standard loading.") - print("To use 4-bit quantization, install bitsandbytes: pip install bitsandbytes") - elif quantization == '8bit': - try: - from transformers import BitsAndBytesConfig - quantization_config = BitsAndBytesConfig(load_in_8bit=True) - model_kwargs["quantization_config"] = quantization_config - print("Using 8-bit quantization with bitsandbytes") - except ImportError: - print("Warning: bitsandbytes not installed. 
Falling back to standard loading.") - print("To use 8-bit quantization, install bitsandbytes: pip install bitsandbytes") - - # Load model with appropriate settings - self.model = AutoModelForCausalLM.from_pretrained( - model_name, - **model_kwargs - ) - self.tokenizer = AutoTokenizer.from_pretrained(model_name, token=token) - - # Create text generation pipeline with optimized settings - self.pipeline = pipeline( - "text-generation", - model=self.model, - tokenizer=self.tokenizer, - max_new_tokens=512, - do_sample=True, - temperature=0.1, - top_p=0.95, - device_map="auto" - ) - print("✓ Model loaded successfully") + print(f"\nUsing specified model: {model_name}") + self.model_name = model_name + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) + self.model = AutoModelForCausalLM.from_pretrained( + self.model_name, + device_map="auto", + torch_dtype=torch.float16, + load_in_8bit=quantization == "8bit", + load_in_4bit=quantization == "4bit" + ) + self.pipeline = pipeline( + "text-generation", + model=self.model, + tokenizer=self.tokenizer, + device_map="auto" + ) + print(f"Using specified model: {self.model_name}") # Create LLM wrapper self.llm = LocalLLM(self.pipeline) @@ -515,7 +493,7 @@ def main(): parser = argparse.ArgumentParser(description="Query documents using local LLM") parser.add_argument("--query", required=True, help="Query to search for") parser.add_argument("--embeddings", default="oracle", choices=["oracle", "chromadb"], help="Embeddings backend to use") - parser.add_argument("--model", default="ollama:qwen2", help="Model to use (default: ollama:qwen2)") + parser.add_argument("--model", default="qwen2", help="Model to use (default: qwen2)") parser.add_argument("--collection", help="Collection to search (PDF, Repository, General Knowledge)") parser.add_argument("--use-cot", action="store_true", help="Use Chain of Thought reasoning") parser.add_argument("--store-path", default="embeddings", help="Path to ChromaDB store") @@ -534,6 +512,7 @@ def main(): print("\nInitializing RAG agent...") print("=" * 50) + print(f"Using model: {args.model}") try: # Determine which vector store to use based on args.embeddings @@ -560,6 +539,7 @@ def main(): # Set use_oracle_db based on the actual store type use_oracle_db = args.embeddings == "oracle" and isinstance(store, OraDBVectorStore) + print(f"Creating LocalRAGAgent with model: {args.model}") agent = LocalRAGAgent( store, model_name=args.model,
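With these changes the model-name handling in `LocalRAGAgent` reduces to a small resolution rule: no name means `qwen2`, a name containing `mistral` takes the Hugging Face path, and everything else is normalized to an Ollama tag ending in `:latest`. A compact sketch of that rule as it stands in this diff (`resolve_model` is an illustrative helper, not part of the module):

```python
# Illustrative condensation of the model-name handling introduced above.
from typing import Optional, Tuple


def resolve_model(model_name: Optional[str]) -> Tuple[str, str]:
    """Return (backend, resolved_name) following the logic in LocalRAGAgent.__init__."""
    if model_name is None:
        model_name = "qwen2"                      # new default
    if "mistral" in model_name.lower():
        return "huggingface", model_name          # e.g. mistralai/Mistral-7B-Instruct-v0.2
    name = model_name.removeprefix("ollama:")     # legacy prefix is still accepted
    if not name.endswith(":latest"):
        name = f"{name}:latest"                   # Ollama tags are always pinned to :latest
    return "ollama", name


assert resolve_model(None) == ("ollama", "qwen2:latest")
assert resolve_model("phi4") == ("ollama", "phi4:latest")
assert resolve_model("mistralai/Mistral-7B-Instruct-v0.2")[0] == "huggingface"
```

From the command line this means something like `python local_rag_agent.py --query "..." --model qwen2` now works without the old `ollama:` prefix, since the `--model` default and the in-code default both resolve through the same rule.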