From f193f3e535543ef5f460783efc376cc2b18754ca Mon Sep 17 00:00:00 2001 From: jasperan <23caj23@gmail.com> Date: Thu, 10 Apr 2025 19:12:46 +0200 Subject: [PATCH 1/6] Add Oracle DB 23ai integration with secure credential handling. Implements OraDBVectorStore as an alternative vector storage backend with ChromaDB fallback support. Adds credential loading from config.yaml, comprehensive documentation, and testing tools. --- agentic_rag/OraDBVectorStore.py | 385 ++++++++++++++++++++++ agentic_rag/config_example.yaml | 11 +- agentic_rag/docs/oracle_db_integration.md | 102 ++++++ agentic_rag/gradio_app.py | 36 +- agentic_rag/local_rag_agent.py | 64 +++- agentic_rag/requirements.txt | 4 +- agentic_rag/test_db_systems.sh | 31 ++ agentic_rag/test_oradb.py | 155 +++++++++ 8 files changed, 775 insertions(+), 13 deletions(-) create mode 100644 agentic_rag/OraDBVectorStore.py create mode 100644 agentic_rag/docs/oracle_db_integration.md create mode 100644 agentic_rag/test_db_systems.sh create mode 100644 agentic_rag/test_oradb.py diff --git a/agentic_rag/OraDBVectorStore.py b/agentic_rag/OraDBVectorStore.py new file mode 100644 index 0000000..56d3937 --- /dev/null +++ b/agentic_rag/OraDBVectorStore.py @@ -0,0 +1,385 @@ +from typing import List, Dict, Any +import json +import argparse +from sentence_transformers import SentenceTransformer +import array +import oracledb +import yaml +import os +from pathlib import Path + + +class OraDBVectorStore: + def __init__(self, persist_directory: str = "embeddings"): + """Initialize Oracle DB Vector Store + + Args: + persist_directory: Not used for Oracle DB connection but kept for compatibility + """ + # Load Oracle DB credentials from config.yaml + credentials = self._load_config() + + username = credentials.get("ORACLE_DB_USERNAME", "ADMIN") + password = credentials.get("ORACLE_DB_PASSWORD", "") + dsn = credentials.get("ORACLE_DB_DSN", "") + + if not password or not dsn: + raise ValueError("Oracle DB credentials not found in config.yaml. Please set ORACLE_DB_USERNAME, ORACLE_DB_PASSWORD, and ORACLE_DB_DSN.") + + # Connect to the database + try: + conn23c = oracledb.connect(user=username, password=password, dsn=dsn) + print("Oracle DB Connection successful!") + except Exception as e: + print("Oracle DB Connection failed!", e) + raise + + # Create a table to store the data + cursor = conn23c.cursor() + + self.connection = conn23c + self.cursor = cursor + + sql = """CREATE TABLE IF NOT EXISTS PDFCollection ( + id VARCHAR2(4000 BYTE) PRIMARY KEY, + text VARCHAR2(4000 BYTE), + metadata VARCHAR2(4000 BYTE), + embedding VECTOR + )""" + + cursor.execute(sql) + + sql = """CREATE TABLE IF NOT EXISTS WebCollection ( + id VARCHAR2(4000 BYTE) PRIMARY KEY, + text VARCHAR2(4000 BYTE), + metadata VARCHAR2(4000 BYTE), + embedding VECTOR + )""" + + cursor.execute(sql) + + sql = """CREATE TABLE IF NOT EXISTS RepoCollection ( + id VARCHAR2(4000 BYTE) PRIMARY KEY, + text VARCHAR2(4000 BYTE), + metadata VARCHAR2(4000 BYTE), + embedding VECTOR + )""" + + cursor.execute(sql) + + + sql = """CREATE TABLE IF NOT EXISTS GeneralCollection ( + id VARCHAR2(4000 BYTE) PRIMARY KEY, + text VARCHAR2(4000 BYTE), + metadata VARCHAR2(4000 BYTE), + embedding VECTOR + )""" + + cursor.execute(sql) + + self.encoder = SentenceTransformer('all-MiniLM-L12-v2') + + + def _load_config(self) -> Dict[str, str]: + """Load configuration from config.yaml""" + try: + config_path = Path("config.yaml") + if not config_path.exists(): + print("Warning: config.yaml not found. Using empty configuration.") + return {} + + with open(config_path, 'r') as f: + config = yaml.safe_load(f) + return config if config else {} + except Exception as e: + print(f"Warning: Error loading config: {str(e)}") + return {} + + def _sanitize_metadata(self, metadata: Dict) -> Dict: + """Sanitize metadata to ensure all values are valid types for Oracle DB""" + sanitized = {} + for key, value in metadata.items(): + if isinstance(value, (str, int, float, bool)): + sanitized[key] = value + elif isinstance(value, list): + # Convert list to string representation + sanitized[key] = str(value) + elif value is None: + # Replace None with empty string + sanitized[key] = "" + else: + # Convert any other type to string + sanitized[key] = str(value) + return sanitized + + def add_pdf_chunks(self, chunks: List[Dict[str, Any]], document_id: str): + """Add chunks from a PDF document to the vector store""" + if not chunks: + return + + # Prepare data for Oracle DB + texts = [chunk["text"] for chunk in chunks] + metadatas = [self._sanitize_metadata(chunk["metadata"]) for chunk in chunks] + ids = [f"{document_id}_{i}" for i in range(len(chunks))] + + # Encode all texts in a batch + embeddings = self.encoder.encode(texts, batch_size=32, show_progress_bar=True) + + table_name = "PDFCollection" + # Truncate the table + self.cursor.execute(f"truncate table {table_name}") + + # Insert embeddings into Oracle + for i, (docid, text, metadata, embedding) in enumerate(zip(ids, texts, metadatas, embeddings), start=1): + json_metadata = json.dumps(metadata) # Convert to JSON string + vector = array.array("f", embedding) + + self.cursor.execute( + "INSERT INTO PDFCollection (id, text, metadata, embedding) VALUES (:1, :2, :3, :4)", + (docid, text, json_metadata, vector) + ) + + self.connection.commit() + + def add_web_chunks(self, chunks: List[Dict[str, Any]], source_id: str): + """Add chunks from web content to the vector store""" + if not chunks: + return + + # Prepare data for Oracle DB + texts = [chunk["text"] for chunk in chunks] + metadatas = [self._sanitize_metadata(chunk["metadata"]) for chunk in chunks] + ids = [f"{source_id}_{i}" for i in range(len(chunks))] + + # Encode all texts in a batch + embeddings = self.encoder.encode(texts, batch_size=32, show_progress_bar=True) + + table_name = "WebCollection" + # No truncation for web chunks, just append new ones + + # Insert embeddings into Oracle + for i, (docid, text, metadata, embedding) in enumerate(zip(ids, texts, metadatas, embeddings), start=1): + json_metadata = json.dumps(metadata) # Convert to JSON string + vector = array.array("f", embedding) + + self.cursor.execute( + "INSERT INTO WebCollection (id, text, metadata, embedding) VALUES (:1, :2, :3, :4)", + (docid, text, json_metadata, vector) + ) + + self.connection.commit() + + def add_general_knowledge(self, chunks: List[Dict[str, Any]], source_id: str): + """Add general knowledge chunks to the vector store""" + if not chunks: + return + + # Prepare data for Oracle DB + texts = [chunk["text"] for chunk in chunks] + metadatas = [self._sanitize_metadata(chunk["metadata"]) for chunk in chunks] + ids = [f"{source_id}_{i}" for i in range(len(chunks))] + + # Encode all texts in a batch + embeddings = self.encoder.encode(texts, batch_size=32, show_progress_bar=True) + + table_name = "GeneralCollection" + + # Insert embeddings into Oracle + for i, (docid, text, metadata, embedding) in enumerate(zip(ids, texts, metadatas, embeddings), start=1): + json_metadata = json.dumps(metadata) # Convert to JSON string + vector = array.array("f", embedding) + + self.cursor.execute( + "INSERT INTO GeneralCollection (id, text, metadata, embedding) VALUES (:1, :2, :3, :4)", + (docid, text, json_metadata, vector) + ) + + self.connection.commit() + + def add_repo_chunks(self, chunks: List[Dict[str, Any]], document_id: str): + """Add chunks from a repository to the vector store""" + if not chunks: + return + + # Prepare data for Oracle DB + texts = [chunk["text"] for chunk in chunks] + metadatas = [self._sanitize_metadata(chunk["metadata"]) for chunk in chunks] + ids = [f"{document_id}_{i}" for i in range(len(chunks))] + + # Encode all texts in a batch + embeddings = self.encoder.encode(texts, batch_size=32, show_progress_bar=True) + + table_name = "RepoCollection" + + # Insert embeddings into Oracle + for i, (docid, text, metadata, embedding) in enumerate(zip(ids, texts, metadatas, embeddings), start=1): + json_metadata = json.dumps(metadata) # Convert to JSON string + vector = array.array("f", embedding) + + self.cursor.execute( + "INSERT INTO RepoCollection (id, text, metadata, embedding) VALUES (:1, :2, :3, :4)", + (docid, text, json_metadata, vector) + ) + + self.connection.commit() + + def query_pdf_collection(self, query: str, n_results: int = 3) -> List[Dict[str, Any]]: + """Query the PDF documents collection""" + # Generate Embeddings + embeddings = self.encoder.encode(query, batch_size=32, show_progress_bar=True) + new_vector = array.array("f", embeddings) + + sql = """ + SELECT Id, Text, MetaData, Embedding + FROM PDFCOLLECTION + ORDER BY VECTOR_DISTANCE(EMBEDDING, :nv, EUCLIDEAN) + FETCH FIRST 10 ROWS ONLY + """ + + self.cursor.execute(sql, {"nv": new_vector}) + + # Fetch all rows + rows = self.cursor.fetchall() + + # Format results + formatted_results = [] + for row in rows: + result = { + "content": row[1], + "metadata": json.loads(row[2]) if isinstance(row[2], str) else row[2] + } + formatted_results.append(result) + + return formatted_results + + def query_web_collection(self, query: str, n_results: int = 3) -> List[Dict[str, Any]]: + """Query the web documents collection""" + # Generate Embeddings + embeddings = self.encoder.encode(query, batch_size=32, show_progress_bar=True) + new_vector = array.array("f", embeddings) + + sql = """ + SELECT Id, Text, MetaData, Embedding + FROM WebCOLLECTION + ORDER BY VECTOR_DISTANCE(EMBEDDING, :nv, EUCLIDEAN) + FETCH FIRST 10 ROWS ONLY + """ + + self.cursor.execute(sql, {"nv": new_vector}) + + # Fetch all rows + rows = self.cursor.fetchall() + + # Format results + formatted_results = [] + for row in rows: + result = { + "content": row[1], + "metadata": json.loads(row[2]) if isinstance(row[2], str) else row[2] + } + formatted_results.append(result) + + return formatted_results + + def query_general_collection(self, query: str, n_results: int = 3) -> List[Dict[str, Any]]: + """Query the general knowledge collection""" + # Generate Embeddings + embeddings = self.encoder.encode(query, batch_size=32, show_progress_bar=True) + new_vector = array.array("f", embeddings) + + sql = """ + SELECT Id, Text, MetaData, Embedding + FROM GeneralCollection + ORDER BY VECTOR_DISTANCE(EMBEDDING, :nv, EUCLIDEAN) + FETCH FIRST 10 ROWS ONLY + """ + + self.cursor.execute(sql, {"nv": new_vector}) + + # Fetch all rows + rows = self.cursor.fetchall() + + # Format results + formatted_results = [] + for row in rows: + result = { + "content": row[1], + "metadata": json.loads(row[2]) if isinstance(row[2], str) else row[2] + } + formatted_results.append(result) + + return formatted_results + + def query_repo_collection(self, query: str, n_results: int = 3) -> List[Dict[str, Any]]: + """Query the repository documents collection""" + # Generate Embeddings + embeddings = self.encoder.encode(query, batch_size=32, show_progress_bar=True) + new_vector = array.array("f", embeddings) + + sql = """ + SELECT Id, Text, MetaData, Embedding + FROM RepoCOLLECTION + ORDER BY VECTOR_DISTANCE(EMBEDDING, :nv, EUCLIDEAN) + FETCH FIRST 10 ROWS ONLY + """ + + self.cursor.execute(sql, {"nv": new_vector}) + + # Fetch all rows + rows = self.cursor.fetchall() + + # Format results + formatted_results = [] + for row in rows: + result = { + "content": row[1], + "metadata": json.loads(row[2]) if isinstance(row[2], str) else row[2] + } + formatted_results.append(result) + + return formatted_results + +def main(): + parser = argparse.ArgumentParser(description="Manage Oracle DB vector store") + parser.add_argument("--add", help="JSON file containing chunks to add") + parser.add_argument("--add-web", help="JSON file containing web chunks to add") + parser.add_argument("--query", help="Query to search for") + + args = parser.parse_args() + store = OraDBVectorStore() + + if args.add: + with open(args.add, 'r', encoding='utf-8') as f: + chunks = json.load(f) + store.add_pdf_chunks(chunks, document_id=args.add) + print(f"✓ Added {len(chunks)} PDF chunks to Oracle DB vector store") + + if args.add_web: + with open(args.add_web, 'r', encoding='utf-8') as f: + chunks = json.load(f) + store.add_web_chunks(chunks, source_id=args.add_web) + print(f"✓ Added {len(chunks)} web chunks to Oracle DB vector store") + + if args.query: + # Query both collections + pdf_results = store.query_pdf_collection(args.query) + web_results = store.query_web_collection(args.query) + + print("\nPDF Results:") + print("-" * 50) + for result in pdf_results: + print(f"Content: {result['content'][:200]}...") + print(f"Source: {result['metadata'].get('source', 'Unknown')}") + print(f"Pages: {result['metadata'].get('page_numbers', [])}") + print("-" * 50) + + print("\nWeb Results:") + print("-" * 50) + for result in web_results: + print(f"Content: {result['content'][:200]}...") + print(f"Source: {result['metadata'].get('source', 'Unknown')}") + print(f"Title: {result['metadata'].get('title', 'Unknown')}") + print("-" * 50) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/agentic_rag/config_example.yaml b/agentic_rag/config_example.yaml index 8c77e47..368bfb5 100644 --- a/agentic_rag/config_example.yaml +++ b/agentic_rag/config_example.yaml @@ -1 +1,10 @@ -HUGGING_FACE_HUB_TOKEN: your_token_here \ No newline at end of file +HUGGING_FACE_HUB_TOKEN: your_token_here + +# Oracle DB Configuration +ORACLE_DB_USERNAME: ADMIN +ORACLE_DB_PASSWORD: your_password_here +ORACLE_DB_DSN: >- + (description= (retry_count=20)(retry_delay=3) + (address=(protocol=tcps)(port=1522)(host=your-oracle-db-host.com)) + (connect_data=(service_name=your-service-name)) + (security=(ssl_server_dn_match=yes))) \ No newline at end of file diff --git a/agentic_rag/docs/oracle_db_integration.md b/agentic_rag/docs/oracle_db_integration.md new file mode 100644 index 0000000..a318b49 --- /dev/null +++ b/agentic_rag/docs/oracle_db_integration.md @@ -0,0 +1,102 @@ +# Oracle DB 23ai Integration + +The Agentic RAG system now supports Oracle DB 23ai as a vector store backend, providing enhanced performance, scalability, and enterprise-grade database features. + +## Overview + +Oracle Database 23ai is used as the default vector storage system when available, with ChromaDB serving as a fallback option. This integration leverages Oracle's vector database capabilities for efficient semantic search and retrieval. + +## Requirements + +To use the Oracle DB integration, you need: + +1. **Oracle Database 23ai**: With vector extensions enabled +2. **Python Packages**: + - `oracledb`: For database connectivity + - `sentence-transformers`: For generating embeddings + +## Installation + +1. Install the required packages: + +```bash +pip install oracledb sentence-transformers +``` + +2. Configure your Oracle Database connection in `config.yaml`: + +```yaml +# Oracle DB Configuration +ORACLE_DB_USERNAME: ADMIN +ORACLE_DB_PASSWORD: your_password_here +ORACLE_DB_DSN: >- + (description= (retry_count=20)(retry_delay=3) + (address=(protocol=tcps)(port=1522)(host=your-oracle-db-host.com)) + (connect_data=(service_name=your-service-name)) + (security=(ssl_server_dn_match=yes))) +``` + +The system will automatically look for these credentials in your `config.yaml` file. If not found, it will raise an error and fall back to ChromaDB. + +## How It Works + +The system automatically determines which database to use: + +1. First tries to connect to Oracle DB 23ai +2. If connection succeeds, uses Oracle for all vector operations +3. If Oracle DB is unavailable, falls back to ChromaDB + +## Database Structure + +The Oracle DB integration creates the following tables: + +- `PDFCollection`: Stores chunks from PDF documents +- `WebCollection`: Stores chunks from web content +- `RepoCollection`: Stores chunks from code repositories +- `GeneralCollection`: Stores general knowledge chunks + +Each table has the following structure: +- `id`: Primary key identifier +- `text`: The text content of the chunk +- `metadata`: JSON string containing metadata (source, page, etc.) +- `embedding`: Vector representation of the text + +## Testing + +You can test the Oracle DB integration using: + +```bash +python test_oradb.py +``` + +Or test both systems using: + +```bash +./test_db_systems.sh +``` + +## Switching Between Databases + +You can force the system to use ChromaDB instead of Oracle DB by setting the `use_oracle_db` parameter to `False`: + +```python +agent = LocalRAGAgent(use_oracle_db=False) +``` + +## Gradio Interface + +The Gradio web interface displays which database system is active at the top of the page: + +- Green banner: Oracle DB 23ai is active +- Red banner: ChromaDB is being used (Oracle DB not available) + +## Troubleshooting + +If you encounter database connection issues: + +1. Verify your Oracle DB credentials and connection string +2. Check that the Oracle DB 23ai instance is running +3. Ensure you have the required Python packages installed +4. Check network connectivity to the database server + +If Oracle DB connection fails, the system will automatically fall back to ChromaDB without requiring any user intervention. \ No newline at end of file diff --git a/agentic_rag/gradio_app.py b/agentic_rag/gradio_app.py index 1aa74dd..8205743 100644 --- a/agentic_rag/gradio_app.py +++ b/agentic_rag/gradio_app.py @@ -12,6 +12,14 @@ from web_processor import WebProcessor from repo_processor import RepoProcessor from store import VectorStore + +# Try to import OraDBVectorStore +try: + from OraDBVectorStore import OraDBVectorStore + ORACLE_DB_AVAILABLE = True +except ImportError: + ORACLE_DB_AVAILABLE = False + from local_rag_agent import LocalRAGAgent from rag_agent import RAGAgent @@ -32,7 +40,19 @@ def load_config(): pdf_processor = PDFProcessor() web_processor = WebProcessor() repo_processor = RepoProcessor() -vector_store = VectorStore() + +# Initialize vector store (prefer Oracle DB if available) +if ORACLE_DB_AVAILABLE: + try: + vector_store = OraDBVectorStore() + print("Using Oracle DB 23ai for vector storage") + except Exception as e: + print(f"Error initializing Oracle DB: {str(e)}") + print("Falling back to ChromaDB") + vector_store = VectorStore() +else: + vector_store = VectorStore() + print("Using ChromaDB for vector storage (Oracle DB not available)") # Initialize agents hf_token = load_config() @@ -261,6 +281,20 @@ def create_interface(): > **Note on Performance**: When using the Local (Mistral) model, initial loading can take 1-5 minutes, and each query may take 30-60 seconds to process depending on your hardware. OpenAI queries are typically much faster. """) + # Show Oracle DB status + if ORACLE_DB_AVAILABLE and hasattr(vector_store, 'connection'): + gr.Markdown(""" +
+ ✅ Oracle DB 23ai is active and being used for vector storage. +
+ """) + else: + gr.Markdown(""" +
+ ⚠️ ChromaDB is being used for vector storage. Oracle DB 23ai is not available. +
+ """) + # Create model choices list for reuse model_choices = [] # HF models first if token is available diff --git a/agentic_rag/local_rag_agent.py b/agentic_rag/local_rag_agent.py index 3e539ec..74d7b3a 100644 --- a/agentic_rag/local_rag_agent.py +++ b/agentic_rag/local_rag_agent.py @@ -10,6 +10,12 @@ import time import json from pathlib import Path +try: + from OraDBVectorStore import OraDBVectorStore + ORACLE_DB_AVAILABLE = True +except ImportError: + ORACLE_DB_AVAILABLE = False + print("Oracle DB support not available. Install with: pip install oracledb sentence-transformers") # Configure logging logging.basicConfig( @@ -108,20 +114,50 @@ def __call__(self, prompt, max_new_tokens=512, temperature=0.1, top_p=0.95, **kw raise Exception(f"Failed to generate text with Ollama: {str(e)}") class LocalRAGAgent: - def __init__(self, vector_store: VectorStore, model_name: str = "mistralai/Mistral-7B-Instruct-v0.2", + def __init__(self, vector_store: VectorStore = None, model_name: str = "mistralai/Mistral-7B-Instruct-v0.2", use_cot: bool = False, collection: str = None, skip_analysis: bool = False, - quantization: str = None): + quantization: str = None, use_oracle_db: bool = True): """Initialize local RAG agent with vector store and local LLM Args: - vector_store: Vector store for retrieving context + vector_store: Vector store for retrieving context (if None, will create one) model_name: HuggingFace model name/path or Ollama model name use_cot: Whether to use Chain of Thought reasoning collection: Collection to search in (PDF, Repository, or General Knowledge) skip_analysis: Whether to skip query analysis (kept for backward compatibility) quantization: Quantization method to use (None, '4bit', '8bit') + use_oracle_db: Whether to use Oracle DB for vector storage (if False, uses ChromaDB) """ - self.vector_store = vector_store + # Initialize vector store if not provided + self.use_oracle_db = use_oracle_db and ORACLE_DB_AVAILABLE + + if vector_store is None: + if self.use_oracle_db: + try: + self.vector_store = OraDBVectorStore() + print("Using Oracle DB for vector storage") + except ValueError as ve: + if "credentials not found" in str(ve): + print(f"Oracle DB credentials not found in config.yaml: {str(ve)}") + print("Falling back to ChromaDB") + else: + print(f"Oracle DB initialization error: {str(ve)}") + print("Falling back to ChromaDB") + self.vector_store = VectorStore(persist_directory="embeddings") + self.use_oracle_db = False + except Exception as e: + print(f"Error initializing Oracle DB: {str(e)}") + print("Falling back to ChromaDB") + self.vector_store = VectorStore(persist_directory="embeddings") + self.use_oracle_db = False + else: + self.vector_store = VectorStore(persist_directory="embeddings") + print("Using ChromaDB for vector storage") + else: + self.vector_store = vector_store + # Determine type of vector store + self.use_oracle_db = hasattr(vector_store, 'connection') and hasattr(vector_store, 'cursor') + self.use_cot = use_cot self.collection = collection self.quantization = quantization @@ -231,7 +267,7 @@ def __init__(self, vector_store: VectorStore, model_name: str = "mistralai/Mistr self.llm = LocalLLM(self.pipeline) # Initialize specialized agents if CoT is enabled - self.agents = create_agents(self.llm, vector_store) if use_cot else None + self.agents = create_agents(self.llm, self.vector_store) if use_cot else None def process_query(self, query: str) -> Dict[str, Any]: """Process a user query using the agentic RAG pipeline""" @@ -416,16 +452,24 @@ def _generate_response(self, query: str, context: List[Dict[str, Any]]) -> Dict[ if context: # Group sources by document for item in context: - source = item['metadata'].get('source', 'Unknown') + # Handle metadata which could be a string (from Oracle DB) or a dict (from ChromaDB) + metadata = item['metadata'] + if isinstance(metadata, str): + try: + metadata = json.loads(metadata) + except json.JSONDecodeError: + metadata = {"source": "Unknown"} + + source = metadata.get('source', 'Unknown') if source not in sources: sources[source] = set() # Add page number if available - if 'page' in item['metadata']: - sources[source].add(str(item['metadata']['page'])) + if 'page' in metadata: + sources[source].add(str(metadata['page'])) # Add file path if available for code - if 'file_path' in item['metadata']: - sources[source] = item['metadata']['file_path'] + if 'file_path' in metadata: + sources[source] = metadata['file_path'] # Print concise source information print("\nSources detected:") diff --git a/agentic_rag/requirements.txt b/agentic_rag/requirements.txt index cdb3273..6b6af43 100644 --- a/agentic_rag/requirements.txt +++ b/agentic_rag/requirements.txt @@ -17,4 +17,6 @@ lxml_html_clean langchain gitingest bitsandbytes -ollama \ No newline at end of file +ollama +oracledb +sentence-transformers \ No newline at end of file diff --git a/agentic_rag/test_db_systems.sh b/agentic_rag/test_db_systems.sh new file mode 100644 index 0000000..9714fba --- /dev/null +++ b/agentic_rag/test_db_systems.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# Script to test both Oracle DB and ChromaDB for the Agentic RAG system + +echo "===== Agentic RAG Database Systems Test =====" +echo + +# Check for required packages +echo "Checking for required Python packages..." +pip list | grep -E "oracledb|sentence-transformers|chromadb" || echo "Some required packages may be missing" + +echo +echo "===== Testing Oracle DB =====" +echo "Running Oracle DB connection test..." +python test_oradb.py + +echo +echo "===== Testing ChromaDB =====" +echo "Creating a test query using ChromaDB as fallback..." +python -c ' +from local_rag_agent import LocalRAGAgent +agent = LocalRAGAgent(use_oracle_db=False) +print("ChromaDB initialized successfully") +print("Querying with test prompt...") +result = agent.process_query("What is machine learning?") +print(f"Response generated successfully. Length: {len(result['answer'])}") +' + +echo +echo "===== Testing Done =====" +echo "If both tests passed, your system is correctly configured for dual database support." +echo "Oracle DB will be used by default, with ChromaDB as fallback." \ No newline at end of file diff --git a/agentic_rag/test_oradb.py b/agentic_rag/test_oradb.py new file mode 100644 index 0000000..4e25c94 --- /dev/null +++ b/agentic_rag/test_oradb.py @@ -0,0 +1,155 @@ +import argparse +import json +from OraDBVectorStore import OraDBVectorStore +import time +import sys +import yaml +from pathlib import Path + +def check_credentials(): + """Check if Oracle DB credentials are configured in config.yaml""" + try: + config_path = Path("config.yaml") + if not config_path.exists(): + print("✗ config.yaml not found.") + return False + + with open(config_path, 'r') as f: + config = yaml.safe_load(f) + + if not config: + print("✗ config.yaml is empty or invalid YAML.") + return False + + # Check for Oracle DB credentials + if not config.get("ORACLE_DB_USERNAME"): + print("✗ ORACLE_DB_USERNAME not found in config.yaml") + return False + + if not config.get("ORACLE_DB_PASSWORD"): + print("✗ ORACLE_DB_PASSWORD not found in config.yaml") + return False + + if not config.get("ORACLE_DB_DSN"): + print("✗ ORACLE_DB_DSN not found in config.yaml") + return False + + print("✓ Oracle DB credentials found in config.yaml") + return True + except Exception as e: + print(f"✗ Error checking credentials: {str(e)}") + return False + +def test_connection(): + """Test connection to Oracle DB""" + print("Testing Oracle DB connection...") + try: + store = OraDBVectorStore() + print("✓ Connection successful!") + return store + except Exception as e: + print(f"✗ Connection failed: {str(e)}") + return None + +def test_add_and_query(store, query_text="machine learning"): + """Test adding simple data and querying it""" + if not store: + print("Skipping add and query test as connection failed") + return + + print("\nTesting add and query functionality...") + + # Create simple test document + test_chunks = [ + { + "text": "Machine learning is a field of study in artificial intelligence concerned with the development of algorithms that can learn from data.", + "metadata": { + "source": "Test Document", + "page": 1 + } + }, + { + "text": "Deep learning is a subset of machine learning that uses neural networks with many layers.", + "metadata": { + "source": "Test Document", + "page": 2 + } + } + ] + + try: + # Test adding PDF chunks + print("Adding test chunks to PDF collection...") + store.add_pdf_chunks(test_chunks, document_id="test_document") + print("✓ Successfully added test chunks") + + # Test querying + print(f"\nQuerying with: '{query_text}'") + start_time = time.time() + results = store.query_pdf_collection(query_text) + query_time = time.time() - start_time + + print(f"✓ Query completed in {query_time:.2f} seconds") + print(f"Found {len(results)} results") + + # Display results + if results: + print("\nResults:") + for i, result in enumerate(results): + print(f"\nResult {i+1}:") + print(f"Content: {result['content']}") + print(f"Source: {result['metadata'].get('source', 'Unknown')}") + print(f"Page: {result['metadata'].get('page', 'Unknown')}") + else: + print("No results found.") + + except Exception as e: + print(f"✗ Test failed: {str(e)}") + +def main(): + parser = argparse.ArgumentParser(description="Test Oracle DB Vector Store") + parser.add_argument("--query", default="machine learning", help="Query to use for testing") + + args = parser.parse_args() + + print("=== Oracle DB Vector Store Test ===\n") + + # Check if oracledb is installed + try: + import oracledb + print("✓ oracledb package is installed") + except ImportError: + print("✗ oracledb package is not installed.") + print("Please install it with: pip install oracledb") + sys.exit(1) + + # Check if sentence_transformers is installed + try: + import sentence_transformers + print("✓ sentence_transformers package is installed") + except ImportError: + print("✗ sentence_transformers package is not installed.") + print("Please install it with: pip install sentence-transformers") + sys.exit(1) + + # Check if credentials are configured + if not check_credentials(): + print("\n✗ Oracle DB credentials not properly configured in config.yaml") + print("Please update config.yaml with the following:") + print(""" +ORACLE_DB_USERNAME: ADMIN +ORACLE_DB_PASSWORD: your_password_here +ORACLE_DB_DSN: your_connection_string_here + """) + sys.exit(1) + + # Test connection + store = test_connection() + + # Test add and query functionality + test_add_and_query(store, args.query) + + print("\n=== Test Completed ===") + +if __name__ == "__main__": + main() \ No newline at end of file From af37695e013c04016207a42d55247df26a0e9452 Mon Sep 17 00:00:00 2001 From: jasperan <23caj23@gmail.com> Date: Thu, 10 Apr 2025 20:05:50 +0200 Subject: [PATCH 2/6] Add command-line option for selecting embeddings backend (Oracle DB or ChromaDB). Oracle DB is now the default, with ChromaDB as fallback. Update documentation. --- agentic_rag/README.md | 46 +++++++++++++++++++++++++++++++++- agentic_rag/local_rag_agent.py | 30 +++++++++++++++++++--- 2 files changed, 72 insertions(+), 4 deletions(-) diff --git a/agentic_rag/README.md b/agentic_rag/README.md index c046acd..39aa804 100644 --- a/agentic_rag/README.md +++ b/agentic_rag/README.md @@ -358,7 +358,7 @@ The system consists of several key components: 1. **PDF Processor**: we use `docling` to extract and chunk text from PDF documents 2. **Web Processor**: we use `trafilatura` to extract and chunk text from websites 3. **GitHub Repository Processor**: we use `gitingest` to extract and chunk text from repositories -4. **Vector Store**: Manages document embeddings and similarity search using `ChromaDB` and `Oracle Database 23ai` +4. **Vector Store**: Manages document embeddings and similarity search using `Oracle Database 23ai` (default) or `ChromaDB` (fallback) 5. **RAG Agent**: Makes intelligent decisions about query routing and response generation - OpenAI Agent: Uses `gpt-4-turbo-preview` for high-quality responses, but requires an OpenAI API key - Local Agent: Uses `Mistral-7B` as an open-source alternative @@ -373,6 +373,50 @@ The RAG Agent flow is the following: 4. If no PDF context is found OR if it's a general knowledge query, use the pre-trained LLM directly 5. Fall back to a "no information" response only in edge cases. +## Annex: Command Line Usage + +You can run the system from the command line using: + +```bash +python local_rag_agent.py --query "Your question here" [options] +``` + +### Command Line Arguments + +| Argument | Description | Default | +| --- | --- | --- | +| `--query` | The query to process | *Required* | +| `--embeddings` | Select embeddings backend (`oracle` or `chromadb`) | `oracle` | +| `--model` | Model to use for inference | `mistralai/Mistral-7B-Instruct-v0.2` | +| `--collection` | Collection to query (PDF, Repository, Web, General) | Auto-determined | +| `--use-cot` | Enable Chain of Thought reasoning | `False` | +| `--store-path` | Path to ChromaDB store (if using ChromaDB) | `embeddings` | +| `--skip-analysis` | Skip query analysis step | `False` | +| `--verbose` | Show full content of sources | `False` | +| `--quiet` | Disable verbose logging | `False` | + +### Examples + +Query using Oracle DB (default): +```bash +python local_rag_agent.py --query "How does vector search work?" +``` + +Force using ChromaDB: +```bash +python local_rag_agent.py --query "How does vector search work?" --embeddings chromadb +``` + +Query with Chain of Thought reasoning: +```bash +python local_rag_agent.py --query "Explain the difference between RAG and fine-tuning" --use-cot +``` + +Query a specific collection: +```bash +python local_rag_agent.py --query "How to implement a queue?" --collection "Repository Collection" +``` + ## Contributing This project is open source. Please submit your contributions by forking this repository and submitting a pull request! Oracle appreciates any contributions that are made by the open source community. diff --git a/agentic_rag/local_rag_agent.py b/agentic_rag/local_rag_agent.py index 74d7b3a..59217d3 100644 --- a/agentic_rag/local_rag_agent.py +++ b/agentic_rag/local_rag_agent.py @@ -510,6 +510,8 @@ def main(): help="Specify which collection to query") parser.add_argument("--skip-analysis", action="store_true", help="Skip query analysis step") parser.add_argument("--verbose", action="store_true", help="Show full content of sources") + parser.add_argument("--embeddings", choices=["oracle", "chromadb"], default="oracle", + help="Select embeddings backend (default: oracle)") args = parser.parse_args() @@ -523,15 +525,37 @@ def main(): print("=" * 50) try: - logger.info(f"Initializing vector store from: {args.store_path}") - store = VectorStore(persist_directory=args.store_path) + # Determine which vector store to use based on args.embeddings + if args.embeddings == "oracle" and ORACLE_DB_AVAILABLE: + try: + logger.info("Initializing Oracle DB vector store") + store = OraDBVectorStore() + print("✓ Using Oracle DB for vector storage") + except Exception as e: + logger.warning(f"Failed to initialize Oracle DB: {str(e)}") + logger.info(f"Falling back to ChromaDB from: {args.store_path}") + store = VectorStore(persist_directory=args.store_path) + print("⚠ Oracle DB initialization failed, using ChromaDB instead") + else: + if args.embeddings == "oracle" and not ORACLE_DB_AVAILABLE: + logger.warning("Oracle DB support not available") + print("⚠ Oracle DB support not available (missing dependencies)") + + logger.info(f"Initializing ChromaDB vector store from: {args.store_path}") + store = VectorStore(persist_directory=args.store_path) + print("✓ Using ChromaDB for vector storage") + logger.info("Initializing local RAG agent...") + # Set use_oracle_db based on the actual store type + use_oracle_db = args.embeddings == "oracle" and isinstance(store, OraDBVectorStore) + agent = LocalRAGAgent( store, model_name=args.model, use_cot=args.use_cot, collection=args.collection, - skip_analysis=args.skip_analysis + skip_analysis=args.skip_analysis, + use_oracle_db=use_oracle_db ) print(f"\nProcessing query: {args.query}") From aab4bf4d824ced6664abf23ea65c68e8731136c7 Mon Sep 17 00:00:00 2001 From: jasperan <23caj23@gmail.com> Date: Thu, 10 Apr 2025 21:27:54 +0200 Subject: [PATCH 3/6] Set ollama:qwen2 as default model throughout the application --- agentic_rag/gradio_app.py | 84 ++++++++++++++++++++++------------ agentic_rag/local_rag_agent.py | 23 +++++----- 2 files changed, 67 insertions(+), 40 deletions(-) diff --git a/agentic_rag/gradio_app.py b/agentic_rag/gradio_app.py index 8205743..50dd22e 100644 --- a/agentic_rag/gradio_app.py +++ b/agentic_rag/gradio_app.py @@ -59,7 +59,15 @@ def load_config(): openai_key = os.getenv("OPENAI_API_KEY") # Initialize agents with use_cot=True to ensure CoT is available -local_agent = LocalRAGAgent(vector_store, use_cot=True) if hf_token else None +# Default to Ollama qwen2, fall back to Mistral if available +try: + local_agent = LocalRAGAgent(vector_store, model_name="ollama:qwen2", use_cot=True) + print("Using Ollama qwen2 as default model") +except Exception as e: + print(f"Could not initialize Ollama qwen2: {str(e)}") + local_agent = LocalRAGAgent(vector_store, use_cot=True) if hf_token else None + print("Falling back to Local Mistral model" if hf_token else "No local model available") + openai_agent = RAGAgent(vector_store, openai_api_key=openai_key, use_cot=True) if openai_key else None def process_pdf(file: tempfile._TemporaryFileWrapper) -> str: @@ -313,6 +321,9 @@ def create_interface(): if openai_key: model_choices.append("OpenAI") + # Set default model to Ollama - qwen2 + default_model = "Ollama - qwen2" + # Model Management Tab (First Tab) with gr.Tab("Model Management"): gr.Markdown(""" @@ -320,13 +331,13 @@ def create_interface(): Download models in advance to prepare them for use in the chat interface. - ### Hugging Face Models (Default) + ### Hugging Face Models - The system uses Mistral-7B by default. For Hugging Face models (Mistral), you'll need a Hugging Face token in your config.yaml file. + For Hugging Face models (Mistral), you'll need a Hugging Face token in your config.yaml file. - ### Ollama Models (Alternative) + ### Ollama Models (Default) - Ollama models are available as alternatives. For Ollama models, this will pull the model using the Ollama client. + Ollama models are used by default. For Ollama models, this will pull the model using the Ollama client. Make sure Ollama is installed and running on your system. You can download Ollama from [ollama.com/download](https://ollama.com/download) """) @@ -335,7 +346,7 @@ def create_interface(): with gr.Column(): model_dropdown = gr.Dropdown( choices=model_choices, - value=model_choices[0] if model_choices else None, + value=default_model if default_model in model_choices else model_choices[0] if model_choices else None, label="Select Model to Download", interactive=True ) @@ -350,6 +361,21 @@ def create_interface(): gr.Markdown(""" ### Model Information + **Ollama - qwen2** (DEFAULT): Alibaba's Qwen2 model via Ollama. + - Size: ~4GB + - Requires Ollama to be installed and running + - High-quality model with good performance + + **Ollama - llama3**: Meta's Llama 3 model via Ollama. + - Size: ~4GB + - Requires Ollama to be installed and running + - Excellent performance and quality + + **Ollama - phi-3**: Microsoft's Phi-3 model via Ollama. + - Size: ~4GB + - Requires Ollama to be installed and running + - Efficient small model with good performance + **Local (Mistral)**: The default Mistral-7B-Instruct-v0.2 model. - Size: ~14GB - VRAM Required: ~8GB @@ -364,21 +390,6 @@ def create_interface(): - Size: ~7GB - VRAM Required: ~6GB - Balance between quality and memory usage - - **Ollama - llama3**: Meta's Llama 3 model via Ollama. - - Size: ~4GB - - Requires Ollama to be installed and running - - Excellent performance and quality - - **Ollama - phi-3**: Microsoft's Phi-3 model via Ollama. - - Size: ~4GB - - Requires Ollama to be installed and running - - Efficient small model with good performance - - **Ollama - qwen2**: Alibaba's Qwen2 model via Ollama. - - Size: ~4GB - - Requires Ollama to be installed and running - - High-quality model with good performance """) # Document Processing Tab @@ -412,7 +423,7 @@ def create_interface(): with gr.Column(scale=1): standard_agent_dropdown = gr.Dropdown( choices=model_choices, - value=model_choices[0] if model_choices else None, + value=default_model if default_model in model_choices else model_choices[0] if model_choices else None, label="Select Agent" ) with gr.Column(scale=1): @@ -441,7 +452,7 @@ def create_interface(): with gr.Column(scale=1): cot_agent_dropdown = gr.Dropdown( choices=model_choices, - value=model_choices[0] if model_choices else None, + value=default_model if default_model in model_choices else model_choices[0] if model_choices else None, label="Select Agent" ) with gr.Column(scale=1): @@ -536,7 +547,7 @@ def create_interface(): 2. **Standard Chat Interface**: - Quick responses without detailed reasoning steps - - Select your preferred agent (Local Mistral or OpenAI) + - Select your preferred agent (Ollama qwen2 by default) - Select which knowledge collection to query: - **PDF Collection**: Always searches PDF documents - **Repository Collection**: Always searches code repositories @@ -551,19 +562,36 @@ def create_interface(): - Same collection selection options as the Standard Chat Interface 4. **Performance Expectations**: + - **Ollama models**: Typically faster inference, default is qwen2 - **Local (Mistral) model**: Initial loading takes 1-5 minutes, each query takes 30-60 seconds - - **OpenAI model**: Much faster responses, typically a few seconds per query - - Chain of Thought reasoning takes longer for both models + - **OpenAI model**: Fast responses, typically a few seconds per query + - Chain of Thought reasoning takes longer for all models - Note: OpenAI agent requires an API key in `.env` file + Note: The interface will automatically detect available models based on your configuration: + - Ollama models are the default option (requires Ollama to be installed and running) + - Local Mistral model requires HuggingFace token in `config.yaml` (fallback option) + - OpenAI model requires API key in `.env` file """) return interface def main(): # Check configuration + try: + import ollama + try: + # Check if Ollama is running and qwen2 is available + models = ollama.list().models + available_models = [model.model for model in models] + if "qwen2" not in available_models and "qwen2:latest" not in available_models: + print("⚠️ Warning: Ollama is running but qwen2 model is not available. Please run 'ollama pull qwen2' or download through the interface.") + except Exception: + print("⚠️ Warning: Ollama is installed but not running or encountered an error. The default model may not work.") + except ImportError: + print("⚠️ Warning: Ollama package not installed. Please install with: pip install ollama") + if not hf_token and not openai_key: - print("⚠️ Warning: Neither HuggingFace token nor OpenAI key found. Please configure at least one.") + print("⚠️ Warning: Neither HuggingFace token nor OpenAI key found. Using Ollama only.") # Launch interface interface = create_interface() diff --git a/agentic_rag/local_rag_agent.py b/agentic_rag/local_rag_agent.py index 59217d3..3ce242f 100644 --- a/agentic_rag/local_rag_agent.py +++ b/agentic_rag/local_rag_agent.py @@ -500,19 +500,18 @@ def _generate_general_response(self, query: str) -> Dict[str, Any]: } def main(): - parser = argparse.ArgumentParser(description="Query documents using local Mistral model") - parser.add_argument("--query", required=True, help="Query to process") - parser.add_argument("--store-path", default="embeddings", help="Path to the vector store") - parser.add_argument("--model", default="mistralai/Mistral-7B-Instruct-v0.2", help="Model to use") - parser.add_argument("--quiet", action="store_true", help="Disable verbose logging") - parser.add_argument("--use-cot", action="store_true", help="Enable Chain of Thought reasoning") - parser.add_argument("--collection", choices=["PDF Collection", "Repository Collection", "General Knowledge", "Web Knowledge Base"], - help="Specify which collection to query") - parser.add_argument("--skip-analysis", action="store_true", help="Skip query analysis step") + parser = argparse.ArgumentParser(description="Query documents using local LLM") + parser.add_argument("--query", required=True, help="Query to search for") + parser.add_argument("--embeddings", default="oracle", choices=["oracle", "chromadb"], help="Embeddings backend to use") + parser.add_argument("--model", default="ollama:qwen2", help="Model to use (default: ollama:qwen2)") + parser.add_argument("--collection", help="Collection to search (PDF, Repository, General Knowledge)") + parser.add_argument("--use-cot", action="store_true", help="Use Chain of Thought reasoning") + parser.add_argument("--store-path", default="embeddings", help="Path to ChromaDB store") + parser.add_argument("--skip-analysis", action="store_true", help="Skip query analysis (not recommended)") parser.add_argument("--verbose", action="store_true", help="Show full content of sources") - parser.add_argument("--embeddings", choices=["oracle", "chromadb"], default="oracle", - help="Select embeddings backend (default: oracle)") - + parser.add_argument("--quiet", action="store_true", help="Disable verbose logging") + parser.add_argument("--quantization", choices=["4bit", "8bit"], help="Quantization method (4bit or 8bit)") + args = parser.parse_args() # Set logging level based on quiet flag From bc2fd47b29c252f6bc730e8736bc79bfd74d179c Mon Sep 17 00:00:00 2001 From: jasperan <23caj23@gmail.com> Date: Thu, 10 Apr 2025 21:34:35 +0200 Subject: [PATCH 4/6] feat: added comprehensive tests, default model qwen --- agentic_rag/OraDBVectorStore.py | 66 +++++++++++++++++++++++++++++++++ agentic_rag/README.md | 37 ++++++++++++++++++ agentic_rag/test_oradb.py | 63 ++++++++++++++++++++++++++++++- 3 files changed, 164 insertions(+), 2 deletions(-) diff --git a/agentic_rag/OraDBVectorStore.py b/agentic_rag/OraDBVectorStore.py index 56d3937..6863837 100644 --- a/agentic_rag/OraDBVectorStore.py +++ b/agentic_rag/OraDBVectorStore.py @@ -338,6 +338,72 @@ def query_repo_collection(self, query: str, n_results: int = 3) -> List[Dict[str formatted_results.append(result) return formatted_results + + def get_collection_count(self, collection_name: str) -> int: + """Get the total number of chunks in a collection + + Args: + collection_name: Name of the collection (pdf_documents, web_documents, repository_documents, general_knowledge) + + Returns: + Number of chunks in the collection + """ + # Map collection names to table names + collection_map = { + "pdf_documents": "PDFCollection", + "web_documents": "WebCollection", + "repository_documents": "RepoCollection", + "general_knowledge": "GeneralCollection" + } + + table_name = collection_map.get(collection_name) + if not table_name: + raise ValueError(f"Unknown collection name: {collection_name}") + + # Count the rows in the table + sql = f"SELECT COUNT(*) FROM {table_name}" + self.cursor.execute(sql) + count = self.cursor.fetchone()[0] + + return count + + def get_latest_chunk(self, collection_name: str) -> Dict[str, Any]: + """Get the most recently inserted chunk from a collection + + Args: + collection_name: Name of the collection (pdf_documents, web_documents, repository_documents, general_knowledge) + + Returns: + Dictionary containing the content and metadata of the latest chunk + """ + # Map collection names to table names + collection_map = { + "pdf_documents": "PDFCollection", + "web_documents": "WebCollection", + "repository_documents": "RepoCollection", + "general_knowledge": "GeneralCollection" + } + + table_name = collection_map.get(collection_name) + if not table_name: + raise ValueError(f"Unknown collection name: {collection_name}") + + # Get the most recently inserted row (using ID as a proxy for insertion time) + # This assumes IDs are assigned sequentially or have a timestamp component + sql = f"SELECT Id, Text, MetaData FROM {table_name} ORDER BY ROWID DESC FETCH FIRST 1 ROW ONLY" + self.cursor.execute(sql) + row = self.cursor.fetchone() + + if not row: + raise ValueError(f"No chunks found in collection: {collection_name}") + + result = { + "id": row[0], + "content": row[1], + "metadata": json.loads(row[2]) if isinstance(row[2], str) else row[2] + } + + return result def main(): parser = argparse.ArgumentParser(description="Manage Oracle DB vector store") diff --git a/agentic_rag/README.md b/agentic_rag/README.md index 39aa804..0cc0416 100644 --- a/agentic_rag/README.md +++ b/agentic_rag/README.md @@ -206,6 +206,43 @@ python store.py --query "your search query" python local_rag_agent.py --query "your search query" ``` +#### Test Oracle DB Vector Store + +The system includes a test script to verify Oracle DB connectivity and examine the contents of your collections. This is useful for: +- Checking if Oracle DB is properly configured +- Viewing statistics about your collections +- Inspecting the content stored in each collection +- Testing basic vector search functionality + +To run the test: + +```bash +# Basic test - checks connection and runs a test query +python test_oradb.py + +# Show only collection statistics without inserting test data +python test_oradb.py --stats-only + +# Specify a custom query for testing +python test_oradb.py --query "artificial intelligence" +``` + +The script will: +1. Verify Oracle DB credentials in your `config.yaml` file +2. Test connection to the Oracle DB +3. Display the total number of chunks in each collection (PDF, Web, Repository, General Knowledge) +4. Show content and metadata from the most recently inserted chunk in each collection +5. Unless running with `--stats-only`, insert test data and run a sample vector search + +Requirements: +- Oracle DB credentials properly configured in `config.yaml`: + ```yaml + ORACLE_DB_USERNAME: ADMIN + ORACLE_DB_PASSWORD: your_password_here + ORACLE_DB_DSN: your_connection_string_here + ``` +- The `oracledb` Python package installed + #### Use RAG Agent To query documents using either OpenAI or a local model, run: diff --git a/agentic_rag/test_oradb.py b/agentic_rag/test_oradb.py index 4e25c94..d4f8c81 100644 --- a/agentic_rag/test_oradb.py +++ b/agentic_rag/test_oradb.py @@ -51,6 +51,59 @@ def test_connection(): print(f"✗ Connection failed: {str(e)}") return None +def check_collection_stats(store): + """Check statistics for each collection including total chunks and latest insertion""" + if not store: + print("Skipping collection stats check as connection failed") + return + + print("\n=== Collection Statistics ===") + + collections = { + "PDF Collection": "pdf_documents", + "Repository Collection": "repository_documents", + "Web Knowledge Base": "web_documents", + "General Knowledge": "general_knowledge" + } + + for name, collection in collections.items(): + try: + # Get total count + count = store.get_collection_count(collection) + print(f"\n{name}:") + print(f"Total chunks: {count}") + + # Get latest insertion if collection is not empty + if count > 0: + latest = store.get_latest_chunk(collection) + print("Latest chunk:") + print(f" Content: {latest['content'][:150]}..." if len(latest['content']) > 150 else f" Content: {latest['content']}") + + # Print metadata + if isinstance(latest['metadata'], str): + try: + metadata = json.loads(latest['metadata']) + except: + metadata = {"source": latest['metadata']} + else: + metadata = latest['metadata'] + + source = metadata.get('source', 'Unknown') + print(f" Source: {source}") + + # Print other metadata based on collection type + if collection == "pdf_documents" and 'page' in metadata: + print(f" Page: {metadata['page']}") + elif collection == "repository_documents" and 'file_path' in metadata: + print(f" File: {metadata['file_path']}") + elif collection == "web_documents" and 'title' in metadata: + print(f" Title: {metadata['title']}") + else: + print("No chunks found in this collection.") + + except Exception as e: + print(f"Error checking {name}: {str(e)}") + def test_add_and_query(store, query_text="machine learning"): """Test adding simple data and querying it""" if not store: @@ -109,6 +162,7 @@ def test_add_and_query(store, query_text="machine learning"): def main(): parser = argparse.ArgumentParser(description="Test Oracle DB Vector Store") parser.add_argument("--query", default="machine learning", help="Query to use for testing") + parser.add_argument("--stats-only", action="store_true", help="Only show collection statistics without inserting test data") args = parser.parse_args() @@ -146,8 +200,13 @@ def main(): # Test connection store = test_connection() - # Test add and query functionality - test_add_and_query(store, args.query) + # Check collection statistics + check_collection_stats(store) + + # If stats-only flag is not set, also test add and query functionality + if not args.stats_only: + # Test add and query functionality + test_add_and_query(store, args.query) print("\n=== Test Completed ===") From cde12d8866cd7518b20c2b43713ddf82581a6f78 Mon Sep 17 00:00:00 2001 From: jasperan <23caj23@gmail.com> Date: Thu, 10 Apr 2025 22:22:42 +0200 Subject: [PATCH 5/6] Add logging to show which database (Oracle DB or ChromaDB) is being used for retrieval --- agentic_rag/OraDBVectorStore.py | 8 ++++++++ agentic_rag/local_rag_agent.py | 12 ++++++++++++ agentic_rag/store.py | 8 ++++++++ 3 files changed, 28 insertions(+) diff --git a/agentic_rag/OraDBVectorStore.py b/agentic_rag/OraDBVectorStore.py index 6863837..14b8c3a 100644 --- a/agentic_rag/OraDBVectorStore.py +++ b/agentic_rag/OraDBVectorStore.py @@ -225,6 +225,7 @@ def add_repo_chunks(self, chunks: List[Dict[str, Any]], document_id: str): def query_pdf_collection(self, query: str, n_results: int = 3) -> List[Dict[str, Any]]: """Query the PDF documents collection""" + print("🔍 [Oracle DB] Querying PDF Collection") # Generate Embeddings embeddings = self.encoder.encode(query, batch_size=32, show_progress_bar=True) new_vector = array.array("f", embeddings) @@ -250,10 +251,12 @@ def query_pdf_collection(self, query: str, n_results: int = 3) -> List[Dict[str, } formatted_results.append(result) + print(f"🔍 [Oracle DB] Retrieved {len(formatted_results)} chunks from PDF Collection") return formatted_results def query_web_collection(self, query: str, n_results: int = 3) -> List[Dict[str, Any]]: """Query the web documents collection""" + print("🔍 [Oracle DB] Querying Web Collection") # Generate Embeddings embeddings = self.encoder.encode(query, batch_size=32, show_progress_bar=True) new_vector = array.array("f", embeddings) @@ -279,10 +282,12 @@ def query_web_collection(self, query: str, n_results: int = 3) -> List[Dict[str, } formatted_results.append(result) + print(f"🔍 [Oracle DB] Retrieved {len(formatted_results)} chunks from Web Collection") return formatted_results def query_general_collection(self, query: str, n_results: int = 3) -> List[Dict[str, Any]]: """Query the general knowledge collection""" + print("🔍 [Oracle DB] Querying General Knowledge Collection") # Generate Embeddings embeddings = self.encoder.encode(query, batch_size=32, show_progress_bar=True) new_vector = array.array("f", embeddings) @@ -308,10 +313,12 @@ def query_general_collection(self, query: str, n_results: int = 3) -> List[Dict[ } formatted_results.append(result) + print(f"🔍 [Oracle DB] Retrieved {len(formatted_results)} chunks from General Knowledge Collection") return formatted_results def query_repo_collection(self, query: str, n_results: int = 3) -> List[Dict[str, Any]]: """Query the repository documents collection""" + print("🔍 [Oracle DB] Querying Repository Collection") # Generate Embeddings embeddings = self.encoder.encode(query, batch_size=32, show_progress_bar=True) new_vector = array.array("f", embeddings) @@ -337,6 +344,7 @@ def query_repo_collection(self, query: str, n_results: int = 3) -> List[Dict[str } formatted_results.append(result) + print(f"🔍 [Oracle DB] Retrieved {len(formatted_results)} chunks from Repository Collection") return formatted_results def get_collection_count(self, collection_name: str) -> int: diff --git a/agentic_rag/local_rag_agent.py b/agentic_rag/local_rag_agent.py index 3ce242f..c26f99e 100644 --- a/agentic_rag/local_rag_agent.py +++ b/agentic_rag/local_rag_agent.py @@ -292,10 +292,16 @@ def _process_query_with_cot(self, query: str) -> Dict[str, Any]: try: # Get context based on collection type if self.collection == "PDF Collection": + db_type = "Oracle DB" if self.use_oracle_db else "ChromaDB" + print(f"🔄 Using {db_type} for retrieving PDF Collection context") context = self.vector_store.query_pdf_collection(query) elif self.collection == "Repository Collection": + db_type = "Oracle DB" if self.use_oracle_db else "ChromaDB" + print(f"🔄 Using {db_type} for retrieving Repository Collection context") context = self.vector_store.query_repo_collection(query) elif self.collection == "Web Knowledge Base": + db_type = "Oracle DB" if self.use_oracle_db else "ChromaDB" + print(f"🔄 Using {db_type} for retrieving Web Knowledge Base context") context = self.vector_store.query_web_collection(query) else: context = [] @@ -389,10 +395,16 @@ def _process_query_standard(self, query: str) -> Dict[str, Any]: try: # Get context based on collection type if self.collection == "PDF Collection": + db_type = "Oracle DB" if self.use_oracle_db else "ChromaDB" + print(f"🔄 Using {db_type} for retrieving PDF Collection context") context = self.vector_store.query_pdf_collection(query) elif self.collection == "Repository Collection": + db_type = "Oracle DB" if self.use_oracle_db else "ChromaDB" + print(f"🔄 Using {db_type} for retrieving Repository Collection context") context = self.vector_store.query_repo_collection(query) elif self.collection == "Web Knowledge Base": + db_type = "Oracle DB" if self.use_oracle_db else "ChromaDB" + print(f"🔄 Using {db_type} for retrieving Web Knowledge Base context") context = self.vector_store.query_web_collection(query) else: context = [] diff --git a/agentic_rag/store.py b/agentic_rag/store.py index 1fb5768..94e7c0f 100644 --- a/agentic_rag/store.py +++ b/agentic_rag/store.py @@ -117,6 +117,7 @@ def add_repo_chunks(self, chunks: List[Dict[str, Any]], document_id: str): def query_pdf_collection(self, query: str, n_results: int = 3) -> List[Dict[str, Any]]: """Query the PDF documents collection""" + print("📊 [ChromaDB] Querying PDF Collection") results = self.pdf_collection.query( query_texts=[query], n_results=n_results @@ -131,10 +132,12 @@ def query_pdf_collection(self, query: str, n_results: int = 3) -> List[Dict[str, } formatted_results.append(result) + print(f"📊 [ChromaDB] Retrieved {len(formatted_results)} chunks from PDF Collection") return formatted_results def query_web_collection(self, query: str, n_results: int = 3) -> List[Dict[str, Any]]: """Query the web documents collection""" + print("📊 [ChromaDB] Querying Web Collection") results = self.web_collection.query( query_texts=[query], n_results=n_results @@ -149,10 +152,12 @@ def query_web_collection(self, query: str, n_results: int = 3) -> List[Dict[str, } formatted_results.append(result) + print(f"📊 [ChromaDB] Retrieved {len(formatted_results)} chunks from Web Collection") return formatted_results def query_general_collection(self, query: str, n_results: int = 3) -> List[Dict[str, Any]]: """Query the general knowledge collection""" + print("📊 [ChromaDB] Querying General Knowledge Collection") results = self.general_collection.query( query_texts=[query], n_results=n_results @@ -167,10 +172,12 @@ def query_general_collection(self, query: str, n_results: int = 3) -> List[Dict[ } formatted_results.append(result) + print(f"📊 [ChromaDB] Retrieved {len(formatted_results)} chunks from General Knowledge Collection") return formatted_results def query_repo_collection(self, query: str, n_results: int = 3) -> List[Dict[str, Any]]: """Query the repository documents collection""" + print("📊 [ChromaDB] Querying Repository Collection") results = self.repo_collection.query( query_texts=[query], n_results=n_results @@ -185,6 +192,7 @@ def query_repo_collection(self, query: str, n_results: int = 3) -> List[Dict[str } formatted_results.append(result) + print(f"📊 [ChromaDB] Retrieved {len(formatted_results)} chunks from Repository Collection") return formatted_results def main(): From 7fb6f126a89e7d7d907cda06ad1990154a975869 Mon Sep 17 00:00:00 2001 From: jasperan <23caj23@gmail.com> Date: Fri, 11 Apr 2025 00:26:48 +0200 Subject: [PATCH 6/6] feat: updated readme --- agentic_rag/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agentic_rag/README.md b/agentic_rag/README.md index 0cc0416..cea1624 100644 --- a/agentic_rag/README.md +++ b/agentic_rag/README.md @@ -10,7 +10,7 @@ The system has the following features: - Intelligent query routing - PDF processing using Docling for accurate text extraction and chunking -- Persistent vector storage with ChromaDB and Oracle Database 23ai (PDF and Websites) +- Persistent vector storage with Oracle Database 23ai (PDF and Websites) - Smart context retrieval and response generation - FastAPI-based REST API for document upload and querying - Support for both OpenAI-based agents or local, transformer-based agents (`Mistral-7B` by default)