
Commit dd1ac41

feat: enhance RAG agents and PDF processing

1 parent a64f264 commit dd1ac41

5 files changed: +89 additions, -42 deletions


agentic_rag/README.md

Lines changed: 2 additions & 0 deletions
@@ -176,6 +176,8 @@ First, we process a document and query it using the local model. Then, we add th
 # 1. Process the PDF
 python pdf_processor.py --input example.pdf --output chunks.json

+#python pdf_processor.py --input https://arxiv.org/pdf/2203.06605 --output chunks.json
+
 # 2. Add to vector store
 python store.py --add chunks.json
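A note on the new commented-out line: it documents that `--input` also accepts a URL. Below is a sketch of the programmatic equivalent, with the caveat that the processor's class name never appears in this diff, so `PDFProcessor` is an assumption; `process_pdf` is typed `str | Path`, and the README change implies a plain URL string works because the underlying docling converter can fetch remote documents.

```python
# Hypothetical usage; `PDFProcessor` is an assumed class name (only its
# methods appear in the pdf_processor.py diff below). process_pdf() accepts
# str | Path, and per the README change a URL string should work too.
from pdf_processor import PDFProcessor

processor = PDFProcessor()
chunks = processor.process_pdf("https://arxiv.org/pdf/2203.06605")
print(f"Extracted {len(chunks)} chunks")
```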

agentic_rag/local_rag_agent.py

Lines changed: 35 additions & 20 deletions
@@ -157,14 +157,10 @@ def process_query(self, query: str) -> Dict[str, Any]:
         logger.info(f"- Requires context: {analysis.requires_context}")
         logger.info(f"- Reasoning: {analysis.reasoning}")

-        # If query type is unsupported, return early
+        # If query type is unsupported, use general knowledge
         if analysis.query_type == "unsupported":
-            logger.warning("Query type is unsupported")
-            return {
-                "answer": "I apologize, but I don't have the information to answer this query.",
-                "reasoning": analysis.reasoning,
-                "context": []
-            }
+            logger.info("Query type is unsupported, using general knowledge...")
+            return self._generate_general_response(query)

         # First try to get context from PDF documents
         logger.info("Querying PDF collection...")

@@ -187,19 +183,10 @@ def process_query(self, query: str) -> Dict[str, Any]:
             return response

         # If no PDF context found or if it's a general knowledge query,
-        # use the LLM directly
-        if analysis.query_type == "general_knowledge" or not context:
-            logger.info("No relevant PDF context found or general knowledge query detected")
-            logger.info("Falling back to direct LLM response...")
-            return self._generate_direct_response(query)
-
-        # This case should rarely happen, but just in case
-        logger.warning("No relevant context found and query type is not general knowledge")
-        return {
-            "answer": "I couldn't find relevant information to answer your query.",
-            "reasoning": analysis.reasoning,
-            "context": []
-        }
+        # use general knowledge
+        logger.info("No relevant PDF context found or general knowledge query detected")
+        logger.info("Using general knowledge response...")
+        return self._generate_general_response(query)

@@ -240,6 +227,34 @@ def _generate_response(self, query: str, context: List[Dict[str, Any]]) -> Dict[
             "context": context
         }

+    def _generate_general_response(self, query: str) -> Dict[str, Any]:
+        """Generate a response using general knowledge when no context is available"""
+        logger.info("Generating general knowledge response...")
+
+        if self.use_cot:
+            prompt = f"""You are a helpful AI assistant. While I don't have specific information from my document collection about this query, I'll share what I generally know about it.
+
+Please answer the following query using chain of thought reasoning:
+
+Query: {query}
+
+Let's think about this step by step:"""
+        else:
+            prompt = f"""You are a helpful AI assistant. While I don't have specific information from my document collection about this query, I'll share what I generally know about it.
+
+Query: {query}
+
+Answer:"""
+
+        logger.info("Generating response using local model...")
+        response = self._generate_text(prompt, max_length=1024)
+        logger.info("Response generation complete")
+
+        return {
+            "answer": "I didn't find specific information in my documents, but here's what I know about it:\n\n" + response,
+            "context": []
+        }
+
 def main():
     parser = argparse.ArgumentParser(description="Query documents using local Mistral model")
     parser.add_argument("--query", required=True, help="Query to process")

agentic_rag/pdf_processor.py

Lines changed: 20 additions & 9 deletions
@@ -30,7 +30,7 @@ def __init__(self, tokenizer: str = "BAAI/bge-small-en-v1.5"):
         warnings.filterwarnings('ignore', category=UserWarning, module='transformers.modeling_utils')

         self.converter = DocumentConverter()
-        self.chunker = HybridChunker(tokenizer=tokenizer, max_chunk_size=256) # Reduced chunk size for token length
+        self.chunker = HybridChunker(tokenizer=tokenizer, max_chunk_size=200) # Further reduced chunk size

     def _extract_metadata(self, meta: Any) -> Dict[str, Any]:
         """Safely extract metadata from various object types"""

@@ -60,6 +60,15 @@ def _extract_metadata(self, meta: Any) -> Dict[str, Any]:
             "page_numbers": []
         }

+    def _try_chunk_with_size(self, document: Any, chunk_size: int) -> List[Any]:
+        """Try chunking with a specific size, return None if it fails"""
+        try:
+            self.chunker.max_chunk_size = chunk_size
+            return list(self.chunker.chunk(document))
+        except Exception as e:
+            print(f"Warning: Chunking failed with size {chunk_size}: {str(e)}")
+            return None
+
     def process_pdf(self, file_path: str | Path) -> List[Dict[str, Any]]:
         """Process a PDF file and return chunks of text with metadata"""
         try:

@@ -71,14 +80,16 @@ def process_pdf(self, file_path: str | Path) -> List[Dict[str, Any]]:
             if not conv_result or not conv_result.document:
                 raise ValueError(f"Failed to convert PDF: {file_path}")

-            # Chunk the document with error handling
-            try:
-                chunks = list(self.chunker.chunk(conv_result.document))
-            except Exception as chunk_error:
-                print(f"Warning: Error during chunking: {str(chunk_error)}")
-                # Fallback to smaller chunk size if needed
-                self.chunker.max_chunk_size = 128
-                chunks = list(self.chunker.chunk(conv_result.document))
+            # Try chunking with progressively smaller sizes
+            chunks = None
+            for chunk_size in [200, 150, 100, 75]:
+                chunks = self._try_chunk_with_size(conv_result.document, chunk_size)
+                if chunks:
+                    print(f"Successfully chunked with size {chunk_size}")
+                    break
+
+            if not chunks:
+                raise ValueError("Failed to chunk document with any chunk size")

             # Process chunks into a standardized format
             processed_chunks = []
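The descending ladder [200, 150, 100, 75] replaces the single hardcoded fallback to 128. One subtlety: `_try_chunk_with_size` returns None on error, but the loop tests truthiness, so an empty (yet successful) chunk list also falls through to the next size. The pattern in generic form, with names invented for illustration:

```python
from typing import Any, Callable, List, Optional

def chunk_with_fallback(
    try_chunk: Callable[[Any, int], Optional[List[Any]]],
    document: Any,
    sizes: tuple = (200, 150, 100, 75),
) -> List[Any]:
    """Generic form of the retry ladder added to process_pdf: attempt each
    size in descending order and keep the first truthy (non-empty) result."""
    for size in sizes:
        chunks = try_chunk(document, size)
        if chunks:  # None (error) and [] (no chunks) both trigger a retry
            return chunks
    raise ValueError("Failed to chunk document with any chunk size")
```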

agentic_rag/rag_agent.py

Lines changed: 30 additions & 12 deletions
@@ -58,29 +58,21 @@ def process_query(self, query: str) -> Dict[str, Any]:
         # Analyze the query
         analysis = self._analyze_query(query)

-        # If query type is unsupported, return early
+        # If query type is unsupported, use general knowledge
         if analysis.query_type == "unsupported":
-            return {
-                "answer": "I apologize, but I don't have the information to answer this query.",
-                "reasoning": analysis.reasoning,
-                "context": []
-            }
+            return self._generate_general_response(query)

         # Retrieve relevant context based on query type
         if analysis.query_type == "pdf_documents":
             context = self.vector_store.query_pdf_collection(query)
         else:
             context = self.vector_store.query_general_collection(query)

-        # Generate response using context
+        # Generate response using context if available, otherwise use general knowledge
         if context and analysis.requires_context:
             response = self._generate_response(query, context)
         else:
-            response = {
-                "answer": "I couldn't find relevant information to answer your query.",
-                "reasoning": analysis.reasoning,
-                "context": []
-            }
+            response = self._generate_general_response(query)

         return response

@@ -128,6 +120,32 @@ def _generate_response(self, query: str, context: List[Dict[str, Any]]) -> Dict[
             "context": context
         }

+    def _generate_general_response(self, query: str) -> Dict[str, Any]:
+        """Generate a response using general knowledge when no context is available"""
+        if self.use_cot:
+            template = """You are a helpful AI assistant. While I don't have specific information from my document collection about this query, I'll share what I generally know about it.
+
+Please answer the following query using chain of thought reasoning:
+
+Query: {query}
+
+Let's think about this step by step:"""
+        else:
+            template = """You are a helpful AI assistant. While I don't have specific information from my document collection about this query, I'll share what I generally know about it.
+
+Query: {query}
+
+Answer:"""
+
+        prompt = ChatPromptTemplate.from_template(template)
+        messages = prompt.format_messages(query=query)
+        response = self.llm.invoke(messages)
+
+        return {
+            "answer": "I didn't find specific information in my documents, but here's what I know about it:\n\n" + response.content,
+            "context": []
+        }
+
 def main():
     parser = argparse.ArgumentParser(description="Query documents using OpenAI GPT-4")
     parser.add_argument("--query", required=True, help="Query to process")

agentic_rag/requirements.txt

Lines changed: 2 additions & 1 deletion
@@ -13,4 +13,5 @@ accelerate
 pyyaml
 trafilatura
 gradio
-lxml_html_clean
+lxml_html_clean
+langchain
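The new `langchain` entry backs the `ChatPromptTemplate` import used by `_generate_general_response` in rag_agent.py above. The `lxml_html_clean` line is removed and re-added unchanged, which in a GitHub diff usually indicates a newline-at-end-of-file fix rather than a content change.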
