@@ -15,39 +15,68 @@ def is_github_url(url: str) -> bool:
     except:
         return False
 
+def extract_repo_name(repo_path: str) -> str:
+    """Extract repository name from path or URL"""
+    if is_github_url(repo_path):
+        # For GitHub URLs, extract owner/repo format
+        parts = repo_path.rstrip('/').split('/')
+        if len(parts) >= 5:
+            return f"{parts[3]}/{parts[4]}"  # owner/repo format
+
+    # For local paths, use the last directory name
+    return Path(repo_path).name
+
 class RepoProcessor:
-    def __init__(self):
-        """Initialize repository processor"""
-        pass
+    def __init__(self, chunk_size: int = 500):
+        """Initialize repository processor with chunk size"""
+        self.chunk_size = chunk_size
 
-    def _extract_metadata(self, summary: Dict[str, Any], tree: Dict[str, Any]) -> Dict[str, Any]:
+    def _extract_metadata(self, summary: Dict[str, Any], tree: Dict[str, Any], repo_path: str) -> Dict[str, Any]:
         """Extract metadata from repository summary and tree"""
+        # Extract repo name from path or URL
+        repo_name = extract_repo_name(repo_path)
+
         # Handle case where summary might be a string
         if isinstance(summary, str):
             return {
-                "repo_name": "Unknown",
-                "description": "",
-                "language": "",
-                "topics": [],
-                "stars": 0,
-                "forks": 0,
-                "last_updated": "",
+                "repo_name": repo_name,
                 "file_count": len(tree) if tree else 0
             }
 
         return {
-            "repo_name": summary.get("name", ""),
-            "description": summary.get("description", ""),
-            "language": summary.get("language", ""),
-            "topics": summary.get("topics", []),
-            "stars": summary.get("stars", 0),
-            "forks": summary.get("forks", 0),
-            "last_updated": summary.get("updated_at", ""),
+            "repo_name": repo_name,  # Use extracted name instead of summary
             "file_count": len(tree) if tree else 0
         }
 
+    def _chunk_text(self, text: str) -> List[str]:
+        """Split text into chunks of roughly equal size"""
+        # Split into sentences (roughly)
+        sentences = [s.strip() for s in text.split('.') if s.strip()]
+
+        chunks = []
+        current_chunk = []
+        current_length = 0
+
+        for sentence in sentences:
+            # Add period back
+            sentence = sentence + '.'
+            # If adding this sentence would exceed chunk size, save current chunk
+            if current_length + len(sentence) > self.chunk_size and current_chunk:
+                chunks.append(' '.join(current_chunk))
+                current_chunk = []
+                current_length = 0
+
+            current_chunk.append(sentence)
+            current_length += len(sentence)
+
+        # Add any remaining text
+        if current_chunk:
+            chunks.append(' '.join(current_chunk))
+
+        return chunks
+
     def process_repo(self, repo_path: str | Path) -> Tuple[List[Dict[str, Any]], str]:
-        """Process a repository and return chunks of content with metadata"""
+        """Process a repository and return chunks of text with metadata"""
         try:
             # Generate a unique document ID
             document_id = str(uuid.uuid4())
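
A quick sanity check of the helpers added above (a minimal sketch, not part of the commit; it assumes the file is importable as repo_processor, a module name this diff doesn't show):

from repo_processor import RepoProcessor, extract_repo_name

# GitHub URLs reduce to "owner/repo"; local paths reduce to the last directory name
print(extract_repo_name("https://github.com/owner/repo"))   # owner/repo
print(extract_repo_name("/home/user/projects/my-project"))  # my-project

# _chunk_text keeps each chunk near the character budget; a single sentence
# longer than chunk_size still becomes its own oversized chunk
processor = RepoProcessor(chunk_size=80)
chunks = processor._chunk_text("First sentence. Second one. Third one here. " * 4)
print([len(c) for c in chunks])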
@@ -61,66 +90,48 @@ def process_repo(self, repo_path: str | Path) -> Tuple[List[Dict[str, Any]], str]:
             # Ingest repository
             summary, tree, content = ingest(str(repo_path))
 
-            # Calculate token count based on content type
-            def estimate_tokens(content: Any) -> int:
-                if isinstance(content, dict):
-                    # If content is a dictionary of file contents
-                    return int(sum(len(str(c).split()) for c in content.values()) * 1.3)
-                elif isinstance(content, str):
-                    # If content is a single string
-                    return int(len(content.split()) * 1.3)
-                else:
-                    # If content is in another format, return 0
-                    return 0
-
-            # Print formatted repository information
-            if isinstance(summary, dict):
-                repo_name = summary.get("name", "Unknown")
-                file_count = len(tree) if tree else 0
-            else:
-                repo_name = str(repo_path).split('/')[-1]
-                file_count = len(tree) if tree else 0
-
-            token_count = estimate_tokens(content)
-
-            print("\nRepository Information:")
-            print("-" * 50)
-            print(f"📦 Repository: {repo_name}")
-            print(f"📄 Files analyzed: {file_count}")
-            print(f"🔤 Estimated tokens: {token_count:,}")
-
             # Extract metadata
-            metadata = self._extract_metadata(summary, tree)
+            metadata = self._extract_metadata(summary, tree, str(repo_path))
 
             # Process content into chunks
             processed_chunks = []
+            chunk_id = 0
 
             if isinstance(content, dict):
                 # Handle dictionary of file contents
                 for file_path, file_content in content.items():
-                    if isinstance(file_content, str):
-                        chunk = {
-                            "text": file_content,
-                            "metadata": {
-                                **metadata,
-                                "file_path": file_path,
-                                "source": str(repo_path),
-                                "document_id": document_id
+                    if isinstance(file_content, str) and file_content.strip():  # Only process non-empty content
+                        # Split content into chunks
+                        text_chunks = self._chunk_text(file_content)
+
+                        for text_chunk in text_chunks:
+                            chunk = {
+                                "text": text_chunk,
+                                "metadata": {
+                                    **metadata,
+                                    "source": str(repo_path),
+                                    "document_id": document_id,
+                                    "chunk_id": chunk_id
+                                }
                             }
-                        }
-                        processed_chunks.append(chunk)
+                            processed_chunks.append(chunk)
+                            chunk_id += 1
             elif isinstance(content, str):
                 # Handle single string content
-                chunk = {
-                    "text": content,
-                    "metadata": {
-                        **metadata,
-                        "file_path": "repository_content.txt",
-                        "source": str(repo_path),
-                        "document_id": document_id
+                text_chunks = self._chunk_text(content)
+
+                for text_chunk in text_chunks:
+                    chunk = {
+                        "text": text_chunk,
+                        "metadata": {
+                            **metadata,
+                            "source": str(repo_path),
+                            "document_id": document_id,
+                            "chunk_id": chunk_id
+                        }
                     }
-                }
-                processed_chunks.append(chunk)
+                    processed_chunks.append(chunk)
+                    chunk_id += 1
 
             return processed_chunks, document_id
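
After this hunk, every element of processed_chunks shares the repository-level metadata and carries a running chunk_id. Illustratively (placeholder values, not output from the commit):

{
    "text": "...one chunk of file or repository text...",
    "metadata": {
        "repo_name": "owner/repo",                   # from extract_repo_name
        "file_count": 42,                            # placeholder
        "source": "https://github.com/owner/repo",
        "document_id": "6f1c...-uuid4",              # placeholder
        "chunk_id": 0                                # increments across all chunks
    }
}

Note that file_path is no longer part of the metadata, so provenance is tracked at the repository level via source and document_id rather than per file.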
@@ -132,9 +143,11 @@ def main():
     parser.add_argument("--input", required=True,
                         help="Input repository path or GitHub URL")
     parser.add_argument("--output", required=True, help="Output JSON file for chunks")
+    parser.add_argument("--chunk-size", type=int, default=500,
+                        help="Maximum size of text chunks")
 
     args = parser.parse_args()
-    processor = RepoProcessor()
+    processor = RepoProcessor(chunk_size=args.chunk_size)
 
     try:
         # Create output directory if it doesn't exist
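
Taken together with the new flag, a run might look like this (the script name repo_processor.py is hypothetical; --chunk-size falls back to 500 when omitted):

python repo_processor.py --input https://github.com/owner/repo --output chunks.json --chunk-size 800
python repo_processor.py --input ./local-checkout --output chunks.json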