
Commit 5d2d6f3 (parent 314c7d6)

feat: add repository processing and language fix

- Add RepoProcessor for GitHub repositories
- Update VectorStore with a repository collection
- Enhance RAG agents to use repository context
- Fix language selection in the Gradio app
File tree: 5 files changed, 192 additions, 22 deletions

agentic_rag/gradio_app.py (4 additions, 1 deletion)

@@ -68,9 +68,12 @@ def chat(message: str, history: List[List[str]], agent_type: str, use_cot: bool,
     if not agent:
         return history + [[message, "Agent not available. Please check your configuration."]]
 
+    # Convert language selection to language code
+    lang_code = "es" if language == "Spanish" else "en"
+
     # Set CoT option and language
     agent.use_cot = use_cot
-    agent.language = language
+    agent.language = lang_code
 
     # Process query
    response = agent.process_query(message)
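
The fix matters because the agents expect a two-letter language code while the Gradio dropdown supplies a display name ("English"/"Spanish"); previously the raw label was assigned to agent.language. A minimal sketch of how such a dropdown might feed chat() — the component names, labels, and import path below are assumptions for illustration, not taken from this commit:

    import gradio as gr

    from gradio_app import chat  # the handler patched above (import path assumed)

    with gr.Blocks() as demo:
        chatbot = gr.Chatbot()
        msg = gr.Textbox(label="Message")
        agent_type = gr.Dropdown(["local", "openai"], value="local", label="Agent")  # assumed choices
        use_cot = gr.Checkbox(label="Use Chain of Thought")
        language = gr.Dropdown(["English", "Spanish"], value="English", label="Language")

        # chat() receives the display name (e.g. "Spanish") and maps it to "es" itself
        msg.submit(chat, inputs=[msg, chatbot, agent_type, use_cot, language], outputs=chatbot)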

agentic_rag/local_rag_agent.py (27 additions, 14 deletions)

@@ -165,27 +165,40 @@ def process_query(self, query: str) -> Dict[str, Any]:
 
         # First try to get context from PDF documents
         logger.info("Querying PDF collection...")
-        context = self.vector_store.query_pdf_collection(query)
-        logger.info(f"Retrieved {len(context)} context chunks")
+        pdf_context = self.vector_store.query_pdf_collection(query)
+        logger.info(f"Retrieved {len(pdf_context)} PDF context chunks")
 
-        if context:
-            # If we found relevant PDF context, use it
-            for i, ctx in enumerate(context):
+        # Then try repository documents
+        logger.info("Querying repository collection...")
+        repo_context = self.vector_store.query_repo_collection(query)
+        logger.info(f"Retrieved {len(repo_context)} repository context chunks")
+
+        # Combine context from both sources
+        all_context = pdf_context + repo_context
+
+        if all_context:
+            # Log context sources
+            for i, ctx in enumerate(all_context):
                 source = ctx["metadata"].get("source", "Unknown")
-                pages = ctx["metadata"].get("page_numbers", [])
-                logger.info(f"Context chunk {i+1}:")
-                logger.info(f"- Source: {source}")
-                logger.info(f"- Pages: {pages}")
+                if "page_numbers" in ctx["metadata"]:
+                    pages = ctx["metadata"].get("page_numbers", [])
+                    logger.info(f"Context chunk {i+1} (PDF):")
+                    logger.info(f"- Source: {source}")
+                    logger.info(f"- Pages: {pages}")
+                else:
+                    file_path = ctx["metadata"].get("file_path", "Unknown")
+                    logger.info(f"Context chunk {i+1} (Repository):")
+                    logger.info(f"- Source: {source}")
+                    logger.info(f"- File: {file_path}")
                 logger.info(f"- Content preview: {ctx['content'][:100]}...")
 
-            logger.info("Generating response with PDF context...")
-            response = self._generate_response(query, context)
+            logger.info("Generating response with context...")
+            response = self._generate_response(query, all_context)
             logger.info("Response generated successfully")
             return response
 
-        # If no PDF context found or if it's a general knowledge query,
-        # use general knowledge
-        logger.info("No relevant PDF context found or general knowledge query detected")
+        # If no context found, use general knowledge
+        logger.info("No relevant context found")
         logger.info("Using general knowledge response...")
         return self._generate_general_response(query)
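
The PDF-vs-repository branching in the logging loop is driven entirely by metadata shape: PDF chunks carry page_numbers, repository chunks carry file_path. A small self-contained sketch of the two chunk shapes the loop expects (field values are illustrative only, not from the commit):

    pdf_chunk = {
        "content": "Self-attention lets every token attend to every other token ...",
        "metadata": {"source": "attention.pdf", "page_numbers": [3, 4]},
    }
    repo_chunk = {
        "content": "def ingest(path):\n    ...",
        "metadata": {"source": "https://github.com/user/repo", "file_path": "src/ingest.py"},
    }

    # Same discrimination the logging loop above performs
    for i, ctx in enumerate([pdf_chunk, repo_chunk]):
        kind = "PDF" if "page_numbers" in ctx["metadata"] else "Repository"
        print(f"Context chunk {i+1} ({kind}): {ctx['metadata']['source']}")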

agentic_rag/rag_agent.py (10 additions, 7 deletions)

@@ -63,15 +63,18 @@ def process_query(self, query: str) -> Dict[str, Any]:
         if analysis.query_type == "unsupported":
             return self._generate_general_response(query)
 
-        # Retrieve relevant context based on query type
-        if analysis.query_type == "pdf_documents":
-            context = self.vector_store.query_pdf_collection(query)
-        else:
-            context = self.vector_store.query_general_collection(query)
+        # First try to get context from PDF documents
+        pdf_context = self.vector_store.query_pdf_collection(query)
+
+        # Then try repository documents
+        repo_context = self.vector_store.query_repo_collection(query)
+
+        # Combine all context
+        all_context = pdf_context + repo_context
 
         # Generate response using context if available, otherwise use general knowledge
-        if context and analysis.requires_context:
-            response = self._generate_response(query, context)
+        if all_context and analysis.requires_context:
+            response = self._generate_response(query, all_context)
         else:
            response = self._generate_general_response(query)
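
Design note: the agent no longer routes to a single collection based on analysis.query_type; it now queries both collections every time and concatenates the results, so up to 2 × n_results chunks (six by default) can reach the prompt. If prompt size becomes a concern, one possible cap is sketched below. This helper is hypothetical, not part of the commit; it relies only on the fact that each collection returns its hits best-first:

    from typing import Any, Dict, List

    def cap_context(pdf_context: List[Dict[str, Any]],
                    repo_context: List[Dict[str, Any]],
                    max_chunks: int = 4) -> List[Dict[str, Any]]:
        """Interleave per-collection results (each already best-first) and truncate."""
        combined: List[Dict[str, Any]] = []
        for pdf_hit, repo_hit in zip(pdf_context, repo_context):
            combined.extend([pdf_hit, repo_hit])
        # Append the tail of whichever list was longer
        combined.extend(pdf_context[len(repo_context):] or repo_context[len(pdf_context):])
        return combined[:max_chunks]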

agentic_rag/repo_processor.py (new file, 112 additions)

@@ -0,0 +1,112 @@
+from pathlib import Path
+from typing import List, Dict, Any, Tuple
+import json
+import argparse
+from urllib.parse import urlparse
+import warnings
+import uuid
+from gitingest import ingest
+
+def is_github_url(url: str) -> bool:
+    """Check if a string is a valid GitHub URL"""
+    try:
+        parsed = urlparse(url)
+        return parsed.netloc.lower() == "github.com"
+    except:
+        return False
+
+class RepoProcessor:
+    def __init__(self):
+        """Initialize repository processor"""
+        pass
+
+    def _extract_metadata(self, summary: Dict[str, Any], tree: Dict[str, Any]) -> Dict[str, Any]:
+        """Extract metadata from repository summary and tree"""
+        return {
+            "repo_name": summary.get("name", ""),
+            "description": summary.get("description", ""),
+            "language": summary.get("language", ""),
+            "topics": summary.get("topics", []),
+            "stars": summary.get("stars", 0),
+            "forks": summary.get("forks", 0),
+            "last_updated": summary.get("updated_at", ""),
+            "file_count": len(tree) if tree else 0
+        }
+
+    def process_repo(self, repo_path: str | Path) -> Tuple[List[Dict[str, Any]], str]:
+        """Process a repository and return chunks of content with metadata"""
+        try:
+            # Generate a unique document ID
+            document_id = str(uuid.uuid4())
+
+            # Check if it's a GitHub URL
+            if isinstance(repo_path, str) and is_github_url(repo_path):
+                print(f"Processing GitHub repository: {repo_path}")
+            else:
+                print(f"Processing local repository: {repo_path}")
+
+            # Ingest repository
+            summary, tree, content = ingest(str(repo_path))
+
+            # Print repository information
+            print("\nRepository Summary:")
+            print(json.dumps(summary, indent=2))
+            print("\nFile Tree:")
+            print(json.dumps(tree, indent=2))
+
+            # Extract metadata
+            metadata = self._extract_metadata(summary, tree)
+
+            # Process content into chunks
+            processed_chunks = []
+            for file_path, file_content in content.items():
+                chunk = {
+                    "text": file_content,
+                    "metadata": {
+                        **metadata,
+                        "file_path": file_path,
+                        "source": str(repo_path),
+                        "document_id": document_id
+                    }
+                }
+                processed_chunks.append(chunk)
+
+            return processed_chunks, document_id
+
+        except Exception as e:
+            raise Exception(f"Error processing repository {repo_path}: {str(e)}")
+
+def main():
+    parser = argparse.ArgumentParser(description="Process GitHub repositories and extract content")
+    parser.add_argument("--input", required=True,
+                        help="Input repository path or GitHub URL")
+    parser.add_argument("--output", required=True, help="Output JSON file for chunks")
+
+    args = parser.parse_args()
+    processor = RepoProcessor()
+
+    try:
+        # Create output directory if it doesn't exist
+        output_dir = Path(args.output).parent
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        print(f"\nProcessing repository: {args.input}")
+        print("=" * 50)
+
+        chunks, doc_id = processor.process_repo(args.input)
+
+        # Save chunks to JSON
+        with open(args.output, 'w', encoding='utf-8') as f:
+            json.dump(chunks, f, ensure_ascii=False, indent=2)
+
+        print("\nSummary:")
+        print(f"✓ Processed {len(chunks)} chunks")
+        print(f"✓ Document ID: {doc_id}")
+        print(f"✓ Saved to {args.output}")
+
+    except Exception as e:
+        print(f"\n✗ Error: {str(e)}")
+        exit(1)
+
+if __name__ == "__main__":
+    main()
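
Example invocation of the new CLI (the repository URL and output path are placeholders):

    python agentic_rag/repo_processor.py --input https://github.com/user/repo --output chunks/repo_chunks.json

Note that process_repo treats summary and tree as dict-like and content as a mapping of file paths to file text; gitingest versions whose ingest() returns plain strings would need those calls adapted.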

agentic_rag/store.py (39 additions, 0 deletions)

@@ -21,6 +21,10 @@ def __init__(self, persist_directory: str = "embeddings"):
             name="web_documents",
             metadata={"hnsw:space": "cosine"}
         )
+        self.repo_collection = self.client.get_or_create_collection(
+            name="repository_documents",
+            metadata={"hnsw:space": "cosine"}
+        )
         self.general_collection = self.client.get_or_create_collection(
             name="general_knowledge",
             metadata={"hnsw:space": "cosine"}

@@ -94,6 +98,23 @@ def add_general_knowledge(self, chunks: List[Dict[str, Any]], source_id: str):
             ids=ids
         )
 
+    def add_repo_chunks(self, chunks: List[Dict[str, Any]], document_id: str):
+        """Add chunks from a repository to the vector store"""
+        if not chunks:
+            return
+
+        # Prepare data for ChromaDB
+        texts = [chunk["text"] for chunk in chunks]
+        metadatas = [self._sanitize_metadata(chunk["metadata"]) for chunk in chunks]
+        ids = [f"{document_id}_{i}" for i in range(len(chunks))]
+
+        # Add to collection
+        self.repo_collection.add(
+            documents=texts,
+            metadatas=metadatas,
+            ids=ids
+        )
+
     def query_pdf_collection(self, query: str, n_results: int = 3) -> List[Dict[str, Any]]:
         """Query the PDF documents collection"""
         results = self.pdf_collection.query(

@@ -147,6 +168,24 @@ def query_general_collection(self, query: str, n_results: int = 3) -> List[Dict[
             formatted_results.append(result)
 
         return formatted_results
+
+    def query_repo_collection(self, query: str, n_results: int = 3) -> List[Dict[str, Any]]:
+        """Query the repository documents collection"""
+        results = self.repo_collection.query(
+            query_texts=[query],
+            n_results=n_results
+        )
+
+        # Format results
+        formatted_results = []
+        for i in range(len(results["documents"][0])):
+            result = {
+                "content": results["documents"][0][i],
+                "metadata": results["metadatas"][0][i]
+            }
+            formatted_results.append(result)
+
+        return formatted_results
 
 def main():
    parser = argparse.ArgumentParser(description="Manage vector store")
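
Taken together, the new pieces form a pipeline: RepoProcessor produces chunks plus a document ID, add_repo_chunks stores them in the repository_documents collection, and query_repo_collection retrieves them. A minimal end-to-end sketch, assuming the class in store.py is named VectorStore (as the commit message suggests) and the package is importable; the URL and query are placeholders:

    from agentic_rag.repo_processor import RepoProcessor
    from agentic_rag.store import VectorStore  # class name per the commit message

    processor = RepoProcessor()
    chunks, doc_id = processor.process_repo("https://github.com/user/repo")  # placeholder URL

    store = VectorStore(persist_directory="embeddings")
    store.add_repo_chunks(chunks, document_id=doc_id)

    for hit in store.query_repo_collection("Where is ingestion implemented?", n_results=3):
        print(hit["metadata"]["file_path"], "->", hit["content"][:80])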
