Skip to content

Commit a106d83

Browse files
committed
feat(agentic_rag): Enhance repository processing and collection-specific querying - Add repo processing to Gradio, add collection-specific querying, simplify metadata, improve chunking
1 parent 7e1628b commit a106d83

File tree

3 files changed

+127
-75
lines changed

3 files changed

+127
-75
lines changed

agentic_rag/gradio_app.py

Lines changed: 44 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
from pdf_processor import PDFProcessor
1010
from web_processor import WebProcessor
11+
from repo_processor import RepoProcessor
1112
from store import VectorStore
1213
from local_rag_agent import LocalRAGAgent
1314
from rag_agent import RAGAgent
@@ -28,6 +29,7 @@ def load_config():
2829
# Initialize components
2930
pdf_processor = PDFProcessor()
3031
web_processor = WebProcessor()
32+
repo_processor = RepoProcessor()
3133
vector_store = VectorStore()
3234

3335
# Initialize agents
@@ -60,8 +62,22 @@ def process_url(url: str) -> str:
6062
except Exception as e:
6163
return f"✗ Error processing URL: {str(e)}"
6264

63-
def chat(message: str, history: List[List[str]], agent_type: str, use_cot: bool, language: str) -> List[List[str]]:
64-
"""Process chat message using selected agent"""
65+
def process_repo(repo_path: str) -> str:
    """Process a repository (local path or URL) and index it in the knowledge base.

    Returns a human-readable status string for display in the Gradio UI;
    errors are reported in the return value rather than raised.
    """
    try:
        # Chunk the repository content, then store the chunks for retrieval.
        extracted, doc_id = repo_processor.process_repo(repo_path)
        if not extracted:
            return "✗ No content extracted from repository"

        vector_store.add_repo_chunks(extracted, document_id=doc_id)
        return f"✓ Successfully processed repository and added {len(extracted)} chunks to knowledge base (ID: {doc_id})"
    except Exception as e:
        # Surface any failure as a status message instead of crashing the UI.
        return f"✗ Error processing repository: {str(e)}"
78+
79+
def chat(message: str, history: List[List[str]], agent_type: str, use_cot: bool, language: str, collection: str) -> List[List[str]]:
80+
"""Process chat message using selected agent and collection"""
6581
try:
6682
# Select appropriate agent
6783
agent = local_agent if agent_type == "Local (Mistral)" else openai_agent
@@ -75,8 +91,15 @@ def chat(message: str, history: List[List[str]], agent_type: str, use_cot: bool,
7591
agent.use_cot = use_cot
7692
agent.language = lang_code
7793

78-
# Process query
79-
response = agent.process_query(message)
94+
# Process query based on selected collection
95+
if collection == "PDF Collection":
96+
context = vector_store.query_pdf_collection(message)
97+
response = agent._generate_response(message, context) if context else agent._generate_general_response(message)
98+
elif collection == "Repository Collection":
99+
context = vector_store.query_repo_collection(message)
100+
response = agent._generate_response(message, context) if context else agent._generate_general_response(message)
101+
else: # General Knowledge
102+
response = agent._generate_general_response(message)
80103

81104
# Return updated history with new message pair
82105
history.append([message, response["answer"]])
@@ -91,7 +114,7 @@ def create_interface():
91114
gr.Markdown("""
92115
# 🤖 Agentic RAG System
93116
94-
Upload PDFs, process web content, and chat with your documents using local or OpenAI models.
117+
Upload PDFs, process web content, repositories, and chat with your documents using local or OpenAI models.
95118
""")
96119

97120
with gr.Tab("Document Processing"):
@@ -105,6 +128,11 @@ def create_interface():
105128
url_input = gr.Textbox(label="Enter URL")
106129
url_button = gr.Button("Process URL")
107130
url_output = gr.Textbox(label="URL Processing Output")
131+
132+
with gr.Column():
133+
repo_input = gr.Textbox(label="Enter Repository Path or URL")
134+
repo_button = gr.Button("Process Repository")
135+
repo_output = gr.Textbox(label="Repository Processing Output")
108136

109137
with gr.Tab("Chat Interface"):
110138
with gr.Row():
@@ -121,21 +149,29 @@ def create_interface():
121149
value="English",
122150
label="Response Language"
123151
)
152+
with gr.Column():
153+
collection_dropdown = gr.Dropdown(
154+
choices=["PDF Collection", "Repository Collection", "General Knowledge"],
155+
value="PDF Collection",
156+
label="Knowledge Collection"
157+
)
124158
chatbot = gr.Chatbot(height=400)
125159
msg = gr.Textbox(label="Your Message")
126160
clear = gr.Button("Clear Chat")
127161

128162
# Event handlers
129163
pdf_button.click(process_pdf, inputs=[pdf_file], outputs=[pdf_output])
130164
url_button.click(process_url, inputs=[url_input], outputs=[url_output])
165+
repo_button.click(process_repo, inputs=[repo_input], outputs=[repo_output])
131166
msg.submit(
132167
chat,
133168
inputs=[
134169
msg,
135170
chatbot,
136171
agent_dropdown,
137172
cot_checkbox,
138-
language_dropdown
173+
language_dropdown,
174+
collection_dropdown
139175
],
140176
outputs=[chatbot]
141177
)
@@ -148,12 +184,14 @@ def create_interface():
148184
1. **Document Processing**:
149185
- Upload PDFs using the file uploader
150186
- Process web content by entering URLs
187+
- Process repositories by entering paths or GitHub URLs
151188
- All processed content is added to the knowledge base
152189
153190
2. **Chat Interface**:
154191
- Select your preferred agent (Local Mistral or OpenAI)
155192
- Toggle Chain of Thought reasoning for more detailed responses
156193
- Choose your preferred response language (English or Spanish)
194+
- Select which knowledge collection to query
157195
- Chat with your documents using natural language
158196
159197
Note: OpenAI agent requires an API key in `.env` file

agentic_rag/repo_processor.py

Lines changed: 81 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -15,39 +15,68 @@ def is_github_url(url: str) -> bool:
1515
except:
1616
return False
1717

18+
def extract_repo_name(repo_path: str) -> str:
    """Extract a human-readable repository name from a path or GitHub URL.

    For GitHub URLs the name is returned in ``owner/repo`` form, with any
    trailing ``.git`` clone suffix stripped. For local paths the final
    directory name is used.

    Args:
        repo_path: Filesystem path or GitHub URL of the repository.

    Returns:
        Repository name, e.g. ``"owner/repo"`` or ``"my-project"``.
    """
    if is_github_url(repo_path):
        # URL shape: https://github.com/<owner>/<repo>[/...]
        parts = repo_path.rstrip('/').split('/')
        if len(parts) >= 5:
            # Fix: strip the common ".git" clone suffix so the stored
            # metadata name is clean ("owner/repo", not "owner/repo.git").
            return f"{parts[3]}/{parts[4].removesuffix('.git')}"

    # For local paths (or short URLs), use the last path component.
    return Path(repo_path).name
28+
1829
class RepoProcessor:
19-
def __init__(self):
20-
"""Initialize repository processor"""
21-
pass
30+
def __init__(self, chunk_size: int = 500):
    """Initialize the repository processor.

    Args:
        chunk_size: Maximum number of characters per text chunk produced
            when splitting file contents (default 500).
    """
    self.chunk_size = chunk_size  # character budget used by _chunk_text
2233

23-
def _extract_metadata(self, summary: Dict[str, Any], tree: Dict[str, Any]) -> Dict[str, Any]:
34+
def _extract_metadata(self, summary: Dict[str, Any], tree: Dict[str, Any], repo_path: str) -> Dict[str, Any]:
    """Extract metadata from repository summary and tree.

    Args:
        summary: Repository summary from ``ingest`` (may be a dict or a
            plain string depending on the gitingest version — TODO confirm).
        tree: Mapping of files in the repository; only its length is used.
        repo_path: Path or URL the repository was ingested from.

    Returns:
        Metadata dict with ``repo_name`` and ``file_count`` keys.
    """
    # Fix: the original branched on isinstance(summary, str), but both
    # branches built the identical dict — the branch was dead weight.
    # The name is always derived from the path/URL, never from summary.
    return {
        "repo_name": extract_repo_name(repo_path),
        "file_count": len(tree) if tree else 0,
    }
4850

51+
def _chunk_text(self, text: str) -> List[str]:
52+
"""Split text into chunks of roughly equal size"""
53+
# Split into sentences (roughly)
54+
sentences = [s.strip() for s in text.split('.') if s.strip()]
55+
56+
chunks = []
57+
current_chunk = []
58+
current_length = 0
59+
60+
for sentence in sentences:
61+
# Add period back
62+
sentence = sentence + '.'
63+
# If adding this sentence would exceed chunk size, save current chunk
64+
if current_length + len(sentence) > self.chunk_size and current_chunk:
65+
chunks.append(' '.join(current_chunk))
66+
current_chunk = []
67+
current_length = 0
68+
69+
current_chunk.append(sentence)
70+
current_length += len(sentence)
71+
72+
# Add any remaining text
73+
if current_chunk:
74+
chunks.append(' '.join(current_chunk))
75+
76+
return chunks
77+
4978
def process_repo(self, repo_path: str | Path) -> Tuple[List[Dict[str, Any]], str]:
50-
"""Process a repository and return chunks of content with metadata"""
79+
"""Process a repository and return chunks of text with metadata"""
5180
try:
5281
# Generate a unique document ID
5382
document_id = str(uuid.uuid4())
@@ -61,66 +90,48 @@ def process_repo(self, repo_path: str | Path) -> Tuple[List[Dict[str, Any]], str
6190
# Ingest repository
6291
summary, tree, content = ingest(str(repo_path))
6392

64-
# Calculate token count based on content type
65-
def estimate_tokens(content: Any) -> int:
66-
if isinstance(content, dict):
67-
# If content is a dictionary of file contents
68-
return int(sum(len(str(c).split()) for c in content.values()) * 1.3)
69-
elif isinstance(content, str):
70-
# If content is a single string
71-
return int(len(content.split()) * 1.3)
72-
else:
73-
# If content is in another format, return 0
74-
return 0
75-
76-
# Print formatted repository information
77-
if isinstance(summary, dict):
78-
repo_name = summary.get("name", "Unknown")
79-
file_count = len(tree) if tree else 0
80-
else:
81-
repo_name = str(repo_path).split('/')[-1]
82-
file_count = len(tree) if tree else 0
83-
84-
token_count = estimate_tokens(content)
85-
86-
print("\nRepository Information:")
87-
print("-" * 50)
88-
print(f"📦 Repository: {repo_name}")
89-
print(f"📄 Files analyzed: {file_count}")
90-
print(f"🔤 Estimated tokens: {token_count:,}")
91-
9293
# Extract metadata
93-
metadata = self._extract_metadata(summary, tree)
94+
metadata = self._extract_metadata(summary, tree, str(repo_path))
9495

9596
# Process content into chunks
9697
processed_chunks = []
98+
chunk_id = 0
9799

98100
if isinstance(content, dict):
99101
# Handle dictionary of file contents
100102
for file_path, file_content in content.items():
101-
if isinstance(file_content, str):
102-
chunk = {
103-
"text": file_content,
104-
"metadata": {
105-
**metadata,
106-
"file_path": file_path,
107-
"source": str(repo_path),
108-
"document_id": document_id
103+
if isinstance(file_content, str) and file_content.strip(): # Only process non-empty content
104+
# Split content into chunks
105+
text_chunks = self._chunk_text(file_content)
106+
107+
for text_chunk in text_chunks:
108+
chunk = {
109+
"text": text_chunk,
110+
"metadata": {
111+
**metadata,
112+
"source": str(repo_path),
113+
"document_id": document_id,
114+
"chunk_id": chunk_id
115+
}
109116
}
110-
}
111-
processed_chunks.append(chunk)
117+
processed_chunks.append(chunk)
118+
chunk_id += 1
112119
elif isinstance(content, str):
113120
# Handle single string content
114-
chunk = {
115-
"text": content,
116-
"metadata": {
117-
**metadata,
118-
"file_path": "repository_content.txt",
119-
"source": str(repo_path),
120-
"document_id": document_id
121+
text_chunks = self._chunk_text(content)
122+
123+
for text_chunk in text_chunks:
124+
chunk = {
125+
"text": text_chunk,
126+
"metadata": {
127+
**metadata,
128+
"source": str(repo_path),
129+
"document_id": document_id,
130+
"chunk_id": chunk_id
131+
}
121132
}
122-
}
123-
processed_chunks.append(chunk)
133+
processed_chunks.append(chunk)
134+
chunk_id += 1
124135

125136
return processed_chunks, document_id
126137

@@ -132,9 +143,11 @@ def main():
132143
parser.add_argument("--input", required=True,
133144
help="Input repository path or GitHub URL")
134145
parser.add_argument("--output", required=True, help="Output JSON file for chunks")
146+
parser.add_argument("--chunk-size", type=int, default=500,
147+
help="Maximum size of text chunks")
135148

136149
args = parser.parse_args()
137-
processor = RepoProcessor()
150+
processor = RepoProcessor(chunk_size=args.chunk_size)
138151

139152
try:
140153
# Create output directory if it doesn't exist

agentic_rag/requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,5 @@ pyyaml
1414
trafilatura
1515
gradio
1616
lxml_html_clean
17-
langchain
17+
langchain
18+
gitingest

0 commit comments

Comments
 (0)