Skip to content

Commit 3e3e3e3

Browse files
committed
feat: add web knowledge base
1 parent d3261e3 commit 3e3e3e3

File tree

4 files changed

+122
-120
lines changed

4 files changed

+122
-120
lines changed

agentic_rag/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ python main.py
116116
117117
The API will be available at `http://localhost:8000`. You can then use the API endpoints as described in the API Endpoints section below.
118118
119-
### 2. Using the Gradio Interface
119+
### 2. Using the Gradio Interface (Recommended)
120120
121121
The system provides a user-friendly web interface using Gradio, which allows you to:
122122
- Upload and process PDF documents

agentic_rag/gradio_app.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,17 @@ def chat(message: str, history: List[List[str]], agent_type: str, use_cot: bool,
9191
# Skip analysis for General Knowledge or when using standard chat interface (not CoT)
9292
skip_analysis = collection == "General Knowledge" or not use_cot
9393

94+
# Map collection names to actual collection names in vector store
95+
collection_mapping = {
96+
"PDF Collection": "pdf_documents",
97+
"Repository Collection": "repository_documents",
98+
"Web Knowledge Base": "web_documents",
99+
"General Knowledge": "general_knowledge"
100+
}
101+
102+
# Get the actual collection name
103+
actual_collection = collection_mapping.get(collection, "pdf_documents")
104+
94105
# Parse agent type to determine model and quantization
95106
quantization = None
96107
model_name = None
@@ -365,15 +376,17 @@ def create_interface():
365376
)
366377
with gr.Column(scale=1):
367378
standard_collection_dropdown = gr.Dropdown(
368-
choices=["PDF Collection", "Repository Collection", "General Knowledge"],
379+
choices=["PDF Collection", "Repository Collection", "Web Knowledge Base", "General Knowledge"],
369380
value="PDF Collection",
370-
label="Knowledge Collection"
381+
label="Select Knowledge Base",
382+
info="Choose which knowledge base to use for answering questions"
371383
)
372384
gr.Markdown("""
373385
> **Collection Selection**:
374386
> - This interface ALWAYS uses the selected collection without performing query analysis.
375387
> - "PDF Collection": Will ALWAYS search the PDF documents regardless of query type.
376388
> - "Repository Collection": Will ALWAYS search the repository code regardless of query type.
389+
> - "Web Knowledge Base": Will ALWAYS search the web content regardless of query type.
377390
> - "General Knowledge": Will ALWAYS use the model's built-in knowledge without searching collections.
378391
""")
379392
standard_chatbot = gr.Chatbot(height=400)
@@ -393,15 +406,17 @@ def create_interface():
393406
)
394407
with gr.Column(scale=1):
395408
cot_collection_dropdown = gr.Dropdown(
396-
choices=["PDF Collection", "Repository Collection", "General Knowledge"],
409+
choices=["PDF Collection", "Repository Collection", "Web Knowledge Base", "General Knowledge"],
397410
value="PDF Collection",
398-
label="Knowledge Collection"
411+
label="Select Knowledge Base",
412+
info="Choose which knowledge base to use for answering questions"
399413
)
400414
gr.Markdown("""
401415
> **Collection Selection**:
402416
> - When a specific collection is selected, the system will ALWAYS use that collection without analysis:
403417
> - "PDF Collection": Will ALWAYS search the PDF documents.
404418
> - "Repository Collection": Will ALWAYS search the repository code.
419+
> - "Web Knowledge Base": Will ALWAYS search the web content.
405420
> - "General Knowledge": Will ALWAYS use the model's built-in knowledge.
406421
> - This interface shows step-by-step reasoning and may perform query analysis when needed.
407422
""")
@@ -485,6 +500,7 @@ def create_interface():
485500
- Select which knowledge collection to query:
486501
- **PDF Collection**: Always searches PDF documents
487502
- **Repository Collection**: Always searches code repositories
503+
- **Web Knowledge Base**: Always searches web content
488504
- **General Knowledge**: Uses the model's built-in knowledge without searching collections
489505
490506
3. **Chain of Thought Chat Interface**:

agentic_rag/local_rag_agent.py

Lines changed: 66 additions & 101 deletions
Original file line numberDiff line numberDiff line change
@@ -245,126 +245,91 @@ def process_query(self, query: str) -> Dict[str, Any]:
245245
else:
246246
return self._generate_general_response(query)
247247
else:
248-
# For PDF or Repository collections, use context-based processing
248+
# For PDF, Repository, or Web collections, use context-based processing
249249
if self.use_cot:
250250
return self._process_query_with_cot(query)
251251
else:
252252
return self._process_query_standard(query)
253253

254254
def _process_query_with_cot(self, query: str) -> Dict[str, Any]:
255-
"""Process query using Chain of Thought reasoning with multiple agents"""
256-
logger.info("Processing query with Chain of Thought reasoning")
257-
258-
# Get initial context based on selected collection
259-
initial_context = []
260-
if self.collection == "PDF Collection":
261-
logger.info(f"Retrieving context from PDF Collection for query: '{query}'")
262-
pdf_context = self.vector_store.query_pdf_collection(query)
263-
initial_context.extend(pdf_context)
264-
logger.info(f"Retrieved {len(pdf_context)} chunks from PDF Collection")
265-
# Don't log individual sources to keep console clean
266-
elif self.collection == "Repository Collection":
267-
logger.info(f"Retrieving context from Repository Collection for query: '{query}'")
268-
repo_context = self.vector_store.query_repo_collection(query)
269-
initial_context.extend(repo_context)
270-
logger.info(f"Retrieved {len(repo_context)} chunks from Repository Collection")
271-
# Don't log individual sources to keep console clean
272-
# For General Knowledge, no context is needed
273-
else:
274-
logger.info("Using General Knowledge collection, no context retrieval needed")
275-
255+
"""Process query using Chain of Thought reasoning"""
276256
try:
277-
# Step 1: Planning
278-
logger.info("Step 1: Planning")
279-
if not self.agents or "planner" not in self.agents:
280-
logger.warning("No planner agent available, using direct response")
281-
return self._generate_general_response(query)
257+
# Get context based on collection type
258+
if self.collection == "PDF Collection":
259+
context = self.vector_store.query_pdf_collection(query)
260+
elif self.collection == "Repository Collection":
261+
context = self.vector_store.query_repo_collection(query)
262+
elif self.collection == "Web Knowledge Base":
263+
context = self.vector_store.query_web_collection(query)
264+
else:
265+
context = []
282266

283-
plan = self.agents["planner"].plan(query, initial_context)
284-
logger.info(f"Generated plan:\n{plan}")
267+
# Log number of chunks retrieved
268+
logger.info(f"Retrieved {len(context)} chunks from {self.collection}")
285269

286-
# Step 2: Research each step (if researcher is available)
287-
logger.info("Step 2: Research")
288-
research_results = []
289-
if self.agents.get("researcher") is not None and initial_context:
290-
for step in plan.split("\n"):
291-
if not step.strip():
292-
continue
293-
step_research = self.agents["researcher"].research(query, step)
294-
research_results.append({"step": step, "findings": step_research})
295-
# Don't log source indices to keep console clean
296-
logger.info(f"Research for step: {step}")
297-
else:
298-
# If no researcher or no context, use the steps directly
299-
research_results = [{"step": step, "findings": []} for step in plan.split("\n") if step.strip()]
300-
logger.info("No research performed (no researcher agent or no context available)")
270+
# Create agents if not already created
271+
if not self.agents:
272+
self.agents = create_agents(self.llm, self.vector_store)
301273

302-
# Step 3: Reasoning about each step
303-
logger.info("Step 3: Reasoning")
304-
if not self.agents.get("reasoner"):
305-
logger.warning("No reasoner agent available, using direct response")
306-
return self._generate_general_response(query)
274+
# Get planning step
275+
planning_result = self.agents["planner"].plan(query, context)
276+
logger.info("Planning step completed")
307277

308-
reasoning_steps = []
309-
for result in research_results:
310-
step_reasoning = self.agents["reasoner"].reason(
311-
query,
312-
result["step"],
313-
result["findings"] if result["findings"] else [{"content": "Using general knowledge", "metadata": {"source": "General Knowledge"}}]
314-
)
315-
reasoning_steps.append(step_reasoning)
316-
# Log just the step, not the full reasoning
317-
logger.info(f"Reasoning for step: {result['step']}")
278+
# Get research step
279+
research_result = self.agents["researcher"].research(query, context)
280+
logger.info("Research step completed")
318281

319-
# Step 4: Synthesize final answer
320-
logger.info("Step 4: Synthesis")
321-
if not self.agents.get("synthesizer"):
322-
logger.warning("No synthesizer agent available, using direct response")
323-
return self._generate_general_response(query)
282+
# Get reasoning step
283+
reasoning_result = self.agents["reasoner"].reason(query, research_result["context"])
284+
logger.info("Reasoning step completed")
324285

325-
final_answer = self.agents["synthesizer"].synthesize(query, reasoning_steps)
326-
logger.info("Final answer synthesized successfully")
286+
# Get synthesis step
287+
synthesis_result = self.agents["synthesizer"].synthesize(
288+
query,
289+
planning_result["context"],
290+
research_result["context"],
291+
reasoning_result["context"]
292+
)
293+
logger.info("Synthesis step completed")
327294

328295
return {
329-
"answer": final_answer,
330-
"context": initial_context,
331-
"reasoning_steps": reasoning_steps
296+
"answer": synthesis_result["answer"],
297+
"reasoning_steps": [
298+
planning_result["answer"],
299+
research_result["answer"],
300+
reasoning_result["answer"],
301+
synthesis_result["answer"]
302+
],
303+
"context": synthesis_result["context"]
332304
}
305+
333306
except Exception as e:
334307
logger.error(f"Error in CoT processing: {str(e)}")
335-
logger.info("Falling back to general response")
336-
return self._generate_general_response(query)
308+
raise
337309

338310
def _process_query_standard(self, query: str) -> Dict[str, Any]:
339-
"""Process query using standard approach without Chain of Thought"""
340-
# Initialize context variables
341-
pdf_context = []
342-
repo_context = []
343-
344-
# Get context based on selected collection
345-
if self.collection == "PDF Collection":
346-
logger.info(f"Retrieving context from PDF Collection for query: '{query}'")
347-
pdf_context = self.vector_store.query_pdf_collection(query)
348-
logger.info(f"Retrieved {len(pdf_context)} chunks from PDF Collection")
349-
# Don't log individual sources to keep console clean
350-
elif self.collection == "Repository Collection":
351-
logger.info(f"Retrieving context from Repository Collection for query: '{query}'")
352-
repo_context = self.vector_store.query_repo_collection(query)
353-
logger.info(f"Retrieved {len(repo_context)} chunks from Repository Collection")
354-
# Don't log individual sources to keep console clean
355-
356-
# Combine all context
357-
all_context = pdf_context + repo_context
358-
359-
# Generate response using context if available, otherwise use general knowledge
360-
if all_context:
361-
logger.info(f"Generating response using {len(all_context)} context chunks")
362-
response = self._generate_response(query, all_context)
363-
else:
364-
logger.info("No context found, using general knowledge")
365-
response = self._generate_general_response(query)
366-
367-
return response
311+
"""Process query using standard RAG approach"""
312+
try:
313+
# Get context based on collection type
314+
if self.collection == "PDF Collection":
315+
context = self.vector_store.query_pdf_collection(query)
316+
elif self.collection == "Repository Collection":
317+
context = self.vector_store.query_repo_collection(query)
318+
elif self.collection == "Web Knowledge Base":
319+
context = self.vector_store.query_web_collection(query)
320+
else:
321+
context = []
322+
323+
# Log number of chunks retrieved
324+
logger.info(f"Retrieved {len(context)} chunks from {self.collection}")
325+
326+
# Generate response using context
327+
response = self._generate_response(query, context)
328+
return response
329+
330+
except Exception as e:
331+
logger.error(f"Error in standard processing: {str(e)}")
332+
raise
368333

369334
def _generate_text(self, prompt: str, max_length: int = 512) -> str:
370335
"""Generate text using the local model"""
@@ -456,7 +421,7 @@ def main():
456421
parser.add_argument("--model", default="mistralai/Mistral-7B-Instruct-v0.2", help="Model to use")
457422
parser.add_argument("--quiet", action="store_true", help="Disable verbose logging")
458423
parser.add_argument("--use-cot", action="store_true", help="Enable Chain of Thought reasoning")
459-
parser.add_argument("--collection", choices=["PDF Collection", "Repository Collection", "General Knowledge"],
424+
parser.add_argument("--collection", choices=["PDF Collection", "Repository Collection", "General Knowledge", "Web Knowledge Base"],
460425
help="Specify which collection to query")
461426
parser.add_argument("--skip-analysis", action="store_true", help="Skip query analysis step")
462427
parser.add_argument("--verbose", action="store_true", help="Show full content of sources")

0 commit comments

Comments
 (0)