
Commit dd1ac41

feat: enhance RAG agents and PDF processing

1 parent a64f264 commit dd1ac41

5 files changed: +89 additions, -42 deletions


agentic_rag/README.md

Lines changed: 2 additions & 0 deletions
@@ -176,6 +176,8 @@ First, we process a document and query it using the local model. Then, we add th
 # 1. Process the PDF
 python pdf_processor.py --input example.pdf --output chunks.json

+#python pdf_processor.py --input https://arxiv.org/pdf/2203.06605 --output chunks.json
+
 # 2. Add to vector store
 python store.py --add chunks.json
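A note on the new commented-out line: it documents that `--input` also accepts a URL. Below is a sketch of the programmatic equivalent, with the caveat that the processor's class name never appears in this diff, so `PDFProcessor` is an assumption; `process_pdf` is typed `str | Path`, and the README change implies a plain URL string works because the underlying docling converter can fetch remote documents.

```python
# Hypothetical usage; `PDFProcessor` is an assumed class name (only its
# methods appear in the pdf_processor.py diff below). process_pdf() accepts
# str | Path, and per the README change a URL string should work too.
from pdf_processor import PDFProcessor

processor = PDFProcessor()
chunks = processor.process_pdf("https://arxiv.org/pdf/2203.06605")
print(f"Extracted {len(chunks)} chunks")
```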

agentic_rag/local_rag_agent.py

Lines changed: 35 additions & 20 deletions
@@ -157,14 +157,10 @@ def process_query(self, query: str) -> Dict[str, Any]:
         logger.info(f"- Requires context: {analysis.requires_context}")
         logger.info(f"- Reasoning: {analysis.reasoning}")

-        # If query type is unsupported, return early
+        # If query type is unsupported, use general knowledge
         if analysis.query_type == "unsupported":
-            logger.warning("Query type is unsupported")
-            return {
-                "answer": "I apologize, but I don't have the information to answer this query.",
-                "reasoning": analysis.reasoning,
-                "context": []
-            }
+            logger.info("Query type is unsupported, using general knowledge...")
+            return self._generate_general_response(query)

         # First try to get context from PDF documents
         logger.info("Querying PDF collection...")

@@ -187,19 +183,10 @@ def process_query(self, query: str) -> Dict[str, Any]:
             return response

         # If no PDF context found or if it's a general knowledge query,
-        # use the LLM directly
-        if analysis.query_type == "general_knowledge" or not context:
-            logger.info("No relevant PDF context found or general knowledge query detected")
-            logger.info("Falling back to direct LLM response...")
-            return self._generate_direct_response(query)
-
-        # This case should rarely happen, but just in case
-        logger.warning("No relevant context found and query type is not general knowledge")
-        return {
-            "answer": "I couldn't find relevant information to answer your query.",
-            "reasoning": analysis.reasoning,
-            "context": []
-        }
+        # use general knowledge
+        logger.info("No relevant PDF context found or general knowledge query detected")
+        logger.info("Using general knowledge response...")
+        return self._generate_general_response(query)

@@ -240,6 +227,34 @@ def _generate_response(self, query: str, context: List[Dict[str, Any]]) -> Dict[
             "context": context
         }

+    def _generate_general_response(self, query: str) -> Dict[str, Any]:
+        """Generate a response using general knowledge when no context is available"""
+        logger.info("Generating general knowledge response...")
+
+        if self.use_cot:
+            prompt = f"""You are a helpful AI assistant. While I don't have specific information from my document collection about this query, I'll share what I generally know about it.
+
+Please answer the following query using chain of thought reasoning:
+
+Query: {query}
+
+Let's think about this step by step:"""
+        else:
+            prompt = f"""You are a helpful AI assistant. While I don't have specific information from my document collection about this query, I'll share what I generally know about it.
+
+Query: {query}
+
+Answer:"""
+
+        logger.info("Generating response using local model...")
+        response = self._generate_text(prompt, max_length=1024)
+        logger.info("Response generation complete")
+
+        return {
+            "answer": "I didn't find specific information in my documents, but here's what I know about it:\n\n" + response,
+            "context": []
+        }
+
 def main():
     parser = argparse.ArgumentParser(description="Query documents using local Mistral model")
     parser.add_argument("--query", required=True, help="Query to process")

agentic_rag/pdf_processor.py

Lines changed: 20 additions & 9 deletions
@@ -30,7 +30,7 @@ def __init__(self, tokenizer: str = "BAAI/bge-small-en-v1.5"):
         warnings.filterwarnings('ignore', category=UserWarning, module='transformers.modeling_utils')

         self.converter = DocumentConverter()
-        self.chunker = HybridChunker(tokenizer=tokenizer, max_chunk_size=256) # Reduced chunk size for token length
+        self.chunker = HybridChunker(tokenizer=tokenizer, max_chunk_size=200) # Further reduced chunk size

     def _extract_metadata(self, meta: Any) -> Dict[str, Any]:
         """Safely extract metadata from various object types"""

@@ -60,6 +60,15 @@ def _extract_metadata(self, meta: Any) -> Dict[str, Any]:
             "page_numbers": []
         }

+    def _try_chunk_with_size(self, document: Any, chunk_size: int) -> List[Any]:
+        """Try chunking with a specific size, return None if it fails"""
+        try:
+            self.chunker.max_chunk_size = chunk_size
+            return list(self.chunker.chunk(document))
+        except Exception as e:
+            print(f"Warning: Chunking failed with size {chunk_size}: {str(e)}")
+            return None
+
     def process_pdf(self, file_path: str | Path) -> List[Dict[str, Any]]:
         """Process a PDF file and return chunks of text with metadata"""
         try:

@@ -71,14 +80,16 @@ def process_pdf(self, file_path: str | Path) -> List[Dict[str, Any]]:
             if not conv_result or not conv_result.document:
                 raise ValueError(f"Failed to convert PDF: {file_path}")

-            # Chunk the document with error handling
-            try:
-                chunks = list(self.chunker.chunk(conv_result.document))
-            except Exception as chunk_error:
-                print(f"Warning: Error during chunking: {str(chunk_error)}")
-                # Fallback to smaller chunk size if needed
-                self.chunker.max_chunk_size = 128
-                chunks = list(self.chunker.chunk(conv_result.document))
+            # Try chunking with progressively smaller sizes
+            chunks = None
+            for chunk_size in [200, 150, 100, 75]:
+                chunks = self._try_chunk_with_size(conv_result.document, chunk_size)
+                if chunks:
+                    print(f"Successfully chunked with size {chunk_size}")
+                    break
+
+            if not chunks:
+                raise ValueError("Failed to chunk document with any chunk size")

             # Process chunks into a standardized format
             processed_chunks = []
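The descending ladder [200, 150, 100, 75] replaces the single hardcoded fallback to 128. One subtlety: `_try_chunk_with_size` returns None on error, but the loop tests truthiness, so an empty (yet successful) chunk list also falls through to the next size. The pattern in generic form, with names invented for illustration:

```python
from typing import Any, Callable, List, Optional

def chunk_with_fallback(
    try_chunk: Callable[[Any, int], Optional[List[Any]]],
    document: Any,
    sizes: tuple = (200, 150, 100, 75),
) -> List[Any]:
    """Generic form of the retry ladder added to process_pdf: attempt each
    size in descending order and keep the first truthy (non-empty) result."""
    for size in sizes:
        chunks = try_chunk(document, size)
        if chunks:  # None (error) and [] (no chunks) both trigger a retry
            return chunks
    raise ValueError("Failed to chunk document with any chunk size")
```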

agentic_rag/rag_agent.py

Lines changed: 30 additions & 12 deletions
@@ -58,29 +58,21 @@ def process_query(self, query: str) -> Dict[str, Any]:
         # Analyze the query
         analysis = self._analyze_query(query)

-        # If query type is unsupported, return early
+        # If query type is unsupported, use general knowledge
         if analysis.query_type == "unsupported":
-            return {
-                "answer": "I apologize, but I don't have the information to answer this query.",
-                "reasoning": analysis.reasoning,
-                "context": []
-            }
+            return self._generate_general_response(query)

         # Retrieve relevant context based on query type
         if analysis.query_type == "pdf_documents":
             context = self.vector_store.query_pdf_collection(query)
         else:
             context = self.vector_store.query_general_collection(query)

-        # Generate response using context
+        # Generate response using context if available, otherwise use general knowledge
         if context and analysis.requires_context:
             response = self._generate_response(query, context)
         else:
-            response = {
-                "answer": "I couldn't find relevant information to answer your query.",
-                "reasoning": analysis.reasoning,
-                "context": []
-            }
+            response = self._generate_general_response(query)

         return response

@@ -128,6 +120,32 @@ def _generate_response(self, query: str, context: List[Dict[str, Any]]) -> Dict[
             "context": context
         }

+    def _generate_general_response(self, query: str) -> Dict[str, Any]:
+        """Generate a response using general knowledge when no context is available"""
+        if self.use_cot:
+            template = """You are a helpful AI assistant. While I don't have specific information from my document collection about this query, I'll share what I generally know about it.
+
+Please answer the following query using chain of thought reasoning:
+
+Query: {query}
+
+Let's think about this step by step:"""
+        else:
+            template = """You are a helpful AI assistant. While I don't have specific information from my document collection about this query, I'll share what I generally know about it.
+
+Query: {query}
+
+Answer:"""
+
+        prompt = ChatPromptTemplate.from_template(template)
+        messages = prompt.format_messages(query=query)
+        response = self.llm.invoke(messages)
+
+        return {
+            "answer": "I didn't find specific information in my documents, but here's what I know about it:\n\n" + response.content,
+            "context": []
+        }
+
 def main():
     parser = argparse.ArgumentParser(description="Query documents using OpenAI GPT-4")
     parser.add_argument("--query", required=True, help="Query to process")

agentic_rag/requirements.txt

Lines changed: 2 additions & 1 deletion
@@ -13,4 +13,5 @@ accelerate
 pyyaml
 trafilatura
 gradio
-lxml_html_clean
+lxml_html_clean
+langchain
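The new `langchain` entry backs the `ChatPromptTemplate` import used by `_generate_general_response` in rag_agent.py above. The `lxml_html_clean` line is removed and re-added unchanged, which in a GitHub diff usually indicates a newline-at-end-of-file fix rather than a content change.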
