Skip to content

Commit 1573aaf

Browse files
committed
fix: Improve PDF processing robustness - Handle both dictionary and DocChunk objects - Add proper URL processing support - Create output directory if it does not exist - Fix chunk metadata extraction
1 parent 3f1fd46 commit 1573aaf

File tree

1 file changed

+18
-6
lines changed

1 file changed

+18
-6
lines changed

agentic_rag/pdf_processor.py

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -34,12 +34,16 @@ def process_pdf(self, file_path: str | Path) -> List[Dict[str, Any]]:
3434
# Process chunks into a standardized format
3535
processed_chunks = []
3636
for chunk in chunks:
37+
# Handle both dictionary and DocChunk objects
38+
text = chunk.text if hasattr(chunk, 'text') else chunk.get('text', '')
39+
meta = chunk.meta if hasattr(chunk, 'meta') else chunk.get('meta', {})
40+
3741
processed_chunk = {
38-
"text": chunk["text"],
42+
"text": text,
3943
"metadata": {
4044
"source": str(file_path),
41-
"headings": chunk["meta"].get("headings", []),
42-
"page_numbers": self._extract_page_numbers(chunk["meta"]),
45+
"headings": meta.get("headings", []),
46+
"page_numbers": self._extract_page_numbers(meta),
4347
}
4448
}
4549
processed_chunks.append(processed_chunk)
@@ -63,12 +67,16 @@ def process_pdf_url(self, url: str) -> List[Dict[str, Any]]:
6367
# Process chunks into a standardized format
6468
processed_chunks = []
6569
for chunk in chunks:
70+
# Handle both dictionary and DocChunk objects
71+
text = chunk.text if hasattr(chunk, 'text') else chunk.get('text', '')
72+
meta = chunk.meta if hasattr(chunk, 'meta') else chunk.get('meta', {})
73+
6674
processed_chunk = {
67-
"text": chunk["text"],
75+
"text": text,
6876
"metadata": {
6977
"source": url,
70-
"headings": chunk["meta"].get("headings", []),
71-
"page_numbers": self._extract_page_numbers(chunk["meta"]),
78+
"headings": meta.get("headings", []),
79+
"page_numbers": self._extract_page_numbers(meta),
7280
}
7381
}
7482
processed_chunks.append(processed_chunk)
@@ -115,6 +123,10 @@ def main():
115123
processor = PDFProcessor(tokenizer=args.tokenizer)
116124

117125
try:
126+
# Create output directory if it doesn't exist
127+
output_dir = Path(args.output).parent
128+
output_dir.mkdir(parents=True, exist_ok=True)
129+
118130
if is_url(args.input):
119131
print(f"\nProcessing PDF from URL: {args.input}")
120132
print("=" * 50)

0 commit comments

Comments
 (0)