Skip to content

Commit 2308831

Browse files
committed
fix: Improve PDF processing robustness - Add better metadata extraction for different object types - Reduce chunk size to avoid token length errors - Add better error handling for metadata extraction - Support both attribute and dictionary access
1 parent 1573aaf commit 2308831

File tree

1 file changed

+64
-19
lines changed

1 file changed

+64
-19
lines changed

agentic_rag/pdf_processor.py

Lines changed: 64 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,35 @@ class PDFProcessor:
1818
def __init__(self, tokenizer: str = "BAAI/bge-small-en-v1.5"):
    """Initialize PDF processor with Docling components.

    Args:
        tokenizer: HuggingFace tokenizer name used by the chunker to
            measure chunk sizes in tokens.
    """
    self.converter = DocumentConverter()
    # Cap chunks at 384 tokens so they fit the embedding model's input
    # limit (avoids token-length errors downstream).
    self.chunker = HybridChunker(tokenizer=tokenizer, max_chunk_size=384)
22+
23+
def _extract_metadata(self, meta: Any) -> Dict[str, Any]:
    """Safely extract metadata from various object types.

    Supports both attribute-style objects and plain dictionaries; any
    unexpected shape or extraction error yields empty metadata instead
    of raising.

    Args:
        meta: Chunk metadata — an object with attributes, a dict, or
            anything else (treated as empty).

    Returns:
        Dict with "headings" (list) and "page_numbers" (list of ints).
    """
    try:
        if hasattr(meta, '__dict__'):
            # Attribute-style metadata object.
            headings = getattr(meta, "headings", [])
        elif isinstance(meta, dict):
            # Plain-dictionary metadata.
            headings = meta.get("headings", [])
        else:
            # Unknown shape — fall back to empty metadata.
            return {"headings": [], "page_numbers": []}
        return {
            "headings": headings,
            "page_numbers": self._extract_page_numbers(meta),
        }
    except Exception as e:
        print(f"Warning: Error extracting metadata: {str(e)}")
        return {"headings": [], "page_numbers": []}
2250

2351
def process_pdf(self, file_path: str | Path) -> List[Dict[str, Any]]:
2452
"""Process a PDF file and return chunks of text with metadata"""
@@ -38,13 +66,12 @@ def process_pdf(self, file_path: str | Path) -> List[Dict[str, Any]]:
3866
text = chunk.text if hasattr(chunk, 'text') else chunk.get('text', '')
3967
meta = chunk.meta if hasattr(chunk, 'meta') else chunk.get('meta', {})
4068

69+
metadata = self._extract_metadata(meta)
70+
metadata["source"] = str(file_path)
71+
4172
processed_chunk = {
4273
"text": text,
43-
"metadata": {
44-
"source": str(file_path),
45-
"headings": meta.get("headings", []),
46-
"page_numbers": self._extract_page_numbers(meta),
47-
}
74+
"metadata": metadata
4875
}
4976
processed_chunks.append(processed_chunk)
5077

@@ -71,13 +98,12 @@ def process_pdf_url(self, url: str) -> List[Dict[str, Any]]:
7198
text = chunk.text if hasattr(chunk, 'text') else chunk.get('text', '')
7299
meta = chunk.meta if hasattr(chunk, 'meta') else chunk.get('meta', {})
73100

101+
metadata = self._extract_metadata(meta)
102+
metadata["source"] = url
103+
74104
processed_chunk = {
75105
"text": text,
76-
"metadata": {
77-
"source": url,
78-
"headings": meta.get("headings", []),
79-
"page_numbers": self._extract_page_numbers(meta),
80-
}
106+
"metadata": metadata
81107
}
82108
processed_chunks.append(processed_chunk)
83109

@@ -101,16 +127,35 @@ def process_directory(self, directory: str | Path) -> List[Dict[str, Any]]:
101127

102128
return all_chunks
103129

104-
def _extract_page_numbers(self, meta: Dict) -> List[int]:
130+
def _extract_page_numbers(self, meta: Any) -> List[int]:
105131
"""Extract page numbers from chunk metadata"""
106132
page_numbers = set()
107-
if "doc_items" in meta:
108-
for item in meta["doc_items"]:
109-
if "prov" in item:
110-
for prov in item["prov"]:
111-
if "page_no" in prov:
112-
page_numbers.add(prov["page_no"])
113-
return sorted(list(page_numbers))
133+
try:
134+
if hasattr(meta, 'doc_items'):
135+
items = meta.doc_items
136+
elif isinstance(meta, dict) and 'doc_items' in meta:
137+
items = meta['doc_items']
138+
else:
139+
return []
140+
141+
for item in items:
142+
if hasattr(item, 'prov'):
143+
provs = item.prov
144+
elif isinstance(item, dict) and 'prov' in item:
145+
provs = item['prov']
146+
else:
147+
continue
148+
149+
for prov in provs:
150+
if hasattr(prov, 'page_no'):
151+
page_numbers.add(prov.page_no)
152+
elif isinstance(prov, dict) and 'page_no' in prov:
153+
page_numbers.add(prov['page_no'])
154+
155+
return sorted(list(page_numbers))
156+
except Exception as e:
157+
print(f"Warning: Error extracting page numbers: {str(e)}")
158+
return []
114159

115160
def main():
116161
parser = argparse.ArgumentParser(description="Process PDF files and extract text chunks")

0 commit comments

Comments
 (0)