
Commit 554e0d6

fix: add document_id handling in PDF processor

- Generate unique document_id for each PDF
- Add document_id to chunk metadata
- Update return values to include document_id
- Update all methods to handle the new return format
1 parent bd7b8a7 commit 554e0d6
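
In outline, the change is: mint one uuid4 per document, stamp it into every chunk's metadata, and return it alongside the chunks. A minimal standalone sketch of that pattern (a hypothetical helper for illustration, not the committed code):

    import uuid
    from typing import Any, Dict, List, Tuple

    def tag_chunks(chunks: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], str]:
        """Stamp one shared document ID into every chunk's metadata (sketch)."""
        document_id = str(uuid.uuid4())  # one random ID per document
        for chunk in chunks:
            chunk["metadata"]["document_id"] = document_id
        return chunks, document_id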


agentic_rag/pdf_processor.py

Lines changed: 22 additions & 8 deletions
@@ -7,6 +7,7 @@
 from urllib.parse import urlparse
 import warnings
 import transformers
+import uuid  # Add at the top with other imports
 
 # Suppress the token length warning
 warnings.filterwarnings('ignore', category=UserWarning, module='transformers.generation.utils')

@@ -62,6 +63,9 @@ def _extract_metadata(self, meta: Any) -> Dict[str, Any]:
     def process_pdf(self, file_path: str | Path) -> List[Dict[str, Any]]:
         """Process a PDF file and return chunks of text with metadata"""
         try:
+            # Generate a unique document ID
+            document_id = str(uuid.uuid4())
+
             # Convert PDF using Docling
             conv_result = self.converter.convert(file_path)
             if not conv_result or not conv_result.document:

@@ -85,14 +89,15 @@ def process_pdf(self, file_path: str | Path) -> List[Dict[str, Any]]:
 
                 metadata = self._extract_metadata(meta)
                 metadata["source"] = str(file_path)
+                metadata["document_id"] = document_id  # Add document_id to metadata
 
                 processed_chunk = {
                     "text": text,
                     "metadata": metadata
                 }
                 processed_chunks.append(processed_chunk)
 
-            return processed_chunks
+            return processed_chunks, document_id  # Return both chunks and document_id
 
         except Exception as e:
             raise Exception(f"Error processing PDF {file_path}: {str(e)}")

@@ -105,6 +110,9 @@ def process_pdf_url(self, url: str) -> List[Dict[str, Any]]:
             if not conv_result or not conv_result.document:
                 raise ValueError(f"Failed to convert PDF from URL: {url}")
 
+            # Generate a unique document ID
+            document_id = str(uuid.uuid4())
+
             # Chunk the document
             chunks = list(self.chunker.chunk(conv_result.document))
 

@@ -117,14 +125,15 @@ def process_pdf_url(self, url: str) -> List[Dict[str, Any]]:
 
                 metadata = self._extract_metadata(meta)
                 metadata["source"] = url
+                metadata["document_id"] = document_id
 
                 processed_chunk = {
                     "text": text,
                     "metadata": metadata
                 }
                 processed_chunks.append(processed_chunk)
 
-            return processed_chunks
+            return processed_chunks, document_id
 
         except Exception as e:
             raise Exception(f"Error processing PDF from URL {url}: {str(e)}")

@@ -133,16 +142,18 @@ def process_directory(self, directory: str | Path) -> List[Dict[str, Any]]:
         """Process all PDF files in a directory"""
         directory = Path(directory)
         all_chunks = []
+        document_ids = []
 
         for pdf_file in directory.glob("**/*.pdf"):
             try:
-                chunks = self.process_pdf(pdf_file)
+                chunks, doc_id = self.process_pdf(pdf_file)
                 all_chunks.extend(chunks)
-                print(f"✓ Processed {pdf_file}")
+                document_ids.append(doc_id)
+                print(f"✓ Processed {pdf_file} (ID: {doc_id})")
             except Exception as e:
                 print(f"✗ Failed to process {pdf_file}: {str(e)}")
 
-        return all_chunks
+        return all_chunks, document_ids
 
     def _extract_page_numbers(self, meta: Any) -> List[int]:
         """Extract page numbers from chunk metadata"""

@@ -192,15 +203,18 @@ def main():
     if is_url(args.input):
         print(f"\nProcessing PDF from URL: {args.input}")
         print("=" * 50)
-        chunks = processor.process_pdf_url(args.input)
+        chunks, doc_id = processor.process_pdf_url(args.input)
+        print(f"Document ID: {doc_id}")
    elif Path(args.input).is_dir():
         print(f"\nProcessing directory: {args.input}")
         print("=" * 50)
-        chunks = processor.process_directory(args.input)
+        chunks, doc_ids = processor.process_directory(args.input)
+        print(f"Document IDs: {', '.join(doc_ids)}")
     else:
         print(f"\nProcessing file: {args.input}")
         print("=" * 50)
-        chunks = processor.process_pdf(args.input)
+        chunks, doc_id = processor.process_pdf(args.input)
+        print(f"Document ID: {doc_id}")
 
     # Save chunks to JSON
     with open(args.output, 'w', encoding='utf-8') as f:
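
For callers, the return shape of all three methods changes from a bare chunk list to a tuple. A hedged usage sketch (the PDFProcessor class name and import path are assumed here; the diff shows only its methods):

    from agentic_rag.pdf_processor import PDFProcessor  # class name assumed

    processor = PDFProcessor()
    chunks, document_id = processor.process_pdf("example.pdf")
    print(f"Document {document_id}: {len(chunks)} chunks")
    # Every chunk carries the shared ID in its metadata.
    assert all(c["metadata"]["document_id"] == document_id for c in chunks)

    # process_directory returns one ID per successfully processed file.
    all_chunks, document_ids = processor.process_directory("papers/")

Note that the declared return annotations (-> List[Dict[str, Any]]) still describe the old single-value return; callers should rely on the tuple shape shown in the diff.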
