 from urllib.parse import urlparse
 import warnings
 import transformers
+import uuid  # Add at the top with other imports

 # Suppress the token length warning
 warnings.filterwarnings('ignore', category=UserWarning, module='transformers.generation.utils')
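For reference, uuid.uuid4() draws a random 128-bit UUID, so no central registry or coordination is needed and collisions are negligible in practice; str() renders the canonical 36-character form. A minimal sketch:

import uuid

document_id = str(uuid.uuid4())
print(document_id)  # e.g. '9f1b2c3d-...' (illustrative value); a fresh ID on every call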
@@ -62,6 +63,9 @@ def _extract_metadata(self, meta: Any) -> Dict[str, Any]:
-    def process_pdf(self, file_path: str | Path) -> List[Dict[str, Any]]:
+    def process_pdf(self, file_path: str | Path) -> tuple[List[Dict[str, Any]], str]:
         """Process a PDF file and return chunks of text with metadata"""
         try:
+            # Generate a unique document ID
+            document_id = str(uuid.uuid4())
+
             # Convert PDF using Docling
             conv_result = self.converter.convert(file_path)
             if not conv_result or not conv_result.document:
@@ -85,14 +89,15 @@ def process_pdf(self, file_path: str | Path) -> List[Dict[str, Any]]:

                 metadata = self._extract_metadata(meta)
                 metadata["source"] = str(file_path)
+                metadata["document_id"] = document_id  # Add document_id to metadata

                 processed_chunk = {
                     "text": text,
                     "metadata": metadata
                 }
                 processed_chunks.append(processed_chunk)

-            return processed_chunks
+            return processed_chunks, document_id  # Return both chunks and document_id

         except Exception as e:
             raise Exception(f"Error processing PDF {file_path}: {str(e)}")
@@ -105,6 +110,9 @@ def process_pdf_url(self, url: str) -> List[Dict[str, Any]]:
             if not conv_result or not conv_result.document:
                 raise ValueError(f"Failed to convert PDF from URL: {url}")

+            # Generate a unique document ID
+            document_id = str(uuid.uuid4())
+
             # Chunk the document
             chunks = list(self.chunker.chunk(conv_result.document))

@@ -117,14 +125,15 @@ def process_pdf_url(self, url: str) -> List[Dict[str, Any]]:

                 metadata = self._extract_metadata(meta)
                 metadata["source"] = url
+                metadata["document_id"] = document_id

                 processed_chunk = {
                     "text": text,
                     "metadata": metadata
                 }
                 processed_chunks.append(processed_chunk)

-            return processed_chunks
+            return processed_chunks, document_id

         except Exception as e:
             raise Exception(f"Error processing PDF from URL {url}: {str(e)}")
@@ -133,16 +142,18 @@ def process_directory(self, directory: str | Path) -> List[Dict[str, Any]]:
         """Process all PDF files in a directory"""
         directory = Path(directory)
         all_chunks = []
+        document_ids = []

         for pdf_file in directory.glob("**/*.pdf"):
             try:
-                chunks = self.process_pdf(pdf_file)
+                chunks, doc_id = self.process_pdf(pdf_file)
                 all_chunks.extend(chunks)
-                print(f"✓ Processed {pdf_file}")
+                document_ids.append(doc_id)
+                print(f"✓ Processed {pdf_file} (ID: {doc_id})")
             except Exception as e:
                 print(f"✗ Failed to process {pdf_file}: {str(e)}")

-        return all_chunks
+        return all_chunks, document_ids

     def _extract_page_numbers(self, meta: Any) -> List[int]:
         """Extract page numbers from chunk metadata"""
@@ -192,15 +203,18 @@ def main():
     if is_url(args.input):
         print(f"\nProcessing PDF from URL: {args.input}")
         print("=" * 50)
-        chunks = processor.process_pdf_url(args.input)
+        chunks, doc_id = processor.process_pdf_url(args.input)
+        print(f"Document ID: {doc_id}")
     elif Path(args.input).is_dir():
         print(f"\nProcessing directory: {args.input}")
         print("=" * 50)
-        chunks = processor.process_directory(args.input)
+        chunks, doc_ids = processor.process_directory(args.input)
+        print(f"Document IDs: {', '.join(doc_ids)}")
     else:
         print(f"\nProcessing file: {args.input}")
         print("=" * 50)
-        chunks = processor.process_pdf(args.input)
+        chunks, doc_id = processor.process_pdf(args.input)
+        print(f"Document ID: {doc_id}")

     # Save chunks to JSON
     with open(args.output, 'w', encoding='utf-8') as f:
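Downstream, the saved JSON can be filtered back to a single document by ID. A sketch, assuming main() json.dump's the chunk list as-is (the dump call itself falls outside this diff, and the output path is illustrative):

import json

with open("chunks.json", encoding="utf-8") as f:
    saved = json.load(f)

one_doc = [c for c in saved if c["metadata"]["document_id"] == doc_id]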