77from pathlib import Path
88from time import time
99from typing import TYPE_CHECKING , Any , Generator , Optional , TypeVar
10+ from uuid import NAMESPACE_DNS , uuid5
1011
1112from unstructured .documents .elements import DataSourceMetadata
1213
@@ -210,16 +211,19 @@ def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
210211 # Note: we remove any remaining leading slashes (Box introduces these)
211212 # to get a valid relative path
212213 rel_path = file .replace (self .index_config .path_without_protocol , "" ).lstrip ("/" )
214+
215+ additional_metadata = self .sterilize_info (path = file )
216+ additional_metadata ["original_file_path" ] = file
213217 yield FileData (
214- identifier = file ,
218+ identifier = str ( uuid5 ( NAMESPACE_DNS , file )) ,
215219 connector_type = self .connector_type ,
216220 source_identifiers = SourceIdentifiers (
217221 filename = Path (file ).name ,
218222 rel_path = rel_path or None ,
219223 fullpath = file ,
220224 ),
221225 metadata = self .get_metadata (path = file ),
222- additional_metadata = self . sterilize_info ( path = file ) ,
226+ additional_metadata = additional_metadata ,
223227 )
224228
225229
@@ -262,7 +266,8 @@ def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
262266 download_path = self .get_download_path (file_data = file_data )
263267 download_path .parent .mkdir (parents = True , exist_ok = True )
264268 try :
265- self .fs .get (rpath = file_data .identifier , lpath = download_path .as_posix ())
269+ rpath = file_data .additional_metadata ["original_file_path" ]
270+ self .fs .get (rpath = rpath , lpath = download_path .as_posix ())
266271 except Exception as e :
267272 logger .error (f"failed to download file { file_data .identifier } : { e } " , exc_info = True )
268273 raise SourceConnectionNetworkError (f"failed to download file { file_data .identifier } " )
@@ -272,7 +277,8 @@ async def async_run(self, file_data: FileData, **kwargs: Any) -> DownloadRespons
272277 download_path = self .get_download_path (file_data = file_data )
273278 download_path .parent .mkdir (parents = True , exist_ok = True )
274279 try :
275- await self .fs .get (rpath = file_data .identifier , lpath = download_path .as_posix ())
280+ rpath = file_data .additional_metadata ["original_file_path" ]
281+ await self .fs .get (rpath = rpath , lpath = download_path .as_posix ())
276282 except Exception as e :
277283 logger .error (f"failed to download file { file_data .identifier } : { e } " , exc_info = True )
278284 raise SourceConnectionNetworkError (f"failed to download file { file_data .identifier } " )
0 commit comments