1+ import asyncio
12import hashlib
23import json
34from dataclasses import dataclass
4- from typing import Optional , TypedDict , TypeVar
5+ from typing import Callable , Optional , TypedDict , TypeVar
56
67from unstructured .ingest .v2 .interfaces import FileData , download_responses
78from unstructured .ingest .v2 .interfaces .downloader import Downloader
@@ -55,7 +56,7 @@ def should_download(self, file_data: FileData, file_data_path: str) -> bool:
5556 if self .context .re_download :
5657 return True
5758 download_path = self .process .get_download_path (file_data = file_data )
58- if not download_path .exists ():
59+ if not download_path or not download_path .exists ():
5960 return True
6061 if (
6162 download_path .is_file ()
@@ -69,6 +70,24 @@ def should_download(self, file_data: FileData, file_data_path: str) -> bool:
6970 return True
7071 return False
7172
async def _run_async(self, fn: Callable, file_data_path: str) -> list[DownloadStepResponse]:
    """Run the download callable ``fn`` for the record stored at ``file_data_path``.

    The record is loaded with ``FileData.from_file``. When ``should_download``
    reports that no download is needed, a single response pointing at the
    path returned by ``self.process.get_download_path`` is returned and
    ``fn`` is never invoked.

    Otherwise ``fn`` is called with ``file_data`` as its only keyword
    argument. A coroutine function is awaited, inside
    ``self.context.semaphore`` when that attribute is truthy; any other
    callable is invoked directly. The results are then passed through
    ``create_step_results``.

    Args:
        fn: Callable that performs the download; may be sync or async.
        file_data_path: Path of the serialized ``FileData`` record.

    Returns:
        The list of ``DownloadStepResponse`` objects for this record.
    """
    file_data = FileData.from_file(path=file_data_path)
    download_path = self.process.get_download_path(file_data=file_data)

    needs_download = self.should_download(file_data=file_data, file_data_path=file_data_path)
    if not needs_download:
        logger.debug(f"Skipping download, file already exists locally: { download_path } ")
        existing = DownloadStepResponse(file_data_path=file_data_path, path=str(download_path))
        return [existing]

    if asyncio.iscoroutinefunction(fn):
        # The semaphore is only consulted for coroutine functions; a plain
        # callable below is invoked without it.
        semaphore = self.context.semaphore
        if semaphore:
            async with semaphore:
                download_results = await fn(file_data=file_data)
        else:
            download_results = await fn(file_data=file_data)
    else:
        download_results = fn(file_data=file_data)

    return self.create_step_results(
        current_file_data_path=file_data_path,
        download_results=download_results,
    )
90+
7291 def create_step_results (
7392 self , current_file_data_path : str , download_results : download_responses
7493 ) -> list [DownloadStepResponse ]:
@@ -87,35 +106,6 @@ def create_step_results(
87106 )
88107 return download_step_results
89108
90- def _run (self , file_data_path : str ) -> list [DownloadStepResponse ]:
91- file_data = FileData .from_file (path = file_data_path )
92- download_path = self .process .get_download_path (file_data = file_data )
93- if not self .should_download (file_data = file_data , file_data_path = file_data_path ):
94- logger .debug (f"Skipping download, file already exists locally: { download_path } " )
95- return [DownloadStepResponse (file_data_path = file_data_path , path = str (download_path ))]
96-
97- download_results = self .process .run (file_data = file_data )
98- return self .create_step_results (
99- current_file_data_path = file_data_path , download_results = download_results
100- )
101-
102- async def _run_async (self , file_data_path : str ) -> list [DownloadStepResponse ]:
103- file_data = FileData .from_file (path = file_data_path )
104- download_path = self .process .get_download_path (file_data = file_data )
105- if download_path and not self .should_download (
106- file_data = file_data , file_data_path = file_data_path
107- ):
108- logger .debug (f"Skipping download, file already exists locally: { download_path } " )
109- return [DownloadStepResponse (file_data_path = file_data_path , path = str (download_path ))]
110- if semaphore := self .context .semaphore :
111- async with semaphore :
112- download_results = await self .process .run_async (file_data = file_data )
113- else :
114- download_results = await self .process .run_async (file_data = file_data )
115- return self .create_step_results (
116- current_file_data_path = file_data_path , download_results = download_results
117- )
118-
119109 def persist_new_file_data (self , file_data : FileData ) -> str :
120110 record_hash = self .get_hash (extras = [file_data .identifier ])
121111 filename = f"{ record_hash } .json"
0 commit comments