@@ -7,9 +7,10 @@
 import time
 from collections import deque
 from pathlib import Path
-from typing import Coroutine, Deque, Dict, List, Optional, Set, Tuple, cast
+from typing import Any, Coroutine, Optional, cast
 
 import magic
+from models_library.projects_nodes import OutputsDict
 from pydantic import ByteSize
 from servicelib.archiving_utils import PrunableFolder, archive_dir, unarchive_dir
 from servicelib.async_utils import run_sequentially_in_context
@@ -49,7 +50,7 @@ def _get_size_of_value(value: ItemConcreteValue) -> int:
 
 
 @run_sequentially_in_context()
-async def upload_outputs(outputs_path: Path, port_keys: List[str]) -> None:
+async def upload_outputs(outputs_path: Path, port_keys: list[str]) -> None:
     """calls to this function will get queued and invoked in sequence"""
     # pylint: disable=too-many-branches
     logger.info("uploading data to simcore...")
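The docstring above promises that concurrent calls are queued and executed in sequence. As a rough mental model only, the behaviour is equivalent to guarding the body with a shared lock; a self-contained toy sketch using plain asyncio (not the actual servicelib decorator):

    import asyncio

    _queue = asyncio.Lock()  # one shared lock stands in for the decorator's context

    async def upload_outputs_toy(call_id: int) -> None:
        # concurrent callers line up here and run strictly one at a time
        async with _queue:
            print(f"call {call_id}: uploading")
            await asyncio.sleep(0.1)  # stands in for the real upload work
            print(f"call {call_id}: done")

    async def main() -> None:
        # the three calls overlap, but their bodies never interleave
        await asyncio.gather(*(upload_outputs_toy(i) for i in range(3)))

    asyncio.run(main())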
@@ -64,9 +65,9 @@ async def upload_outputs(outputs_path: Path, port_keys: List[str]) -> None:
     )
 
     # let's gather the tasks
-    temp_paths: Deque[Path] = deque()
-    ports_values: Dict[str, ItemConcreteValue] = {}
-    archiving_tasks: Deque[Coroutine[None, None, None]] = deque()
+    temp_paths: deque[Path] = deque()
+    ports_values: dict[str, ItemConcreteValue] = {}
+    archiving_tasks: deque[Coroutine[None, None, None]] = deque()
 
     for port in (await PORTS.outputs).values():
         logger.info("Checking port %s", port.key)
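Throughout the commit, typing.Deque/Dict/List/Set/Tuple are replaced by subscripting the builtin and collections types directly, which relies on PEP 585 and therefore assumes Python >= 3.9:

    from collections import deque
    from pathlib import Path

    # PEP 585 (Python >= 3.9): builtins and collections classes accept
    # subscripts at runtime, so the typing aliases are no longer needed
    temp_paths: deque[Path] = deque()
    ports_values: dict[str, float] = {}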
@@ -151,7 +152,7 @@ def _is_zip_file(file_path: Path) -> bool:
     return f"{mime_type}" == "application/zip"
 
 
-async def _get_data_from_port(port: Port) -> Tuple[Port, ItemConcreteValue]:
+async def _get_data_from_port(port: Port) -> tuple[Port, Optional[ItemConcreteValue]]:
     tag = f"[{port.key}] "
     logger.info("%s transfer started for %s", tag, port.key)
     start_time = time.perf_counter()
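The _is_zip_file helper shown in context above sniffs content rather than trusting the file extension; the magic import at the top of the module suggests python-magic is the library in use. A minimal standalone sketch of that pattern:

    import magic  # python-magic: libmagic bindings

    def is_zip_by_content(file_path: str) -> bool:
        # read the mime type from the file's bytes, not from its name
        mime_type = magic.from_file(file_path, mime=True)
        return mime_type == "application/zip"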
@@ -167,11 +168,83 @@ async def _get_data_from_port(port: Port) -> Tuple[Port, ItemConcreteValue]:
         size_mb,
         size_mb / elapsed_time,
     )
-    return (port, ret)
+    return port, ret
 
 
+async def _download_files(
+    target_path: Path, download_tasks: deque[Coroutine[Any, int, Any]]
+) -> tuple[OutputsDict, ByteSize]:
+    transferred_bytes = 0
+    data: OutputsDict = {}
+
+    if not download_tasks:
+        return data, ByteSize(transferred_bytes)
+
+    # TODO: limit concurrency to avoid saturating storage+db??
+    results: list[tuple[Port, ItemConcreteValue]] = cast(
+        list[tuple[Port, ItemConcreteValue]], await logged_gather(*download_tasks)
+    )
+    logger.info("completed download %s", results)
+    for port, value in results:
+
+        data[port.key] = {"key": port.key, "value": value}
+
+        if _FILE_TYPE_PREFIX in port.property_type:
+
+            # if there are files, move them to the final destination
+            downloaded_file: Optional[Path] = cast(Optional[Path], value)
+            dest_path: Path = target_path / port.key
+
+            if not downloaded_file or not downloaded_file.exists():
+                # the link may be empty
+                # remove all files from disk when disconnecting the port
+                logger.info("removing contents of dir %s", dest_path)
+                await remove_directory(
+                    dest_path, only_children=True, ignore_errors=True
+                )
+                continue
+
+            transferred_bytes = transferred_bytes + downloaded_file.stat().st_size
+
+            # a valid file is either unzipped into, or moved to, the final directory
+            logger.info("creating directory %s", dest_path)
+            dest_path.mkdir(exist_ok=True, parents=True)
+            data[port.key] = {"key": port.key, "value": str(dest_path)}
+
+            dest_folder = PrunableFolder(dest_path)
+
+            if _is_zip_file(downloaded_file):
+                # unzip updated data to dest_path
+                logger.info("unzipping %s", downloaded_file)
+                unarchived: set[Path] = await unarchive_dir(
+                    archive_to_extract=downloaded_file, destination_folder=dest_path
+                )
+
+                dest_folder.prune(exclude=unarchived)
+
+                logger.info("all unzipped in %s", dest_path)
+            else:
+                logger.info("moving %s", downloaded_file)
+                dest_path = dest_path / Path(downloaded_file).name
+                await async_on_threadpool(
+                    # pylint: disable=cell-var-from-loop
+                    lambda: shutil.move(str(downloaded_file), dest_path)
+                )
+
+                # NOTE: pruning against the port's current value makes sure
+                # previously downloaded files are removed
+                dest_folder.prune(exclude={dest_path})
+
+                logger.info("all moved to %s", dest_path)
+        else:
+            transferred_bytes = transferred_bytes + sys.getsizeof(value)
+
+    return data, ByteSize(transferred_bytes)
+
+
+@run_sequentially_in_context()
 async def download_target_ports(
-    port_type_name: PortTypeName, target_path: Path, port_keys: List[str]
+    port_type_name: PortTypeName, target_path: Path, port_keys: list[str]
 ) -> ByteSize:
     logger.info("retrieving data from simcore...")
     start_time = time.perf_counter()
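The TODO in _download_files about limiting concurrency could be prototyped by wrapping each coroutine in a semaphore before gathering; a self-contained sketch under that assumption (generic asyncio, not the servicelib logged_gather helper):

    import asyncio
    from collections.abc import Coroutine
    from typing import Any

    async def bounded_gather(
        *coros: Coroutine[Any, Any, Any], limit: int = 4
    ) -> list[Any]:
        semaphore = asyncio.Semaphore(limit)

        async def run_one(coro: Coroutine[Any, Any, Any]) -> Any:
            # at most `limit` coroutines pass this point at any moment
            async with semaphore:
                return await coro

        return await asyncio.gather(*(run_one(c) for c in coros))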
@@ -183,10 +256,9 @@ async def download_target_ports(
         node_uuid=str(settings.DY_SIDECAR_NODE_ID),
         r_clone_settings=settings.DY_SIDECAR_R_CLONE_SETTINGS,
     )
-    data = {}
 
     # let's gather all the data
-    download_tasks = []
+    download_tasks: deque[Coroutine[Any, int, Any]] = deque()
     for port_value in (await getattr(PORTS, port_type_name.value)).values():
         # if port_keys contains some keys, only download those
         logger.info("Checking node %s", port_value.key)
@@ -196,61 +268,7 @@ async def download_target_ports(
             download_tasks.append(_get_data_from_port(port_value))
     logger.info("retrieving %s data", len(download_tasks))
 
-    transfer_bytes = 0
-    if download_tasks:
-        # TODO: limit concurrency to avoid saturating storage+db??
-        results: List[Tuple[Port, ItemConcreteValue]] = cast(
-            List[Tuple[Port, ItemConcreteValue]], await logged_gather(*download_tasks)
-        )
-        logger.info("completed download %s", results)
-        for port, value in results:
-
-            data[port.key] = {"key": port.key, "value": value}
-
-            if _FILE_TYPE_PREFIX in port.property_type:
-
-                # if there are files, move them to the final destination
-                downloaded_file: Optional[Path] = cast(Optional[Path], value)
-                dest_path: Path = target_path / port.key
-
-                if not downloaded_file or not downloaded_file.exists():
-                    # the link may be empty
-                    # remove files all files from disk when disconnecting port
-                    await remove_directory(
-                        dest_path, only_children=True, ignore_errors=True
-                    )
-                    continue
-
-                transfer_bytes = transfer_bytes + downloaded_file.stat().st_size
-
-                # in case of valid file, it is either uncompressed and/or moved to the final directory
-                logger.info("creating directory %s", dest_path)
-                dest_path.mkdir(exist_ok=True, parents=True)
-                data[port.key] = {"key": port.key, "value": str(dest_path)}
-
-                if _is_zip_file(downloaded_file):
-
-                    dest_folder = PrunableFolder(dest_path)
-
-                    # unzip updated data to dest_path
-                    logger.info("unzipping %s", downloaded_file)
-                    unarchived: Set[Path] = await unarchive_dir(
-                        archive_to_extract=downloaded_file, destination_folder=dest_path
-                    )
-
-                    dest_folder.prune(exclude=unarchived)
-
-                    logger.info("all unzipped in %s", dest_path)
-                else:
-                    logger.info("moving %s", downloaded_file)
-                    dest_path = dest_path / Path(downloaded_file).name
-                    await async_on_threadpool(
-                        # pylint: disable=cell-var-from-loop
-                        lambda: shutil.move(str(downloaded_file), dest_path)
-                    )
-                    logger.info("all moved to %s", dest_path)
-            else:
-                transfer_bytes = transfer_bytes + sys.getsizeof(value)
+    data, transferred_bytes = await _download_files(target_path, download_tasks)
 
     # create/update the json file with the new values
     if data:
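Note that the extracted _download_files constructs the PrunableFolder before branching, so both the unzip path and the move path now prune stale files afterwards (the old code only pruned in the zip branch). A toy approximation of what prune(exclude=...) is assumed to do here; the real servicelib implementation diffs the folder against its state at construction:

    from pathlib import Path

    def prune_toy(folder: Path, exclude: set[Path]) -> None:
        # toy model, not servicelib's PrunableFolder: drop everything under
        # `folder` that is not an excluded path, inside one, or an ancestor of one
        keep = {p.resolve() for p in exclude}
        for candidate in sorted(folder.rglob("*"), reverse=True):
            resolved = candidate.resolve()
            if any(
                resolved == k or k in resolved.parents or resolved in k.parents
                for k in keep
            ):
                continue
            if candidate.is_file():
                candidate.unlink()
            elif candidate.is_dir() and not any(candidate.iterdir()):
                candidate.rmdir()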
@@ -261,15 +279,13 @@ async def download_target_ports(
             data = {**current_data, **data}
         data_file.write_text(json.dumps(data))
 
-    transferred = ByteSize(transfer_bytes)
     elapsed_time = time.perf_counter() - start_time
     logger.info(
         "Downloaded %s in %s seconds",
-        transferred.human_readable(decimal=True),
+        transferred_bytes.human_readable(decimal=True),
         elapsed_time,
     )
-
-    return transferred
+    return transferred_bytes
 
 
 __all__ = ["dispatch_update_for_directory", "download_target_ports"]
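Returning the pydantic ByteSize directly (instead of wrapping an int at the very end) lets callers reuse human_readable for logging, as the commit now does. A tiny example of the assumed pydantic v1 behaviour:

    from pydantic import ByteSize

    transferred = ByteSize(2_500_000)
    # decimal=True formats with SI units (powers of 1000)
    print(transferred.human_readable(decimal=True))  # approx. "2.5MB"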