import boto3
from aiocache import SimpleMemoryCache  # type: ignore[import-untyped]
from fastapi.applications import FastAPI
+from models_library.api_schemas_datcore_adapter.datasets import (
+    DatasetMetaData,
+    DataType,
+    FileMetaData,
+)
from servicelib.logging_utils import log_context
from servicelib.utils import logged_gather
from starlette import status
from tenacity.stop import stop_after_attempt

from ..core.settings import PennsieveSettings
-from ..models.domains.user import Profile
-from ..models.schemas.datasets import DatasetMetaData, FileMetaData
+from ..models.files import DatCorePackageMetaData
+from ..models.user import Profile
from ..utils.client_base import BaseServiceClientApi, setup_client_instance

logger = logging.getLogger(__name__)
_GATHER_MAX_CONCURRENCY = 10


+def _to_file_meta_data(
+    package: dict[str, Any], files: list[DatCorePackageMetaData], base_path: Path
+) -> FileMetaData:
+    """creates a FileMetaData from a pennsieve data structure."""
+    pck_name: str = package["content"]["name"]
+    if "extension" in package and not pck_name.endswith(package["extension"]):
+        pck_name = ".".join((pck_name, package["extension"]))
+
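+    # collections (folders) carry no size; for files, use the first source file's size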
+    file_size = 0
+    if package["content"]["packageType"] != "Collection" and files:
+        file_size = files[0].size
+
+    return FileMetaData(
+        dataset_id=package["content"]["datasetNodeId"],
+        package_id=package["content"]["nodeId"],
+        id=f"{package['content']['id']}",
+        name=pck_name,
+        path=base_path / pck_name,
+        type=package["content"]["packageType"],
+        size=file_size,
+        created_at=package["content"]["createdAt"],
+        last_modified_at=package["content"]["updatedAt"],
+        data_type=(
+            DataType.FOLDER
+            if package["content"]["packageType"] == "Collection"
+            else DataType.FILE
+        ),
+    )
+
+
def _compute_file_path(
    all_packages: dict[str, dict[str, Any]], pck: dict[str, Any]
) -> Path:
@@ -215,27 +250,66 @@ async def _get_package(
    )

    async def get_package_files(
-        self, api_key: str, api_secret: str, package_id: str, limit: int, offset: int
-    ) -> list[dict[str, Any]]:
-        return cast(
-            list[dict[str, Any]],
-            await self._request(
-                api_key,
-                api_secret,
-                "GET",
-                f"/packages/{package_id}/files",
-                params={"limit": limit, "offset": offset},
-            ),
+        self,
+        *,
+        api_key: str,
+        api_secret: str,
+        package_id: str,
+        limit: int,
+        offset: int,
+        fill_path: bool,
+    ) -> list[DatCorePackageMetaData]:
+        raw_data = await self._request(
+            api_key,
+            api_secret,
+            "GET",
+            f"/packages/{package_id}/files",
+            params={"limit": limit, "offset": offset},
        )
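+        # NOTE: resolving the package paths needs 2 extra Pennsieve calls
+        # (package + dataset), so it is only done when fill_path is requested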
+        path = display_path = Path()
+        if fill_path:
+            package_info = await self._get_package(api_key, api_secret, package_id)
+            dataset_id = package_info["content"]["datasetId"]
+            dataset = await self._get_dataset(api_key, api_secret, dataset_id)
+
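+            # "path" is built from dataset/ancestor IDs, "display_path" from
+            # their human-readable names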
+            path = (
+                Path(dataset_id)
+                / Path(
+                    "/".join(
+                        ancestor["content"]["id"]
+                        for ancestor in package_info.get("ancestors", [])
+                    )
+                )
+                / Path(package_info["content"]["name"])
+            )
+            display_path = (
+                Path(dataset["content"]["name"])
+                / Path(
+                    "/".join(
+                        ancestor["content"]["name"]
+                        for ancestor in package_info.get("ancestors", [])
+                    )
+                )
+                / Path(package_info["content"]["name"])
+            )
+
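+        # each raw entry wraps the file details in a "content" dict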
+        return [
+            DatCorePackageMetaData(**_["content"], path=path, display_path=display_path)
+            for _ in raw_data
+        ]

    async def _get_pck_id_files(
        self, api_key: str, api_secret: str, pck_id: str, pck: dict[str, Any]
-    ) -> tuple[str, list[dict[str, Any]]]:
-
+    ) -> tuple[str, list[DatCorePackageMetaData]]:
        return (
            pck_id,
            await self.get_package_files(
-                api_key, api_secret, pck["content"]["nodeId"], limit=1, offset=0
+                api_key=api_key,
+                api_secret=api_secret,
+                package_id=pck["content"]["nodeId"],
+                limit=1,
+                offset=0,
+                fill_path=False,
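+                # paths are not needed here, only the file size is used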
            ),
        )

@@ -293,7 +367,7 @@ async def list_packages_in_dataset(
            for pck in islice(dataset_pck["children"], offset, offset + limit)
            if pck["content"]["packageType"] != "Collection"
        ]
-        package_files = dict(
+        package_files: dict[str, list[DatCorePackageMetaData]] = dict(
            await logged_gather(
                *package_files_tasks,
                log=logger,
@@ -302,7 +376,7 @@ async def list_packages_in_dataset(
        )
        return (
            [
-                FileMetaData.from_pennsieve_package(
+                _to_file_meta_data(
                    pck,
                    (
                        package_files[pck["content"]["id"]]
@@ -353,7 +427,7 @@ async def list_packages_in_collection(

        return (
            [
-                FileMetaData.from_pennsieve_package(
+                _to_file_meta_data(
                    pck,
                    (
                        package_files[pck["content"]["id"]]
@@ -433,7 +507,7 @@ async def list_all_dataset_files(
            file_path = base_path / _compute_file_path(all_packages, package)

            file_meta_data.append(
-                FileMetaData.from_pennsieve_package(
+                _to_file_meta_data(
                    package, package_files[package_id], file_path.parent
                )
            )
@@ -445,11 +519,16 @@ async def get_presigned_download_link(
    ) -> URL:
        """returns the presigned download link of the first file in the package"""
        files = await self.get_package_files(
-            api_key, api_secret, package_id, limit=1, offset=0
+            api_key=api_key,
+            api_secret=api_secret,
+            package_id=package_id,
+            limit=1,
+            offset=0,
+            fill_path=False,
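+            # the package paths are not needed to build a download link, only the id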
        )
        # NOTE: this was done like this in the original dsm. we might encounter a problem when there is more than one file
        assert len(files) == 1  # nosec
-        file_id = files[0]["content"]["id"]
+        file_id = files[0].id
        file_link = cast(
            dict[str, Any],
            await self._request(