11from pathlib import Path
22
3- import urllib
3+ from urllib .parse import urljoin
4+ from urllib .error import HTTPError
45import uuid
56
67import requests
7- from hdxms_datasets .loader import BACKEND
8- from hdxms_datasets .models import HDXDataSet
8+ from hdxms_datasets .loader import BACKEND , read_csv
9+ from hdxms_datasets .models import HDXDataSet , extract_values_by_types
910import shutil
1011import narwhals as nw
1112
1213from hdxms_datasets .utils import records_to_dict
1314from hdxms_datasets .verification import verify_dataset
1415
1516
# Filename of the CSV catalog that indexes all datasets in the remote repository.
CATALOG_FILE = "datasets_catalog.csv"
# Base URL for raw files of the remote HDX-MS dataset repository (trailing slash
# required so urljoin resolves "<id>/..." paths relative to the datasets dir).
DATABASE_URL = "https://raw.githubusercontent.com/Jhsmit/HDXMS-database/master/datasets/"
19+
20+
1621def load_dataset (pth : Path ) -> HDXDataSet :
1722 """
1823 Load a dataset from a JSON file or directory.
@@ -235,9 +240,6 @@ def load_dataset(self, dataset_id: str) -> HDXDataSet:
235240 return dataset
236241
237242
238- DATABASE_URL = "https://raw.githubusercontent.com/Jhsmit/HDX-MS-datasets/master/datasets/"
239-
240-
241243class RemoteDataBase (DataBase ):
242244 """
243245 A database for HDX-MS datasets, with the ability to fetch datasets from a remote repository.
@@ -247,80 +249,88 @@ class RemoteDataBase(DataBase):
247249 remote_url: URL of the remote repository (default: DATABASE_URL).
248250 """
249251
250- def __init__ (self , database_dir : Path | str , remote_url : str = DATABASE_URL ):
252+ def __init__ (
253+ self ,
254+ database_dir : Path | str ,
255+ remote_url : str = DATABASE_URL ,
256+ ):
251257 super ().__init__ (database_dir )
252258 self .remote_url = remote_url
253259
254- def get_index (self ) -> nw .DataFrame :
255- """Retrieves the index of available datasets
260+ index_url = urljoin (DATABASE_URL , CATALOG_FILE )
261+ response = requests .get (index_url )
262+
263+ # TODO keep catalogs on a per-url basis in a singleton
264+ if response .ok :
265+ df = read_csv (response .content )
266+ self .datasets_catalog = df
267+ else :
268+ raise HTTPError (
269+ index_url ,
270+ response .status_code ,
271+ "Error fetching dataset index" ,
272+ response .headers , # type: ignore
273+ None ,
274+ )
256275
257- on success, returns the index dataframe and
258- stores as `remote_index` attribute.
276+ @property
277+ def remote_datasets (self ) -> list [str ]:
278+ """List of available datasets in the remote repository"""
279+ return self .datasets_catalog ["id" ].to_list ()
259280
260- """
261- raise NotImplementedError ()
281+ @property
282+ def local_datasets (self ) -> list [str ]:
283+ """List of available datasets in the local database directory"""
284+ return self .datasets
262285
263- def fetch_dataset (self , data_id : str ) -> bool :
286+ def fetch_dataset (self , data_id : str ) -> tuple [ bool , str ] :
264287 """
265- Download a dataset from the online repository to the cache dir
288+ Download a dataset from the online repository to `database_dir`
266289
267290 Args:
268291 data_id: The ID of the dataset to download.
269292
270293 Returns:
271- `True` if the dataset was downloaded successfully, `False` otherwise.
294+ A tuple (success: bool, message: str):
295+ - success: True if the dataset was successfully downloaded, False otherwise.
296+ - message: A message indicating the result of the download.
272297 """
273298
274- raise NotImplementedError ()
275- output_pth = self .cache_dir / data_id
276- if output_pth .exists ():
277- return False
278- else :
279- output_pth .mkdir ()
280-
281- dataset_url = urllib .parse .urljoin (self .remote_url , data_id + "/" )
282-
283- files = ["hdx_spec.yaml" , "metadata.yaml" ]
284- hdx_spec = None
285- for f in files + optional_files :
286- url = urllib .parse .urljoin (dataset_url , f )
287- response = requests .get (url )
299+ if data_id not in self .remote_datasets :
300+ return False , f"Dataset ID { data_id !r} not found in remote database."
288301
289- if response .ok :
290- (output_pth / f ).write_bytes (response .content )
291-
292- elif f in files :
293- raise urllib .error .HTTPError (
294- url ,
295- response .status_code ,
296- f"Error for file { f !r} " ,
297- response .headers , # type: ignore
298- None ,
299- )
302+ json_url = urljoin (DATABASE_URL , data_id + "/dataset.json" )
303+ response = requests .get (json_url )
300304
301- if f == "hdx_spec.yaml" :
302- hdx_spec = yaml .safe_load (response .text )
305+ # confirm if the json is according to spec
306+ try :
307+ dataset = HDXDataSet .model_validate_json (
308+ response .content ,
309+ )
310+ except Exception as e :
311+ return False , f"Error validating dataset JSON: { e } "
303312
304- if hdx_spec is None :
305- raise ValueError ( f"Could not find HDX spec for data_id { data_id !r } " )
313+ # create a list of all Path objects in the dataset plus the dataset.json file
314+ data_files = list ( set ( extract_values_by_types ( dataset , Path ))) + [ Path ( "dataset.json" )]
306315
307- data_pth = output_pth / "data"
308- data_pth .mkdir ()
316+ # create the target directory to store the dataset
317+ output_pth = self .database_dir / data_id
318+ if output_pth .exists ():
319+ return False , "Dataset already exists in the local database."
320+ else :
321+ output_pth .mkdir ()
309322
310- for file_spec in hdx_spec ["data_files" ].values ():
311- filename = file_spec ["filename" ]
312- f_url = urllib .parse .urljoin (dataset_url , filename )
313- response = requests .get (f_url )
323+ for data_file in data_files :
324+ data_url = urljoin (DATABASE_URL , data_id + "/" + data_file .as_posix ())
314325
326+ response = requests .get (data_url )
315327 if response .ok :
316- (output_pth / filename ).write_bytes (response .content )
328+ # write the file to disk
329+ fpath = output_pth / Path (data_file )
330+ fpath .parent .mkdir (parents = True , exist_ok = True )
331+ fpath .write_bytes (response .content )
317332 else :
318- raise urllib .error .HTTPError (
319- f_url ,
320- response .status_code ,
321- f"Error for data file { filename !r} " ,
322- response .headers , # type: ignore
323- None ,
324- )
325-
326- return True
333+ shutil .rmtree (output_pth ) # clean up partial download
334+ return False , f"Failed to download { data_file } : { response .status_code } "
335+
336+ return True , ""
0 commit comments