Commit e0d1fda
feat: allow downloading datasets using RemoteDataBase
1 parent 29505ff

File tree: 3 files changed, +118 -62 lines

examples/remote_database.py

Lines changed: 35 additions & 0 deletions

@@ -0,0 +1,35 @@
+"""
+Example script demonstrating how to use the RemoteDataBase class to fetch and load datasets
+from a remote HDX-MS database.
+"""
+
+# %%
+
+from hdxms_datasets import RemoteDataBase
+from pathlib import Path
+
+DATABASE_URL = "https://raw.githubusercontent.com/Jhsmit/HDXMS-database/master/datasets/"
+
+# %%
+
+# create a local directory to store fetched datasets
+database_dir = Path.cwd() / "datasets"
+database_dir.mkdir(parents=True, exist_ok=True)
+
+# connect to a remote database
+# omit the remote_url parameter to use the default
+# creating the db will automatically fetch the datasets catalog
+db = RemoteDataBase(database_dir, remote_url=DATABASE_URL)
+db.datasets_catalog.to_native()
+
+# %%
+# fetch the first available dataset; if successful, it is saved to `database_dir`
+data_id = db.remote_datasets[0]
+success, message = db.fetch_dataset(data_id)
+success, message
+
+# %%
+# load the dataset from disk
+dataset = db.load_dataset(data_id)
+dataset
+# %%
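
Since fetch_dataset reports failures through its (success, message) return value instead of raising, callers should branch on the result. A minimal sketch of defensive usage, assuming a dataset that fails to fetch may already be on disk from an earlier run (fetch_dataset returns False in that case too):

from pathlib import Path

from hdxms_datasets import RemoteDataBase

db = RemoteDataBase(Path.cwd() / "datasets")

for data_id in db.remote_datasets:
    success, message = db.fetch_dataset(data_id)
    # fetch_dataset also returns False when the dataset already exists locally,
    # so fall back to the local copy before giving up
    if success or data_id in db.local_datasets:
        dataset = db.load_dataset(data_id)
    else:
        print(f"Skipping {data_id}: {message}")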

hdxms_datasets/database.py

Lines changed: 71 additions & 61 deletions

@@ -1,18 +1,23 @@
 from pathlib import Path
 
-import urllib
+from urllib.parse import urljoin
+from urllib.error import HTTPError
 import uuid
 
 import requests
-from hdxms_datasets.loader import BACKEND
-from hdxms_datasets.models import HDXDataSet
+from hdxms_datasets.loader import BACKEND, read_csv
+from hdxms_datasets.models import HDXDataSet, extract_values_by_types
 import shutil
 import narwhals as nw
 
 from hdxms_datasets.utils import records_to_dict
 from hdxms_datasets.verification import verify_dataset
 
 
+CATALOG_FILE = "datasets_catalog.csv"
+DATABASE_URL = "https://raw.githubusercontent.com/Jhsmit/HDXMS-database/master/datasets/"
+
+
 def load_dataset(pth: Path) -> HDXDataSet:
     """
     Load a dataset from a JSON file or directory.
@@ -235,9 +240,6 @@ def load_dataset(self, dataset_id: str) -> HDXDataSet:
         return dataset
 
 
-DATABASE_URL = "https://raw.githubusercontent.com/Jhsmit/HDX-MS-datasets/master/datasets/"
-
-
 class RemoteDataBase(DataBase):
     """
     A database for HDX-MS datasets, with the ability to fetch datasets from a remote repository.
@@ -247,80 +249,88 @@ class RemoteDataBase(DataBase):
         remote_url: URL of the remote repository (default: DATABASE_URL).
     """
 
-    def __init__(self, database_dir: Path | str, remote_url: str = DATABASE_URL):
+    def __init__(
+        self,
+        database_dir: Path | str,
+        remote_url: str = DATABASE_URL,
+    ):
         super().__init__(database_dir)
         self.remote_url = remote_url
 
-    def get_index(self) -> nw.DataFrame:
-        """Retrieves the index of available datasets
+        index_url = urljoin(self.remote_url, CATALOG_FILE)
+        response = requests.get(index_url)
+
+        # TODO: keep catalogs on a per-url basis in a singleton
+        if response.ok:
+            df = read_csv(response.content)
+            self.datasets_catalog = df
+        else:
+            raise HTTPError(
+                index_url,
+                response.status_code,
+                "Error fetching dataset index",
+                response.headers,  # type: ignore
+                None,
+            )
 
-        on success, returns the index dataframe and
-        stores as `remote_index` attribute.
+    @property
+    def remote_datasets(self) -> list[str]:
+        """List of available datasets in the remote repository"""
+        return self.datasets_catalog["id"].to_list()
 
-        """
-        raise NotImplementedError()
+    @property
+    def local_datasets(self) -> list[str]:
+        """List of available datasets in the local database directory"""
+        return self.datasets
 
-    def fetch_dataset(self, data_id: str) -> bool:
+    def fetch_dataset(self, data_id: str) -> tuple[bool, str]:
         """
-        Download a dataset from the online repository to the cache dir
+        Download a dataset from the online repository to `database_dir`.
 
         Args:
             data_id: The ID of the dataset to download.
 
         Returns:
-            `True` if the dataset was downloaded successfully, `False` otherwise.
+            A tuple (success: bool, message: str):
+                - success: True if the dataset was downloaded successfully, False otherwise.
+                - message: A message indicating the result of the download.
         """
 
-        raise NotImplementedError()
-        output_pth = self.cache_dir / data_id
-        if output_pth.exists():
-            return False
-        else:
-            output_pth.mkdir()
-
-        dataset_url = urllib.parse.urljoin(self.remote_url, data_id + "/")
-
-        files = ["hdx_spec.yaml", "metadata.yaml"]
-        hdx_spec = None
-        for f in files + optional_files:
-            url = urllib.parse.urljoin(dataset_url, f)
-            response = requests.get(url)
+        if data_id not in self.remote_datasets:
+            return False, f"Dataset ID {data_id!r} not found in remote database."
 
-            if response.ok:
-                (output_pth / f).write_bytes(response.content)
-
-            elif f in files:
-                raise urllib.error.HTTPError(
-                    url,
-                    response.status_code,
-                    f"Error for file {f!r}",
-                    response.headers,  # type: ignore
-                    None,
-                )
+        json_url = urljoin(self.remote_url, data_id + "/dataset.json")
+        response = requests.get(json_url)
 
-            if f == "hdx_spec.yaml":
-                hdx_spec = yaml.safe_load(response.text)
+        # confirm that the JSON conforms to the spec
+        try:
+            dataset = HDXDataSet.model_validate_json(
+                response.content,
+            )
+        except Exception as e:
+            return False, f"Error validating dataset JSON: {e}"
 
-        if hdx_spec is None:
-            raise ValueError(f"Could not find HDX spec for data_id {data_id!r}")
+        # create a list of all Path objects in the dataset, plus the dataset.json file
+        data_files = list(set(extract_values_by_types(dataset, Path))) + [Path("dataset.json")]
 
-        data_pth = output_pth / "data"
-        data_pth.mkdir()
+        # create the target directory to store the dataset
+        output_pth = self.database_dir / data_id
+        if output_pth.exists():
+            return False, "Dataset already exists in the local database."
+        else:
+            output_pth.mkdir()
 
-        for file_spec in hdx_spec["data_files"].values():
-            filename = file_spec["filename"]
-            f_url = urllib.parse.urljoin(dataset_url, filename)
-            response = requests.get(f_url)
+        for data_file in data_files:
+            data_url = urljoin(self.remote_url, data_id + "/" + data_file.as_posix())
 
+            response = requests.get(data_url)
             if response.ok:
-                (output_pth / filename).write_bytes(response.content)
+                # write the file to disk
+                fpath = output_pth / data_file
+                fpath.parent.mkdir(parents=True, exist_ok=True)
+                fpath.write_bytes(response.content)
             else:
-                raise urllib.error.HTTPError(
-                    f_url,
-                    response.status_code,
-                    f"Error for data file {filename!r}",
-                    response.headers,  # type: ignore
-                    None,
-                )
-
-        return True
+                shutil.rmtree(output_pth)  # clean up partial download
+                return False, f"Failed to download {data_file}: {response.status_code}"
+
+        return True, ""
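
The new download path relies on extract_values_by_types to discover every file referenced by a dataset. Its real implementation lives in hdxms_datasets.models and is not part of this diff; the sketch below is only an illustration of the idea, assuming HDXDataSet is a pydantic model (which model_validate_json above suggests) and the helper recursively collects values of the requested types from nested models and containers:

from pathlib import Path
from typing import Any

from pydantic import BaseModel


def extract_values_by_types(obj: Any, *types: type) -> list[Any]:
    """Illustrative sketch: recursively collect values of the given types from a model tree."""
    found: list[Any] = []
    if isinstance(obj, types):
        found.append(obj)
    elif isinstance(obj, BaseModel):
        # walk every field of a pydantic model
        for name in type(obj).model_fields:
            found.extend(extract_values_by_types(getattr(obj, name), *types))
    elif isinstance(obj, dict):
        for value in obj.values():
            found.extend(extract_values_by_types(value, *types))
    elif isinstance(obj, (list, tuple, set)):
        for item in obj:
            found.extend(extract_values_by_types(item, *types))
    return found

Under that assumption, extract_values_by_types(dataset, Path) yields the relative data-file paths that fetch_dataset then mirrors from the remote repository.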

tests/test_hdxms_datasets.py

Lines changed: 12 additions & 1 deletion

@@ -5,7 +5,7 @@
 This replaces the old YAML-based tests with tests for the new API.
 """
 
-from hdxms_datasets.database import DataBase, load_dataset
+from hdxms_datasets.database import DataBase, RemoteDataBase, load_dataset
 from hdxms_datasets.models import HDXDataSet
 from pathlib import Path
 import pytest
@@ -68,6 +68,17 @@ def test_database_functionality(database: DataBase):
     assert isinstance(dataset, HDXDataSet)
 
 
+def test_remote_database(tmp_path: Path):
+    db = RemoteDataBase(tmp_path)
+    assert DATA_ID in db.remote_datasets
+
+    success, message = db.fetch_dataset(DATA_ID)
+    assert success, message
+
+    dataset = db.load_dataset(DATA_ID)
+    assert isinstance(dataset, HDXDataSet)
+
+
 def test_peptide_loading(dataset: HDXDataSet):
     """Test that peptides can be loaded and have expected structure"""
     state = dataset.states[0]