Commit e0d1fda
feat: allow downloading datasets using RemoteDataBase
1 parent 29505ff

File tree: 3 files changed, +118 -62 lines

examples/remote_database.py

Lines changed: 35 additions & 0 deletions

@@ -0,0 +1,35 @@
+"""
+Example script demonstrating how to use the RemoteDataBase class to fetch and load datasets
+from a remote HDX-MS database.
+"""
+
+# %%
+
+from hdxms_datasets import RemoteDataBase
+from pathlib import Path
+
+DATABASE_URL = "https://raw.githubusercontent.com/Jhsmit/HDXMS-database/master/datasets/"
+
+# %%
+
+# create a local directory to store fetched datasets
+database_dir = Path.cwd() / "datasets"
+database_dir.mkdir(parents=True, exist_ok=True)
+
+# connect to a remote database
+# omit the remote_url parameter to use the default
+# creating the db will automatically fetch the datasets catalog
+db = RemoteDataBase(database_dir, remote_url=DATABASE_URL)
+db.datasets_catalog.to_native()
+
+# %%
+# fetch the first available dataset; if successful, it is saved to `database_dir`
+data_id = db.remote_datasets[0]
+success, message = db.fetch_dataset(data_id)
+success, message
+
+# %%
+# load the dataset from disk
+dataset = db.load_dataset(data_id)
+dataset
+# %%
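
Since fetch_dataset reports failures through its (success, message) return value instead of raising, callers should branch on the result. A minimal sketch of defensive usage, assuming a dataset that fails to fetch may already be on disk from an earlier run (fetch_dataset returns False in that case too):

from pathlib import Path

from hdxms_datasets import RemoteDataBase

db = RemoteDataBase(Path.cwd() / "datasets")

for data_id in db.remote_datasets:
    success, message = db.fetch_dataset(data_id)
    # fetch_dataset also returns False when the dataset already exists locally,
    # so fall back to the local copy before giving up
    if success or data_id in db.local_datasets:
        dataset = db.load_dataset(data_id)
    else:
        print(f"Skipping {data_id}: {message}")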

hdxms_datasets/database.py

Lines changed: 71 additions & 61 deletions

@@ -1,18 +1,23 @@
 from pathlib import Path
 
-import urllib
+from urllib.parse import urljoin
+from urllib.error import HTTPError
 import uuid
 
 import requests
-from hdxms_datasets.loader import BACKEND
-from hdxms_datasets.models import HDXDataSet
+from hdxms_datasets.loader import BACKEND, read_csv
+from hdxms_datasets.models import HDXDataSet, extract_values_by_types
 import shutil
 import narwhals as nw
 
 from hdxms_datasets.utils import records_to_dict
 from hdxms_datasets.verification import verify_dataset
 
 
+CATALOG_FILE = "datasets_catalog.csv"
+DATABASE_URL = "https://raw.githubusercontent.com/Jhsmit/HDXMS-database/master/datasets/"
+
+
 def load_dataset(pth: Path) -> HDXDataSet:
     """
     Load a dataset from a JSON file or directory.
@@ -235,9 +240,6 @@ def load_dataset(self, dataset_id: str) -> HDXDataSet:
         return dataset
 
 
-DATABASE_URL = "https://raw.githubusercontent.com/Jhsmit/HDX-MS-datasets/master/datasets/"
-
-
 class RemoteDataBase(DataBase):
     """
     A database for HDX-MS datasets, with the ability to fetch datasets from a remote repository.
@@ -247,80 +249,88 @@ class RemoteDataBase(DataBase):
         remote_url: URL of the remote repository (default: DATABASE_URL).
     """
 
-    def __init__(self, database_dir: Path | str, remote_url: str = DATABASE_URL):
+    def __init__(
+        self,
+        database_dir: Path | str,
+        remote_url: str = DATABASE_URL,
+    ):
         super().__init__(database_dir)
         self.remote_url = remote_url
 
-    def get_index(self) -> nw.DataFrame:
-        """Retrieves the index of available datasets
+        index_url = urljoin(self.remote_url, CATALOG_FILE)
+        response = requests.get(index_url)
+
+        # TODO: keep catalogs on a per-url basis in a singleton
+        if response.ok:
+            df = read_csv(response.content)
+            self.datasets_catalog = df
+        else:
+            raise HTTPError(
+                index_url,
+                response.status_code,
+                "Error fetching dataset index",
+                response.headers,  # type: ignore
+                None,
+            )
 
-        on success, returns the index dataframe and
-        stores as `remote_index` attribute.
+    @property
+    def remote_datasets(self) -> list[str]:
+        """List of available datasets in the remote repository"""
+        return self.datasets_catalog["id"].to_list()
 
-        """
-        raise NotImplementedError()
+    @property
+    def local_datasets(self) -> list[str]:
+        """List of available datasets in the local database directory"""
+        return self.datasets
 
-    def fetch_dataset(self, data_id: str) -> bool:
+    def fetch_dataset(self, data_id: str) -> tuple[bool, str]:
         """
-        Download a dataset from the online repository to the cache dir
+        Download a dataset from the online repository to `database_dir`.
 
         Args:
             data_id: The ID of the dataset to download.
 
         Returns:
-            `True` if the dataset was downloaded successfully, `False` otherwise.
+            A tuple (success: bool, message: str):
+                - success: True if the dataset was downloaded successfully, False otherwise.
+                - message: A message indicating the result of the download.
         """
 
-        raise NotImplementedError()
-        output_pth = self.cache_dir / data_id
-        if output_pth.exists():
-            return False
-        else:
-            output_pth.mkdir()
-
-        dataset_url = urllib.parse.urljoin(self.remote_url, data_id + "/")
-
-        files = ["hdx_spec.yaml", "metadata.yaml"]
-        hdx_spec = None
-        for f in files + optional_files:
-            url = urllib.parse.urljoin(dataset_url, f)
-            response = requests.get(url)
+        if data_id not in self.remote_datasets:
+            return False, f"Dataset ID {data_id!r} not found in remote database."
 
-            if response.ok:
-                (output_pth / f).write_bytes(response.content)
-
-            elif f in files:
-                raise urllib.error.HTTPError(
-                    url,
-                    response.status_code,
-                    f"Error for file {f!r}",
-                    response.headers,  # type: ignore
-                    None,
-                )
+        json_url = urljoin(self.remote_url, data_id + "/dataset.json")
+        response = requests.get(json_url)
 
-            if f == "hdx_spec.yaml":
-                hdx_spec = yaml.safe_load(response.text)
+        # confirm that the JSON conforms to the spec
+        try:
+            dataset = HDXDataSet.model_validate_json(
+                response.content,
+            )
+        except Exception as e:
+            return False, f"Error validating dataset JSON: {e}"
 
-        if hdx_spec is None:
-            raise ValueError(f"Could not find HDX spec for data_id {data_id!r}")
+        # create a list of all Path objects in the dataset, plus the dataset.json file
+        data_files = list(set(extract_values_by_types(dataset, Path))) + [Path("dataset.json")]
 
-        data_pth = output_pth / "data"
-        data_pth.mkdir()
+        # create the target directory to store the dataset
+        output_pth = self.database_dir / data_id
+        if output_pth.exists():
+            return False, "Dataset already exists in the local database."
+        else:
+            output_pth.mkdir()
 
-        for file_spec in hdx_spec["data_files"].values():
-            filename = file_spec["filename"]
-            f_url = urllib.parse.urljoin(dataset_url, filename)
-            response = requests.get(f_url)
+        for data_file in data_files:
+            data_url = urljoin(self.remote_url, data_id + "/" + data_file.as_posix())
 
+            response = requests.get(data_url)
             if response.ok:
-                (output_pth / filename).write_bytes(response.content)
+                # write the file to disk
+                fpath = output_pth / data_file
+                fpath.parent.mkdir(parents=True, exist_ok=True)
+                fpath.write_bytes(response.content)
             else:
-                raise urllib.error.HTTPError(
-                    f_url,
-                    response.status_code,
-                    f"Error for data file {filename!r}",
-                    response.headers,  # type: ignore
-                    None,
-                )
-
-        return True
+                shutil.rmtree(output_pth)  # clean up partial download
+                return False, f"Failed to download {data_file}: {response.status_code}"
+
+        return True, ""
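
The new download path relies on extract_values_by_types to discover every file referenced by a dataset. Its real implementation lives in hdxms_datasets.models and is not part of this diff; the sketch below is only an illustration of the idea, assuming HDXDataSet is a pydantic model (which model_validate_json above suggests) and the helper recursively collects values of the requested types from nested models and containers:

from pathlib import Path
from typing import Any

from pydantic import BaseModel


def extract_values_by_types(obj: Any, *types: type) -> list[Any]:
    """Illustrative sketch: recursively collect values of the given types from a model tree."""
    found: list[Any] = []
    if isinstance(obj, types):
        found.append(obj)
    elif isinstance(obj, BaseModel):
        # walk every field of a pydantic model
        for name in type(obj).model_fields:
            found.extend(extract_values_by_types(getattr(obj, name), *types))
    elif isinstance(obj, dict):
        for value in obj.values():
            found.extend(extract_values_by_types(value, *types))
    elif isinstance(obj, (list, tuple, set)):
        for item in obj:
            found.extend(extract_values_by_types(item, *types))
    return found

Under that assumption, extract_values_by_types(dataset, Path) yields the relative data-file paths that fetch_dataset then mirrors from the remote repository.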

tests/test_hdxms_datasets.py

Lines changed: 12 additions & 1 deletion

@@ -5,7 +5,7 @@
 This replaces the old YAML-based tests with tests for the new API.
 """
 
-from hdxms_datasets.database import DataBase, load_dataset
+from hdxms_datasets.database import DataBase, RemoteDataBase, load_dataset
 from hdxms_datasets.models import HDXDataSet
 from pathlib import Path
 import pytest
@@ -68,6 +68,17 @@ def test_database_functionality(database: DataBase):
     assert isinstance(dataset, HDXDataSet)
 
 
+def test_remote_database(tmp_path: Path):
+    db = RemoteDataBase(tmp_path)
+    assert DATA_ID in db.remote_datasets
+
+    success, message = db.fetch_dataset(DATA_ID)
+    assert success, message
+
+    dataset = db.load_dataset(DATA_ID)
+    assert isinstance(dataset, HDXDataSet)
+
+
 def test_peptide_loading(dataset: HDXDataSet):
     """Test that peptides can be loaded and have expected structure"""
     state = dataset.states[0]