Skip to content

Commit 2b2e593

Browse files
Added function to download Visium as SpatialData (#949)
* Added function to download example SpatialData
* self-review
* added test
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* updated dataset id
* fixed tox.ini
* attempt to include sdata download
* added func import to _datasets so __all__ can see it
* attempt to hack in sdata download 2
* debug output for runner
* adjusted function
* refactored out download/extract code
* updated failing image test
* made photometric parameter robust to newest tifffile release
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 701d585 commit 2b2e593

File tree

7 files changed

+130
-8
lines changed

7 files changed

+130
-8
lines changed

.scripts/ci/download_data.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
from pathlib import Path
66
from typing import Any
77

8+
from squidpy.datasets import visium_hne_sdata
9+
810
_CNT = 0 # increment this when you want to rebuild the CI cache
911
_ROOT = Path.home() / ".cache" / "squidpy"
1012

@@ -39,14 +41,25 @@ def main(args: argparse.Namespace) -> None:
3941

4042
if args.dry_run:
4143
for func_name, ext in zip(all_datasets, all_extensions):
44+
if func_name == "visium_hne_sdata":
45+
ext = "zarr"
4246
path = _ROOT / f"{func_name}.{ext}"
4347
_print_message(func_name, path, dry_run=True)
4448
return
4549

4650
# could be parallelized, but on CI it largely does not matter (usually limited to 2 cores + bandwidth limit)
4751
for func_name, ext in zip(all_datasets, all_extensions):
48-
path = _ROOT / f"{func_name}.{ext}"
52+
if func_name == "visium_hne_sdata":
53+
ext = "zarr"
54+
path = _ROOT / f"{func_name}.{ext}"
4955

56+
_print_message(func_name, path)
57+
obj = visium_hne_sdata(_ROOT)
58+
59+
assert path.is_dir(), f"Expected a .zarr folder at {path}"
60+
continue
61+
62+
path = _ROOT / f"{func_name}.{ext}"
5063
_print_message(func_name, path)
5164
obj = _maybe_download_data(func_name, path)
5265

src/squidpy/datasets/_10x_datasets.py

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
from __future__ import annotations
22

3+
import os
4+
import shutil
35
import tarfile
46
from pathlib import Path
57
from typing import (
@@ -8,12 +10,16 @@
810
Union, # noqa: F401
911
)
1012

13+
import spatialdata as sd
1114
from anndata import AnnData
1215
from scanpy import _utils
16+
from scanpy import logging as logg
1317
from scanpy._settings import settings
18+
from scanpy._utils import check_presence_download
19+
from spatialdata import SpatialData
1420

1521
from squidpy._constants._constants import TenxVersions
16-
from squidpy.datasets._utils import PathLike
22+
from squidpy.datasets._utils import DEFAULT_CACHE_DIR, PathLike, _get_zipped_dataset
1723

1824
__all__ = ["visium"]
1925

@@ -106,7 +112,9 @@ def visium(
106112

107113
url_prefix = f"https://cf.10xgenomics.com/samples/spatial-exp/{spaceranger_version}/{sample_id}/"
108114
visium_files = VisiumFiles(
109-
f"{sample_id}_filtered_feature_bc_matrix.h5", f"{sample_id}_spatial.tar.gz", f"{sample_id}_image.tif"
115+
f"{sample_id}_filtered_feature_bc_matrix.h5",
116+
f"{sample_id}_spatial.tar.gz",
117+
f"{sample_id}_image.tif",
110118
)
111119

112120
# download spatial data
@@ -134,3 +142,38 @@ def visium(
134142
)
135143

136144
return read_visium(base_dir / sample_id)
145+
146+
147+
def visium_hne_sdata(folderpath: Path | str | None = None) -> sd.SpatialData:
    """
    Download a Visium H&E dataset into a folder and return it as a `SpatialData` object.

    The dataset is downloaded and extracted into:
    - `<folderpath>/visium_hne_sdata.zip` for the compressed file
    - `<folderpath>/visium_hne_sdata.zarr` for the extracted dataset

    Parameters
    ----------
    folderpath : Path | str | None
        A folder path where the dataset will be downloaded and extracted. The resulting `.zarr`
        folder is used to load the `SpatialData` object. An explicitly given folder must already
        exist. If `None`, `DEFAULT_CACHE_DIR` is used and created when it is missing.

    Returns
    -------
    SpatialData
        The downloaded and extracted Visium H&E dataset as a `SpatialData` object.

    Raises
    ------
    ValueError
        If `folderpath` is given but is not an existing directory.
    RuntimeError
        If the download or the extraction fails.
    """

    FIGSHARE_ID = "52370645"
    DATASET_NAME = "visium_hne_sdata"

    if folderpath is None:
        folderpath = DEFAULT_CACHE_DIR
        # The default cache location may not exist yet (e.g. first use on a fresh machine).
        # Create it so that the default call does not fail the directory check downstream.
        folderpath.mkdir(parents=True, exist_ok=True)
    else:
        # User-supplied folders are deliberately NOT created: a typo in the path should
        # surface as an error rather than silently produce a new directory tree.
        folderpath = Path(folderpath).expanduser().absolute()

    return _get_zipped_dataset(
        folderpath=folderpath,
        dataset_name=DATASET_NAME,
        figshare_id=FIGSHARE_ID,
    )

src/squidpy/datasets/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from __future__ import annotations
22

3-
from squidpy.datasets._10x_datasets import visium
3+
from squidpy.datasets._10x_datasets import visium, visium_hne_sdata
44
from squidpy.datasets._dataset import * # noqa: F403
55
from squidpy.datasets._image import * # noqa: F403

src/squidpy/datasets/_dataset.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from copy import copy
44

5+
from squidpy.datasets._10x_datasets import visium_hne_sdata
56
from squidpy.datasets._utils import AMetadata
67

78
_4i = AMetadata(
@@ -90,6 +91,7 @@
9091
"seqfish",
9192
"visium_hne_adata",
9293
"visium_hne_adata_crop",
94+
"visium_hne_sdata",
9395
"visium_fluo_adata",
9496
"visium_fluo_adata_crop",
9597
"sc_mouse_cortex",

src/squidpy/datasets/_utils.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
import os
4+
import shutil
45
from abc import ABC, abstractmethod
56
from collections.abc import Callable, Sequence
67
from dataclasses import dataclass, field
@@ -9,13 +10,15 @@
910
from typing import Any, TypeAlias, Union
1011

1112
import anndata
13+
import spatialdata as sd
1214
from anndata import AnnData
1315
from scanpy import logging as logg
1416
from scanpy import read
1517
from scanpy._utils import check_presence_download
1618

1719
PathLike: TypeAlias = os.PathLike[str] | str
1820
Function_t: TypeAlias = Callable[..., AnnData | Any]
21+
DEFAULT_CACHE_DIR = Path.home() / ".cache" / "squidpy"
1922

2023

2124
@dataclass(frozen=True)
@@ -177,3 +180,42 @@ def _download(self, fpath: PathLike, backup_url: str, **kwargs: Any) -> Any:
177180
@property
178181
def _extension(self) -> str:
179182
return ".tiff"
183+
184+
185+
def _get_zipped_dataset(folderpath: Path, dataset_name: str, figshare_id: str) -> sd.SpatialData:
    """
    Return a dataset as a `SpatialData` object, downloading and extracting it if needed.

    The archive is stored at `<folderpath>/<dataset_name>.zip` and is expected to unpack into
    `<folderpath>/<dataset_name>.zarr`. If the `.zarr` path already exists, it is read directly
    and nothing is downloaded.

    Parameters
    ----------
    folderpath : Path
        Existing directory used for both the downloaded archive and the extracted dataset.
    dataset_name : str
        Base name (without extension) of the archive and of the extracted `.zarr` folder.
    figshare_id : str
        Identifier appended to `https://ndownloader.figshare.com/files/` to build the download URL.

    Returns
    -------
    SpatialData
        The dataset read from the extracted `.zarr` path.

    Raises
    ------
    ValueError
        If `folderpath` is not an existing directory.
    RuntimeError
        If the download fails, the extraction fails, or the archive did not produce the
        expected `.zarr` path.
    """

    if not folderpath.is_dir():
        raise ValueError(f"Expected a directory path for `folderpath`, found: {folderpath}")

    download_zip = folderpath / f"{dataset_name}.zip"
    extracted_path = folderpath / f"{dataset_name}.zarr"

    # Return early if data is already extracted
    if extracted_path.exists():
        logg.info(f"Loading existing dataset from {extracted_path}")
        return sd.read_zarr(extracted_path)

    # Download if necessary
    if not download_zip.exists():
        # Name the dataset actually being fetched; this helper is not Visium-specific.
        logg.info(f"Downloading dataset `{dataset_name}` to {download_zip}")
        try:
            check_presence_download(
                filename=download_zip,
                backup_url=f"https://ndownloader.figshare.com/files/{figshare_id}",
            )
        except Exception as e:
            raise RuntimeError(f"Failed to download dataset: {e}") from e

    # `extracted_path` is known to be absent here (see the early return above), so extraction
    # is always required at this point.
    logg.info(f"Extracting dataset from {download_zip} to {extracted_path}")
    try:
        shutil.unpack_archive(str(download_zip), folderpath)
    except Exception as e:
        # A failed extraction can leave a partial `.zarr` folder behind. Remove it so that the
        # next call does not take the early-return branch and try to read a broken dataset.
        shutil.rmtree(extracted_path, ignore_errors=True)
        raise RuntimeError(f"Failed to extract dataset: {e}") from e

    if not extracted_path.exists():
        raise RuntimeError(f"Expected extracted data at {extracted_path}, but not found")

    return sd.read_zarr(extracted_path)

tests/datasets/test_download_visium_dataset.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,22 @@
88
from pathlib import Path
99

1010
import pytest
11+
import spatialdata as sd
1112
from anndata.tests.helpers import assert_adata_equal
1213
from scanpy._settings import settings
1314

14-
from squidpy.datasets import visium
15+
from squidpy.datasets import visium, visium_hne_sdata
1516

1617

1718
@pytest.mark.timeout(120)
1819
@pytest.mark.internet()
1920
@pytest.mark.parametrize(
20-
"sample", ["V1_Mouse_Kidney", "Targeted_Visium_Human_SpinalCord_Neuroscience", "Visium_FFPE_Human_Breast_Cancer"]
21+
"sample",
22+
[
23+
"V1_Mouse_Kidney",
24+
"Targeted_Visium_Human_SpinalCord_Neuroscience",
25+
"Visium_FFPE_Human_Breast_Cancer",
26+
],
2127
)
2228
def test_visium_datasets(tmpdir, sample):
2329
# Tests that reading / downloading datasets works and it does not have any global effects
@@ -43,3 +49,12 @@ def test_visium_datasets(tmpdir, sample):
4349
process = subprocess.run(["file", "--mime-type", image_path], stdout=subprocess.PIPE)
4450
output = process.stdout.strip().decode() # make process output string
4551
assert output == str(image_path) + ": image/tiff"
52+
53+
54+
@pytest.mark.timeout(120)
55+
@pytest.mark.internet()
56+
def test_visium_sdata_dataset(tmpdir):
57+
sdata = visium_hne_sdata(Path(tmpdir))
58+
assert isinstance(sdata, sd.SpatialData)
59+
assert list(sdata.shapes.keys()) == ["spots"]
60+
assert list(sdata.images.keys()) == ["hne"]

tests/image/test_io.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,15 @@ class TestIO:
1717
def _create_image(path: str, shape: tuple[int, ...]):
1818
dtype = np.uint8 if len(shape) <= 3 else np.float32
1919
img = np.random.randint(0, 255, size=shape).astype(dtype)
20-
# set `photometric` to remove warnings
21-
tifffile.imwrite(path, img, photometric=tifffile.TIFF.PHOTOMETRIC.MINISBLACK)
20+
21+
try:
22+
# Old usage (works up to tifffile<2025.2.18)
23+
photometric = tifffile.TIFF.PHOTOMETRIC.MINISBLACK
24+
except AttributeError:
25+
# New usage (works on tifffile >=2025.2.18)
26+
photometric = "MINISBLACK"
27+
28+
tifffile.imwrite(path, img, photometric=photometric)
2229

2330
return img
2431

0 commit comments

Comments
 (0)