Skip to content
Open
17 changes: 7 additions & 10 deletions .scripts/ci/download_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,7 @@ def main(args: argparse.Namespace) -> None:
from anndata import AnnData

import squidpy as sq
from squidpy.datasets._downloader import get_downloader

downloader = get_downloader()
registry = downloader.registry
from squidpy.datasets._registry import dataset_names

# Visium samples tested in CI
visium_samples_to_cache = [
Expand All @@ -35,23 +32,23 @@ def main(args: argparse.Namespace) -> None:
logger.info("Cache: %s", settings.datasetdir)
logger.info(
"Would download: %d AnnData, %d images, %d SpatialData, %d Visium",
len(registry.anndata_datasets),
len(registry.image_datasets),
len(registry.spatialdata_datasets),
len(dataset_names("anndata")),
len(dataset_names("image")),
len(dataset_names("spatialdata")),
len(visium_samples_to_cache),
)
return

# Download all datasets - the downloader handles caching
for name in registry.anndata_datasets:
for name in dataset_names("anndata"):
obj = getattr(sq.datasets, name)()
assert isinstance(obj, AnnData)

for name in registry.image_datasets:
for name in dataset_names("image"):
obj = getattr(sq.datasets, name)()
assert isinstance(obj, sq.im.ImageContainer)

for name in registry.spatialdata_datasets:
for name in dataset_names("spatialdata"):
getattr(sq.datasets, name)()

for sample in visium_samples_to_cache:
Expand Down
7 changes: 4 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -59,13 +59,14 @@ dependencies = [
"omnipath>=1.0.7",
"pandas>=2.1",
"pillow>=8",
"pooch>=1.6",
"pyyaml>=6",
"pooch>=1.6", # used directly (pooch.Untar) in the visium loader
"scanpy>=1.9.3",
"scikit-image>=0.25",
# due to https://github.com/scikit-image/scikit-image/issues/6850 breaks rescale ufunc
"scikit-learn>=0.24",
"spatialdata>=0.7.2", # 0.7.2 dropped xarray-schema (pkg_resources break, #1115)
# dataset registry + downloader now live in scverse-misc
"scverse-misc[datasets]>=0.1.1",
"spatialdata>=0.7.2", # 0.7.2 dropped xarray-schema (pkg_resources break, #1115)

@selmanozleyen selmanozleyen Jun 26, 2026

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is there a diff on this line? Can you undo it?

"spatialdata>=0.7.2",            # 0.7.2 dropped xarray-schema (pkg_resources break, #1115)

"spatialdata-plot>=0.3.3",
"statsmodels>=0.12",
# https://github.com/scverse/squidpy/issues/526
Expand Down
40 changes: 14 additions & 26 deletions src/squidpy/datasets/_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,8 @@
from pathlib import Path
from typing import TYPE_CHECKING, Any, Literal

from scanpy import settings

from squidpy.datasets._downloader import get_downloader
from squidpy.datasets._registry import DatasetType, get_registry
from squidpy.datasets._downloader import download
from squidpy.datasets._registry import dataset_names, get_registry
from squidpy.read._utils import PathLike

if TYPE_CHECKING:
Expand Down Expand Up @@ -122,19 +120,11 @@ def visium(
:class:`anndata.AnnData`
Spatial AnnData object.
"""
# Validate sample_id against known names
downloader = get_downloader()

if sample_id not in downloader.registry:
msg = f"Unknown Visium sample: {sample_id}. "
msg += f"Available samples: {downloader.registry.visium_datasets}"
raise ValueError(msg)

# Use scanpy.settings.datasetdir/visium if base_dir not specified
if base_dir is None:
base_dir = Path(settings.datasetdir) / "visium"
if sample_id not in get_registry():
raise ValueError(f"Unknown Visium sample: {sample_id}. Available samples: {dataset_names('visium_10x')}")
Comment on lines +123 to +124

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A valid-but-wrong-type name (e.g. `imc`, an AnnData dataset) passes the guard and then fails deep inside the AnnData loader with a confusing `TypeError: read_h5ad() got an unexpected keyword argument 'include_hires_tiff'` instead of a clear error.

Suggested change
if sample_id not in get_registry():
raise ValueError(f"Unknown Visium sample: {sample_id}. Available samples: {dataset_names('visium_10x')}")
visium_samples = dataset_names("visium_10x")
if sample_id not in visium_samples:
raise ValueError(f"Unknown Visium sample: {sample_id}. Available samples: {visium_samples}")


return downloader.download(sample_id, base_dir, include_hires_tiff=include_hires_tiff)
# downloads land in <datasetdir>/visium_10x/<sample_id>/

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This used to be /x/visium/x/ in the old code, but I guess it's fine to move the cache path. The change also shows up in:

-    expected_image_path = (Path(settings.datasetdir) / "visium" / sample / "image.tif").resolve()
+    expected_image_path = (Path(settings.datasetdir) / "visium_10x" / sample / "image.tif").resolve()

Still, it would be good to document the new cache location.

return download(sample_id, base_dir, include_hires_tiff=include_hires_tiff)


def visium_hne_sdata(folderpath: Path | str | None = None) -> sd.SpatialData:
Expand All @@ -152,8 +142,7 @@ def visium_hne_sdata(folderpath: Path | str | None = None) -> sd.SpatialData:
:class:`spatialdata.SpatialData`
The downloaded and extracted Visium H&E dataset.
"""
downloader = get_downloader()
return downloader.download("visium_hne_sdata", folderpath)
return download("visium_hne_sdata", folderpath)


def cells(folderpath: Path | str | None = None) -> sd.SpatialData:
Expand All @@ -171,8 +160,7 @@ def cells(folderpath: Path | str | None = None) -> sd.SpatialData:
:class:`spatialdata.SpatialData`
The downloaded and extracted cells dataset.
"""
downloader = get_downloader()
return downloader.download("cells", folderpath)
return download("cells", folderpath)


# =============================================================================
Expand Down Expand Up @@ -204,9 +192,9 @@ class _DocParts:
return_type=":class:`squidpy.im.ImageContainer`\n The image data.",
)

_DOC_PARTS_BY_TYPE: dict[DatasetType, _DocParts] = {
DatasetType.ANNDATA: _ANNDATA_DOC,
DatasetType.IMAGE: _IMAGE_DOC,
_DOC_PARTS_BY_TYPE: dict[str, _DocParts] = {
"anndata": _ANNDATA_DOC,
"image": _IMAGE_DOC,
}


Expand All @@ -225,12 +213,12 @@ def _make_loader(dataset_name: str):
raise ValueError(f"Unsupported type for loader factory: {entry.type}")

def loader(path: PathLike | None = None, **kwargs: Any):
return get_downloader().download(dataset_name, path, **kwargs)
return download(dataset_name, path, **kwargs)

loader.__doc__ = f"""
{entry.doc_header}
{entry.metadata.get("doc_header")}

{doc_parts.shape_prefix} ``{entry.shape}``.
{doc_parts.shape_prefix} ``{entry.metadata.get("shape")}``.

Parameters
----------
Expand Down
Loading
Loading