Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
1f0ef4e
init
selmanozleyen Dec 6, 2025
0a063ce
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 6, 2025
b301fdc
Merge branch 'main' into add-dataset-hashes
selmanozleyen Dec 6, 2025
2f7cd58
linter errors
selmanozleyen Dec 6, 2025
63a5d58
readthedocs fix
selmanozleyen Dec 6, 2025
a2cb8bc
extension bug fix
selmanozleyen Dec 7, 2025
8e3685e
Merge branch 'main' into add-dataset-hashes
selmanozleyen Dec 7, 2025
03288db
use cache dir
selmanozleyen Dec 8, 2025
b06f7a7
all downloads cache to squidpy default. Don't use scanpy default sinc…
selmanozleyen Dec 8, 2025
e7f8399
format
selmanozleyen Dec 8, 2025
dc21ced
add docs
selmanozleyen Dec 8, 2025
5f75195
PathLike refactor
selmanozleyen Dec 8, 2025
001a733
redirect notebooks to the correct module
selmanozleyen Dec 8, 2025
ec6b02b
update script
selmanozleyen Dec 8, 2025
f086493
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 8, 2025
7ce86a3
since we have the hash of downloaded files we don't need to update fo…
selmanozleyen Dec 8, 2025
b4a0074
Merge branch 'add-dataset-hashes' of https://github.com/selmanozleyen…
selmanozleyen Dec 8, 2025
2adb236
update script
selmanozleyen Dec 9, 2025
09e3cba
format
selmanozleyen Dec 9, 2025
d51b08c
remove agent spoofing
selmanozleyen Dec 9, 2025
8e99d3c
remove fallbacks
selmanozleyen Dec 9, 2025
383c820
if path is not None
selmanozleyen Dec 9, 2025
adc9d53
more structured logic
selmanozleyen Dec 9, 2025
d38caf2
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 9, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 11 additions & 10 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,14 +42,15 @@ jobs:
id: data-cache
uses: actions/cache@v4
with:
path: |
~/.cache/squidpy/*.h5ad
~/.cache/squidpy/*.zarr
path: ~/.cache/squidpy
key: data-${{ hashFiles('**/download_data.py') }}
restore-keys: |
data-
enableCrossOsArchive: true

- name: Download datasets
if: steps.data-cache.outputs.cache-hit != 'true'
# Always run to ensure any missing files are downloaded
# (restore-keys may provide partial cache)
run: uvx hatch run data:download

# Get the test environment from hatch as defined in pyproject.toml.
Expand Down Expand Up @@ -122,10 +123,10 @@ jobs:
id: data-cache
uses: actions/cache@v4
with:
path: |
~/.cache/squidpy/*.h5ad
~/.cache/squidpy/*.zarr
path: ~/.cache/squidpy
key: data-${{ hashFiles('**/download_data.py') }}
restore-keys: |
data-
enableCrossOsArchive: true

- name: System dependencies (Linux)
Expand Down Expand Up @@ -181,10 +182,10 @@ jobs:
id: coverage-data-cache
uses: actions/cache@v4
with:
path: |
~/.cache/squidpy/*.h5ad
~/.cache/squidpy/*.zarr
path: ~/.cache/squidpy
key: data-${{ hashFiles('**/download_data.py') }}
restore-keys: |
data-
enableCrossOsArchive: true

- name: System dependencies (Linux)
Expand Down
92 changes: 43 additions & 49 deletions .scripts/ci/download_data.py
Original file line number Diff line number Diff line change
@@ -1,77 +1,71 @@
#!/usr/bin/env python3
"""Download datasets to populate CI cache.

This script downloads all datasets that tests might need.
The downloader handles caching to DEFAULT_CACHE_DIR (~/.cache/squidpy).
"""

from __future__ import annotations

import argparse
from pathlib import Path
from typing import Any

from squidpy.datasets import visium_hne_sdata
import logging

_CNT = 0 # increment this when you want to rebuild the CI cache
_ROOT = Path.home() / ".cache" / "squidpy"


def _print_message(func_name: str, path: Path, *, dry_run: bool = False) -> None:
prefix = "[DRY RUN]" if dry_run else ""
if path.is_file():
print(f"{prefix}[Loading] {func_name:>25} <- {str(path):>25}")
else:
print(f"{prefix}[Downloading] {func_name:>25} -> {str(path):>25}")


def _maybe_download_data(func_name: str, path: Path) -> Any:
    """Fetch dataset *func_name* into *path*, retrying once after removing a corrupted cache file.

    The first attempt may load a previously cached (possibly truncated or
    corrupted) file; if it raises, the cached file is deleted and the download
    is retried once. A failure on the retry propagates to the caller.

    Parameters
    ----------
    func_name
        Name of a loader function in ``squidpy.datasets``.
    path
        Target location of the cached dataset file.

    Returns
    -------
    Whatever the dataset loader returns (e.g. an ``AnnData`` or an image container).
    """
    # Imported lazily so merely importing this script stays cheap.
    import squidpy as sq

    try:
        return getattr(sq.datasets, func_name)(path=path)
    except Exception as e:  # noqa: BLE001
        print(f"File {str(path):>25} seems to be corrupted: {e}. Removing and retrying")
        # missing_ok=True: the failure may have occurred before the file was
        # (fully) created; a bare unlink() would then raise FileNotFoundError
        # and mask the original download error.
        path.unlink(missing_ok=True)

    return getattr(sq.datasets, func_name)(path=path)
logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger(__name__)


def main(args: argparse.Namespace) -> None:
from anndata import AnnData

import squidpy as sq
from squidpy.datasets._downloader import DEFAULT_CACHE_DIR
from squidpy.datasets._registry import get_registry

all_datasets = sq.datasets._dataset.__all__ + sq.datasets._image.__all__
all_extensions = ["h5ad"] * len(sq.datasets._dataset.__all__) + ["tiff"] * len(sq.datasets._image.__all__)
registry = get_registry()

# Visium samples tested in CI
visium_samples_to_cache = [
"V1_Mouse_Kidney",
"Targeted_Visium_Human_SpinalCord_Neuroscience",
"Visium_FFPE_Human_Breast_Cancer",
]

if args.dry_run:
for func_name, ext in zip(all_datasets, all_extensions):
if func_name == "visium_hne_sdata":
ext = "zarr"
path = _ROOT / f"{func_name}.{ext}"
_print_message(func_name, path, dry_run=True)
logger.info("Cache: %s", DEFAULT_CACHE_DIR)
logger.info(
"Would download: %d AnnData, %d images, %d SpatialData, %d Visium",
len(registry.anndata_datasets),
len(registry.image_datasets),
len(registry.spatialdata_datasets),
len(visium_samples_to_cache),
)
return

# could be parallelized, but on CI it largely does not matter (usually limited to 2 cores + bandwidth limit)
for func_name, ext in zip(all_datasets, all_extensions):
if func_name == "visium_hne_sdata":
ext = "zarr"
path = _ROOT / f"{func_name}.{ext}"

_print_message(func_name, path)
obj = visium_hne_sdata(_ROOT)
# Download all datasets - the downloader handles caching
for name in registry.anndata_datasets:
obj = getattr(sq.datasets, name)()
assert isinstance(obj, AnnData)

assert path.is_dir(), f"Expected a .zarr folder at {path}"
continue
for name in registry.image_datasets:
obj = getattr(sq.datasets, name)()
assert isinstance(obj, sq.im.ImageContainer)

path = _ROOT / f"{func_name}.{ext}"
_print_message(func_name, path)
obj = _maybe_download_data(func_name, path)
for name in registry.spatialdata_datasets:
getattr(sq.datasets, name)()

# we could do without the AnnData check as well (1 less req. in tox.ini), but it's better to be safe
assert isinstance(obj, AnnData | sq.im.ImageContainer), type(obj)
assert path.is_file(), path
for sample in visium_samples_to_cache:
obj = sq.datasets.visium(sample, include_hires_tiff=True)
assert isinstance(obj, AnnData)


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Download data used for tutorials/examples.")
parser = argparse.ArgumentParser(description="Download datasets to populate CI cache.")
parser.add_argument(
"--dry-run", action="store_true", help="Do not download any data, just print what would be downloaded."
"--dry-run",
action="store_true",
help="Do not download, just print what would be downloaded.",
)

main(parser.parse_args())
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ dependencies = [
"omnipath>=1.0.7",
"pandas>=2.1",
"pillow>=8",
"pooch>=1.6",
"pyyaml>=6",
"scanpy>=1.9.3",
"scikit-image>=0.25",
# due to https://github.com/scikit-image/scikit-image/issues/6850 breaks rescale ufunc
Expand Down
166 changes: 0 additions & 166 deletions src/squidpy/datasets/_10x_datasets.py

This file was deleted.

Loading