From 5718cba18129a154985a8f1c8a16ba4d9892c2af Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Fri, 24 Jan 2025 10:34:26 +0100 Subject: [PATCH 01/62] add basic support and remote SpatialData tests --- pyproject.toml | 4 +- tests/io/test_remote.py | 23 ++++ tests/io/test_remote_mock.py | 249 +++++++++++++++++++++++++++++++++++ 3 files changed, 275 insertions(+), 1 deletion(-) create mode 100644 tests/io/test_remote.py create mode 100644 tests/io/test_remote_mock.py diff --git a/pyproject.toml b/pyproject.toml index 0474f6f02..5bf31a040 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,13 +26,14 @@ dependencies = [ "click", "dask-image", "dask>=2024.4.1,<=2024.11.2", - "fsspec", + "fsspec[s3,http]", "geopandas>=0.14", "multiscale_spatial_image>=2.0.2", "networkx", "numba", "numpy", "ome_zarr>=0.8.4", + "universal_pathlib>=0.2.0", "pandas", "pooch", "pyarrow", @@ -71,6 +72,7 @@ test = [ "pytest-cov", "pytest-mock", "torch", + "moto[s3,server]" ] benchmark = [ "asv", diff --git a/tests/io/test_remote.py b/tests/io/test_remote.py new file mode 100644 index 000000000..e5ee41eff --- /dev/null +++ b/tests/io/test_remote.py @@ -0,0 +1,23 @@ +import pytest +import zarr + +from spatialdata import SpatialData + + +class TestRemote: + # Test actual remote datasets from https://spatialdata.scverse.org/en/latest/tutorials/notebooks/datasets/README.html + # These tests are disabled by default because they require internet access + @pytest.fixture(params=["merfish", "mibitof"]) + def s3_address(self, request): + urls = { + "merfish": "https://s3.embl.de/spatialdata/spatialdata-sandbox/merfish.zarr/", + "mibitof": "https://s3.embl.de/spatialdata/spatialdata-sandbox/mibitof.zarr/", + } + return urls[request.param] + + # TODO: does not work, problem with opening remote parquet + @pytest.mark.xfail(reason="Problem with opening remote parquet") + def test_remote(self, s3_address): + root = zarr.open_consolidated(s3_address, mode="r", metadata_key="zmetadata") + sdata = SpatialData.read(root) + assert len(list(sdata.gen_elements())) > 0 diff --git a/tests/io/test_remote_mock.py b/tests/io/test_remote_mock.py new file mode 100644 index 000000000..a7a304c55 --- /dev/null +++ b/tests/io/test_remote_mock.py @@ -0,0 +1,249 @@ +import os +import shlex +import shutil +import subprocess +import time +import uuid +from pathlib import Path + +import fsspec +import pytest +from fsspec.implementations.local import LocalFileSystem, make_path_posix +from fsspec.registry import _registry, register_implementation +from fsspec.utils import stringify_path +from upath import UPath +from upath.implementations.cloud import S3Path + +from spatialdata import SpatialData +from spatialdata.testing import assert_spatial_data_objects_are_identical + + +## Mock setup from https://github.com/fsspec/universal_pathlib/blob/main/upath/tests/conftest.py +def posixify(path): + return str(path).replace("\\", "/") + + +class DummyTestFS(LocalFileSystem): + protocol = "mock" + root_marker = "/" + + @classmethod + def _strip_protocol(cls, path): + path = stringify_path(path) + if path.startswith("mock://"): + path = path[7:] + elif path.startswith("mock:"): + path = path[5:] + return make_path_posix(path).rstrip("/") or cls.root_marker + + +@pytest.fixture(scope="session") +def clear_registry(): + register_implementation("mock", DummyTestFS) + try: + yield + finally: + _registry.clear() + + +@pytest.fixture(scope="session") +def s3_server(): + # writable local S3 system + if "BOTO_CONFIG" not in os.environ: # pragma: no cover + 
os.environ["BOTO_CONFIG"] = "/dev/null" + if "AWS_ACCESS_KEY_ID" not in os.environ: # pragma: no cover + os.environ["AWS_ACCESS_KEY_ID"] = "testing" + if "AWS_SECRET_ACCESS_KEY" not in os.environ: # pragma: no cover + os.environ["AWS_SECRET_ACCESS_KEY"] = "testing" + if "AWS_SECURITY_TOKEN" not in os.environ: # pragma: no cover + os.environ["AWS_SECURITY_TOKEN"] = "testing" + if "AWS_SESSION_TOKEN" not in os.environ: # pragma: no cover + os.environ["AWS_SESSION_TOKEN"] = "testing" + if "AWS_DEFAULT_REGION" not in os.environ: # pragma: no cover + os.environ["AWS_DEFAULT_REGION"] = "us-east-1" + requests = pytest.importorskip("requests") + + pytest.importorskip("moto") + + port = 5555 + endpoint_uri = f"http://127.0.0.1:{port}/" + proc = subprocess.Popen( + shlex.split(f"moto_server -p {port}"), + stderr=subprocess.DEVNULL, + stdout=subprocess.DEVNULL, + ) + try: + timeout = 5 + while timeout > 0: + try: + r = requests.get(endpoint_uri, timeout=10) + if r.ok: + break + except requests.exceptions.RequestException: # pragma: no cover + pass + timeout -= 0.1 # pragma: no cover + time.sleep(0.1) # pragma: no cover + anon = False + s3so = { + "client_kwargs": {"endpoint_url": endpoint_uri}, + "use_listings_cache": True, + } + yield anon, s3so + finally: + proc.terminate() + proc.wait() + + +@pytest.fixture(scope="function") +def s3_fixture(s3_server): + pytest.importorskip("s3fs") + anon, s3so = s3_server + s3 = fsspec.filesystem("s3", anon=False, **s3so) + random_name = uuid.uuid4().hex + bucket_name = f"test_{random_name}" + if s3.exists(bucket_name): + for dir, _, keys in s3.walk(bucket_name): + for key in keys: + s3.rm(f"{dir}/{key}") + else: + s3.mkdir(bucket_name) + # for x in Path(local_testdir).glob("**/*"): + # target_path = f"{bucket_name}/{posixify(x.relative_to(local_testdir))}" + # if x.is_file(): + # s3.upload(str(x), target_path) + s3.invalidate_cache() + yield f"s3://{bucket_name}", anon, s3so + + +@pytest.fixture(scope="session") +def http_server(tmp_path_factory): + http_tempdir = tmp_path_factory.mktemp("http") + + requests = pytest.importorskip("requests") + pytest.importorskip("http.server") + proc = subprocess.Popen(shlex.split(f"python -m http.server --directory {http_tempdir} 8080")) + try: + url = "http://127.0.0.1:8080/folder" + path = Path(http_tempdir) / "folder" + path.mkdir() + timeout = 10 + while True: + try: + r = requests.get(url, timeout=10) + if r.ok: + yield path, url + break + except requests.exceptions.RequestException as e: # noqa: E722 + timeout -= 1 + if timeout < 0: + raise SystemError from e + time.sleep(1) + finally: + proc.terminate() + proc.wait() + + +@pytest.fixture +def http_fixture(local_testdir, http_server): + http_path, http_url = http_server + shutil.rmtree(http_path) + shutil.copytree(local_testdir, http_path) + yield http_url + + +class TestRemoteMock: + @pytest.fixture(scope="function") + def upath(self, s3_fixture): + path, anon, s3so = s3_fixture + return UPath(path, anon=anon, **s3so) + + def test_is_S3Path(self, upath): + assert isinstance(upath, S3Path) + + # # Test UPath with Moto Mocking + def test_creating_file(self, upath): + file_name = "file1" + p1 = upath / file_name + p1.touch() + contents = [p.name for p in upath.iterdir()] + assert file_name in contents + + # TODO: fix this test + @pytest.mark.xfail(reason="Fails because remote support for ImageElement not yet implemented") + def test_images(self, upath: UPath, images: SpatialData) -> None: + tmpdir = upath / "tmp.zarr" + images.write(tmpdir) + sdata = 
SpatialData.read(tmpdir) + assert_spatial_data_objects_are_identical(images, sdata) + + # TODO: fix this test + @pytest.mark.xfail(reason="Fails because remote support for LabelsElement not yet implemented") + def test_labels(self, upath: UPath, labels: SpatialData) -> None: + tmpdir = upath / "tmp.zarr" + labels.write(tmpdir) + sdata = SpatialData.read(tmpdir) + assert_spatial_data_objects_are_identical(labels, sdata) + + # TODO: fix this test + @pytest.mark.xfail(reason="Fails because remote support for ShapesElement not yet implemented") + def test_shapes(self, upath: UPath, shapes: SpatialData) -> None: + import numpy as np + + tmpdir = upath / "tmp.zarr" + + # check the index is correctly written and then read + shapes["circles"].index = np.arange(1, len(shapes["circles"]) + 1) + + shapes.write(tmpdir) + sdata = SpatialData.read(tmpdir) + assert_spatial_data_objects_are_identical(shapes, sdata) + + # TODO: fix this test + @pytest.mark.xfail(reason="Fails because remote support for PointsElement not yet implemented") + def test_points(self, upath: UPath, points: SpatialData) -> None: + import dask.dataframe as dd + import numpy as np + + tmpdir = upath / "tmp.zarr" + + # check the index is correctly written and then read + new_index = dd.from_array(np.arange(1, len(points["points_0"]) + 1)) + points["points_0"] = points["points_0"].set_index(new_index) + + points.write(tmpdir) + sdata = SpatialData.read(tmpdir) + assert_spatial_data_objects_are_identical(points, sdata) + + def _test_table(self, upath: UPath, table: SpatialData) -> None: + tmpdir = upath / "tmp.zarr" + table.write(tmpdir) + sdata = SpatialData.read(tmpdir) + assert_spatial_data_objects_are_identical(table, sdata) + + def _test_read_elem(self, upath: UPath, table: SpatialData) -> None: + tmpdir = upath / "tmp.zarr" + store = zarr.open() + table.write(tmpdir) + # location of table + + + sdata = SpatialData.read(tmpdir) + assert_spatial_data_objects_are_identical(elem, sdata) + + # TODO: fix this test + @pytest.mark.xfail(reason="Fails because remote support for TableElement not yet implemented") + def test_single_table_single_annotation(self, upath: UPath, table_single_annotation: SpatialData) -> None: + self._test_table(upath, table_single_annotation) + + # TODO: fix this test + @pytest.mark.xfail(reason="Fails because remote support for TableElement not yet implemented") + def test_single_table_multiple_annotations(self, upath: UPath, table_multiple_annotations: SpatialData) -> None: + self._test_table(upath, table_multiple_annotations) + + # TODO: fix this test + @pytest.mark.xfail(reason="Fails because remote support for SpatialData not yet implemented") + def test_full_sdata(self, upath: UPath, full_sdata: SpatialData) -> None: + tmpdir = upath / "tmp.zarr" + full_sdata.write(tmpdir) + sdata = SpatialData.read(tmpdir) + assert_spatial_data_objects_are_identical(full_sdata, sdata) From a8be620c28cb6fdb4458035e3e48a5a7465be1e5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 24 Jan 2025 09:43:46 +0000 Subject: [PATCH 02/62] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/io/test_remote_mock.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/io/test_remote_mock.py b/tests/io/test_remote_mock.py index a7a304c55..1966e631a 100644 --- a/tests/io/test_remote_mock.py +++ b/tests/io/test_remote_mock.py @@ -226,7 +226,6 @@ def _test_read_elem(self, upath: UPath, table: SpatialData) -> None: 
table.write(tmpdir) # location of table - sdata = SpatialData.read(tmpdir) assert_spatial_data_objects_are_identical(elem, sdata) From ce686e45b5f9e87ca468739ee22f8430a7844afa Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Fri, 31 Jan 2025 10:48:26 +0100 Subject: [PATCH 03/62] fix pre-commit --- tests/io/test_remote_mock.py | 37 +++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/tests/io/test_remote_mock.py b/tests/io/test_remote_mock.py index 1966e631a..7eb76307f 100644 --- a/tests/io/test_remote_mock.py +++ b/tests/io/test_remote_mock.py @@ -1,6 +1,5 @@ import os import shlex -import shutil import subprocess import time import uuid @@ -17,10 +16,11 @@ from spatialdata import SpatialData from spatialdata.testing import assert_spatial_data_objects_are_identical - ## Mock setup from https://github.com/fsspec/universal_pathlib/blob/main/upath/tests/conftest.py -def posixify(path): - return str(path).replace("\\", "/") +# TODO: check if the tests work on Windows. If yes, we can remove this commented function and the already commented +# place where it is called +# def posixify(path): +# return str(path).replace("\\", "/") class DummyTestFS(LocalFileSystem): @@ -143,12 +143,13 @@ def http_server(tmp_path_factory): proc.wait() -@pytest.fixture -def http_fixture(local_testdir, http_server): - http_path, http_url = http_server - shutil.rmtree(http_path) - shutil.copytree(local_testdir, http_path) - yield http_url +# TODO: delete this fixture if it is not used +# @pytest.fixture +# def http_fixture(local_testdir, http_server): +# http_path, http_url = http_server +# shutil.rmtree(http_path) +# shutil.copytree(local_testdir, http_path) +# yield http_url class TestRemoteMock: @@ -221,13 +222,15 @@ def _test_table(self, upath: UPath, table: SpatialData) -> None: assert_spatial_data_objects_are_identical(table, sdata) def _test_read_elem(self, upath: UPath, table: SpatialData) -> None: - tmpdir = upath / "tmp.zarr" - store = zarr.open() - table.write(tmpdir) - # location of table - - sdata = SpatialData.read(tmpdir) - assert_spatial_data_objects_are_identical(elem, sdata) + pass + # TODO: fix + # tmpdir = upath / "tmp.zarr" + # store = zarr.open() + # table.write(tmpdir) + # # location of table + # + # sdata = SpatialData.read(tmpdir) + # assert_spatial_data_objects_are_identical(elem, sdata) # TODO: fix this test @pytest.mark.xfail(reason="Fails because remote support for TableElement not yet implemented") From e7fa0206a0245d566929ef1d0f3128f6d1d02509 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Fri, 31 Jan 2025 10:52:20 +0100 Subject: [PATCH 04/62] Revert "Update pyproject.toml" This reverts commit 1f8a01cf97dee96a3fa957234b2f5e8a0732d7e0. 
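An aside before continuing, since the interplay of the `s3_server` and `s3_fixture` fixtures from the first patch is easy to lose in the diff noise: together they boot a local `moto_server`, point fsspec's S3 filesystem at its endpoint, and hand each test a fresh bucket. Below is a minimal sketch of that round trip outside pytest; the port, bucket name, and fixed sleep (a stand-in for the fixture's polling loop) are illustrative choices, not values taken from the patches.

# illustrative sketch of the moto-backed S3 setup; not part of the patches
import os
import subprocess
import time

import fsspec
import zarr

# dummy credentials, as in the s3_server fixture; moto accepts any value
os.environ.setdefault("AWS_ACCESS_KEY_ID", "testing")
os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "testing")

proc = subprocess.Popen(["moto_server", "-p", "5555"])
time.sleep(2)  # the fixture instead polls the endpoint until it responds
try:
    s3so = {"client_kwargs": {"endpoint_url": "http://127.0.0.1:5555/"}}
    s3 = fsspec.filesystem("s3", anon=False, **s3so)
    s3.mkdir("test-bucket")
    # anything written through this filesystem lands in the in-memory bucket,
    # so a zarr group created here behaves like the remote stores in the tests
    root = zarr.group(store=s3.get_mapper("test-bucket/example.zarr"))
    root.attrs["note"] = "written to mock S3"
finally:
    proc.terminate()
    proc.wait()
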
--- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e01ffb562..c95c25717 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,7 @@ maintainers = [ urls.Documentation = "https://spatialdata.scverse.org/en/latest" urls.Source = "https://github.com/scverse/spatialdata.git" urls.Home-page = "https://github.com/scverse/spatialdata.git" -requires-python = ">=3.10" +requires-python = ">=3.10, <3.13" # include 3.13 once multiscale-spatial-image conflicts are resolved dynamic= [ "version" # allow version to be set by git tags ] From a9c08011c3db6a8611f629ff28211fd7aaed238d Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Fri, 31 Jan 2025 10:54:10 +0100 Subject: [PATCH 05/62] removed 3.13 from test ci --- .github/workflows/test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index fa5f44e10..1d7a79b9d 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -18,7 +18,7 @@ jobs: strategy: fail-fast: false matrix: - python: ["3.10", "3.12", "3.13"] + python: ["3.10", "3.12"] os: [ubuntu-latest] include: - os: macos-latest From 4fe6a474394faba2ed9b9ec63734b51a44478827 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Fri, 31 Jan 2025 11:20:25 +0100 Subject: [PATCH 06/62] fix --- tests/io/test_remote_mock.py | 69 +++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 28 deletions(-) diff --git a/tests/io/test_remote_mock.py b/tests/io/test_remote_mock.py index 7eb76307f..5ddf70500 100644 --- a/tests/io/test_remote_mock.py +++ b/tests/io/test_remote_mock.py @@ -3,7 +3,6 @@ import subprocess import time import uuid -from pathlib import Path import fsspec import pytest @@ -115,35 +114,35 @@ def s3_fixture(s3_server): yield f"s3://{bucket_name}", anon, s3so -@pytest.fixture(scope="session") -def http_server(tmp_path_factory): - http_tempdir = tmp_path_factory.mktemp("http") - - requests = pytest.importorskip("requests") - pytest.importorskip("http.server") - proc = subprocess.Popen(shlex.split(f"python -m http.server --directory {http_tempdir} 8080")) - try: - url = "http://127.0.0.1:8080/folder" - path = Path(http_tempdir) / "folder" - path.mkdir() - timeout = 10 - while True: - try: - r = requests.get(url, timeout=10) - if r.ok: - yield path, url - break - except requests.exceptions.RequestException as e: # noqa: E722 - timeout -= 1 - if timeout < 0: - raise SystemError from e - time.sleep(1) - finally: - proc.terminate() - proc.wait() +# TODO: delete these 2 fixtures since they are not used +# @pytest.fixture(scope="session") +# def http_server(tmp_path_factory): +# http_tempdir = tmp_path_factory.mktemp("http") +# +# requests = pytest.importorskip("requests") +# pytest.importorskip("http.server") +# proc = subprocess.Popen(shlex.split(f"python -m http.server --directory {http_tempdir} 8080")) +# try: +# url = "http://127.0.0.1:8080/folder" +# path = Path(http_tempdir) / "folder" +# path.mkdir() +# timeout = 10 +# while True: +# try: +# r = requests.get(url, timeout=10) +# if r.ok: +# yield path, url +# break +# except requests.exceptions.RequestException as e: # noqa: E722 +# timeout -= 1 +# if timeout < 0: +# raise SystemError from e +# time.sleep(1) +# finally: +# proc.terminate() +# proc.wait() -# TODO: delete this fixture if it is not used # @pytest.fixture # def http_fixture(local_testdir, http_server): # http_path, http_url = http_server @@ -249,3 +248,17 @@ def test_full_sdata(self, upath: UPath, 
full_sdata: SpatialData) -> None: full_sdata.write(tmpdir) sdata = SpatialData.read(tmpdir) assert_spatial_data_objects_are_identical(full_sdata, sdata) + + def test_local_sdata_remote_image(self, upath: UPath, images: SpatialData) -> None: + pass + tmpdir = upath / "tmp.zarr" + images.write(tmpdir) + # with tempfile.TemporaryDirectory() as local_tmpdir: + # pass + # # images.write(local_tmpdir) + # # local_sdata = SpatialData.read(local_tmpdir) + # + # pass + # + # # sdata = SpatialData.read(tmpdir) + # # assert_spatial_data_objects_are_identical(local_sdata, sdata) From d97a1d2943b960c70b86e15e65b80132dd962b6a Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Fri, 31 Jan 2025 12:27:46 +0100 Subject: [PATCH 07/62] uploading sdata to local s3 storage --- tests/io/test_remote_mock.py | 54 +++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/tests/io/test_remote_mock.py b/tests/io/test_remote_mock.py index 5ddf70500..30b3a5feb 100644 --- a/tests/io/test_remote_mock.py +++ b/tests/io/test_remote_mock.py @@ -1,8 +1,10 @@ import os import shlex import subprocess +import tempfile import time import uuid +from pathlib import Path import fsspec import pytest @@ -15,11 +17,10 @@ from spatialdata import SpatialData from spatialdata.testing import assert_spatial_data_objects_are_identical + ## Mock setup from https://github.com/fsspec/universal_pathlib/blob/main/upath/tests/conftest.py -# TODO: check if the tests work on Windows. If yes, we can remove this commented function and the already commented -# place where it is called -# def posixify(path): -# return str(path).replace("\\", "/") +def posixify(path): + return str(path).replace("\\", "/") class DummyTestFS(LocalFileSystem): @@ -94,7 +95,7 @@ def s3_server(): @pytest.fixture(scope="function") -def s3_fixture(s3_server): +def s3_fixture(s3_server, full_sdata): pytest.importorskip("s3fs") anon, s3so = s3_server s3 = fsspec.filesystem("s3", anon=False, **s3so) @@ -106,10 +107,16 @@ def s3_fixture(s3_server): s3.rm(f"{dir}/{key}") else: s3.mkdir(bucket_name) - # for x in Path(local_testdir).glob("**/*"): - # target_path = f"{bucket_name}/{posixify(x.relative_to(local_testdir))}" - # if x.is_file(): - # s3.upload(str(x), target_path) + + # write a full spatialdata object to the bucket + with tempfile.TemporaryDirectory() as tempdir: + sdata_path = Path(tempdir) / "full_sdata.zarr" + full_sdata.write(sdata_path) + for x in sdata_path.glob("**/*"): + target_path = f"{bucket_name}/full_sdata.zarr/{posixify(x.relative_to(sdata_path))}" + if x.is_file(): + s3.upload(str(x), target_path) + s3.invalidate_cache() yield f"s3://{bucket_name}", anon, s3so @@ -161,7 +168,7 @@ def test_is_S3Path(self, upath): assert isinstance(upath, S3Path) # # Test UPath with Moto Mocking - def test_creating_file(self, upath): + def test_creating_file(self, upath: UPath) -> None: file_name = "file1" p1 = upath / file_name p1.touch() @@ -250,15 +257,18 @@ def test_full_sdata(self, upath: UPath, full_sdata: SpatialData) -> None: assert_spatial_data_objects_are_identical(full_sdata, sdata) def test_local_sdata_remote_image(self, upath: UPath, images: SpatialData) -> None: - pass - tmpdir = upath / "tmp.zarr" - images.write(tmpdir) - # with tempfile.TemporaryDirectory() as local_tmpdir: - # pass - # # images.write(local_tmpdir) - # # local_sdata = SpatialData.read(local_tmpdir) - # - # pass - # - # # sdata = SpatialData.read(tmpdir) - # # assert_spatial_data_objects_are_identical(local_sdata, sdata) + with 
tempfile.TemporaryDirectory() as tmpdir: + sdata_path = Path(tmpdir) / "full_sdata.zarr" + images.write(sdata_path) + local_sdata = SpatialData.read(sdata_path) # noqa: F841 + remote_path = upath / "full_sdata.zarr" # noqa: F841 + + # TODO: read a single remote image from the S3 data and add it to the local SpatialData object + # for a in remote_path.glob('**/*'): + # print(a) + # + # import zarr.storage + # from zarr.storage import FSStore + # store = zarr.storage.FSStore(fs=upath.fs, url=remote_path) + # list(store.keys()) + # store.close() From 5e26b5e5467c1014120d46ba0e8d0897c4248ef3 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Fri, 31 Jan 2025 14:28:04 +0100 Subject: [PATCH 08/62] add _open_zarr_store --- src/spatialdata/_core/spatialdata.py | 29 ++++++++++++---------------- src/spatialdata/_io/_utils.py | 17 ++++++++++++---- tests/io/test_remote_mock.py | 3 +++ 3 files changed, 28 insertions(+), 21 deletions(-) diff --git a/src/spatialdata/_core/spatialdata.py b/src/spatialdata/_core/spatialdata.py index c4cce2142..bd840847c 100644 --- a/src/spatialdata/_core/spatialdata.py +++ b/src/spatialdata/_core/spatialdata.py @@ -31,10 +31,7 @@ ) from spatialdata._logging import logger from spatialdata._types import ArrayLike, Raster_T -from spatialdata._utils import ( - _deprecation_alias, - _error_message_add_element, -) +from spatialdata._utils import _deprecation_alias, _error_message_add_element from spatialdata.models import ( Image2DModel, Image3DModel, @@ -600,7 +597,7 @@ def path(self, value: Path | None) -> None: ) def _get_groups_for_element( - self, zarr_path: Path, element_type: str, element_name: str + self, zarr_path: UPath, element_type: str, element_name: str ) -> tuple[zarr.Group, zarr.Group, zarr.Group]: """ Get the Zarr groups for the root, element_type and element for a specific element. @@ -620,9 +617,9 @@ def _get_groups_for_element( ------- either the existing Zarr subgroup or a new one. """ - if not isinstance(zarr_path, Path): - raise ValueError("zarr_path should be a Path object") - store = parse_url(zarr_path, mode="r+").store + from spatialdata._io._utils import _open_zarr_store + + store = _open_zarr_store(zarr_path, mode="r+") root = zarr.group(store=store) if element_type not in ["images", "labels", "points", "polygons", "shapes", "tables"]: raise ValueError(f"Unknown element type {element_type}") @@ -1375,7 +1372,7 @@ def delete_element_from_disk(self, element_name: str | list[str]) -> None: self.delete_element_from_disk(name) return - from spatialdata._io._utils import _backed_elements_contained_in_path + from spatialdata._io._utils import _backed_elements_contained_in_path, _open_zarr_store if self.path is None: raise ValueError("The SpatialData object is not backed by a Zarr store.") @@ -1416,7 +1413,7 @@ def delete_element_from_disk(self, element_name: str | list[str]) -> None: ) # delete the element - store = parse_url(self.path, mode="r+").store + store = _open_zarr_store(self.path) root = zarr.group(store=store) root[element_type].pop(element_name) store.close() @@ -1437,7 +1434,9 @@ def _check_element_not_on_disk_with_different_type(self, element_type: str, elem ) def write_consolidated_metadata(self) -> None: - store = parse_url(self.path, mode="r+").store + from spatialdata._io._utils import _open_zarr_store + + store = _open_zarr_store(self.path) # consolidate metadata to more easily support remote reading bug in zarr. 
In reality, 'zmetadata' is written # instead of '.zmetadata' see discussion https://github.com/zarr-developers/zarr-python/issues/1121 zarr.consolidate_metadata(store, metadata_key=".zmetadata") @@ -1574,15 +1573,11 @@ def write_transformations(self, element_name: str | None = None) -> None: ) axes = get_axes_names(element) if isinstance(element, DataArray | DataTree): - from spatialdata._io._utils import ( - overwrite_coordinate_transformations_raster, - ) + from spatialdata._io._utils import overwrite_coordinate_transformations_raster overwrite_coordinate_transformations_raster(group=element_group, axes=axes, transformations=transformations) elif isinstance(element, DaskDataFrame | GeoDataFrame | AnnData): - from spatialdata._io._utils import ( - overwrite_coordinate_transformations_non_raster, - ) + from spatialdata._io._utils import overwrite_coordinate_transformations_non_raster overwrite_coordinate_transformations_non_raster( group=element_group, axes=axes, transformations=transformations diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index 719472612..dbe2573c9 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -17,7 +17,10 @@ from dask.array import Array as DaskArray from dask.dataframe import DataFrame as DaskDataFrame from geopandas import GeoDataFrame +from upath import UPath +from upath.implementations.local import PosixUPath, WindowsUPath from xarray import DataArray, DataTree +from zarr.storage import FSStore from spatialdata._core.spatialdata import SpatialData from spatialdata._utils import get_pyramid_levels @@ -28,10 +31,7 @@ _validate_mapping_to_coordinate_system_type, ) from spatialdata.transformations.ngff.ngff_transformations import NgffBaseTransformation -from spatialdata.transformations.transformations import ( - BaseTransformation, - _get_current_output_axes, -) +from spatialdata.transformations.transformations import BaseTransformation, _get_current_output_axes # suppress logger debug from ome_zarr with context manager @@ -383,3 +383,12 @@ def save_transformations(sdata: SpatialData) -> None: stacklevel=2, ) sdata.write_transformations() + + +def _open_zarr_store(path: str | UPath, **kwargs) -> zarr.storage.BaseStore: + if isinstance(path, str | Path): + path = UPath(path) + if isinstance(path, PosixUPath | WindowsUPath): + return zarr.storage.DirectoryStore(path.path) + else: + return FSStore(path.path, fs=path.fs, **kwargs) diff --git a/tests/io/test_remote_mock.py b/tests/io/test_remote_mock.py index 30b3a5feb..208ec5cfc 100644 --- a/tests/io/test_remote_mock.py +++ b/tests/io/test_remote_mock.py @@ -263,6 +263,9 @@ def test_local_sdata_remote_image(self, upath: UPath, images: SpatialData) -> No local_sdata = SpatialData.read(sdata_path) # noqa: F841 remote_path = upath / "full_sdata.zarr" # noqa: F841 + remote_sdata = SpatialData.read(remote_path) + assert_spatial_data_objects_are_identical(local_sdata, remote_sdata) + # TODO: read a single remote image from the S3 data and add it to the local SpatialData object # for a in remote_path.glob('**/*'): # print(a) From 57948712b77cc795b275c91a89eb858c2920fe01 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Fri, 31 Jan 2025 15:11:00 +0100 Subject: [PATCH 09/62] revert changing write function signature Co-authored-by: LucaMarconato --- src/spatialdata/_core/spatialdata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/spatialdata/_core/spatialdata.py b/src/spatialdata/_core/spatialdata.py index bd840847c..7ad0dd9ad 100644 --- 
a/src/spatialdata/_core/spatialdata.py +++ b/src/spatialdata/_core/spatialdata.py @@ -597,7 +597,7 @@ def path(self, value: Path | None) -> None: ) def _get_groups_for_element( - self, zarr_path: UPath, element_type: str, element_name: str + self, zarr_path: Path, element_type: str, element_name: str ) -> tuple[zarr.Group, zarr.Group, zarr.Group]: """ Get the Zarr groups for the root, element_type and element for a specific element. From 0207ff722373ceffb4d920cbd23576477f96841b Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Fri, 31 Jan 2025 15:16:16 +0100 Subject: [PATCH 10/62] update _open_zarr_store with StoreLike --- src/spatialdata/_io/_utils.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index dbe2573c9..f7e6f0466 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -12,7 +12,7 @@ from pathlib import Path from typing import Any -import zarr +import zarr.storage from anndata import AnnData from dask.array import Array as DaskArray from dask.dataframe import DataFrame as DaskDataFrame @@ -385,10 +385,20 @@ def save_transformations(sdata: SpatialData) -> None: sdata.write_transformations() -def _open_zarr_store(path: str | UPath, **kwargs) -> zarr.storage.BaseStore: +type StoreLike = str | Path | UPath | zarr.storage.StoreLike + + +def _open_zarr_store(path: StoreLike, **kwargs) -> zarr.storage.BaseStore: if isinstance(path, str | Path): + # if the input is str or Path, map it to UPath path = UPath(path) if isinstance(path, PosixUPath | WindowsUPath): + # if the input is a local path, use DirectoryStore return zarr.storage.DirectoryStore(path.path) - else: + if isinstance(path, zarr.storage.StoreLike): + # if the input already a store, wrap it in an FSStore + return FSStore(path, **kwargs) + if isinstance(path, UPath): + # if input is a remote UPath, map it to an FSStore return FSStore(path.path, fs=path.fs, **kwargs) + raise TypeError(f"Unsupported type: {type(path)}") From 7e497ff04fa7d74bcaf7becb24df6f787b7078db Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Fri, 31 Jan 2025 15:25:17 +0100 Subject: [PATCH 11/62] read image element from base store --- src/spatialdata/_io/io_raster.py | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/src/spatialdata/_io/io_raster.py b/src/spatialdata/_io/io_raster.py index 541be3ead..a7b06e26c 100644 --- a/src/spatialdata/_io/io_raster.py +++ b/src/spatialdata/_io/io_raster.py @@ -1,9 +1,9 @@ -from pathlib import Path from typing import Any, Literal import dask.array as da import numpy as np import zarr +import zarr.storage from ome_zarr.format import Format from ome_zarr.io import ZarrLocation from ome_zarr.reader import Label, Multiscales, Node, Reader @@ -15,16 +15,8 @@ from ome_zarr.writer import write_multiscale_labels as write_multiscale_labels_ngff from xarray import DataArray, Dataset, DataTree -from spatialdata._io._utils import ( - _get_transformations_from_ngff_dict, - overwrite_coordinate_transformations_raster, -) -from spatialdata._io.format import ( - CurrentRasterFormat, - RasterFormats, - RasterFormatV01, - _parse_version, -) +from spatialdata._io._utils import _get_transformations_from_ngff_dict, overwrite_coordinate_transformations_raster +from spatialdata._io.format import CurrentRasterFormat, RasterFormats, RasterFormatV01, _parse_version from spatialdata._utils import get_pyramid_levels from spatialdata.models._utils import get_channel_names from 
spatialdata.models.models import ATTRS_KEY @@ -36,16 +28,16 @@ ) -def _read_multiscale(store: str | Path, raster_type: Literal["image", "labels"]) -> DataArray | DataTree: - assert isinstance(store, str | Path) +def _read_multiscale(store: zarr.storage.BaseStore, raster_type: Literal["image", "labels"]) -> DataArray | DataTree: + assert isinstance(store, zarr.storage.BaseStore) assert raster_type in ["image", "labels"] - f = zarr.open(store, mode="r") - version = _parse_version(f, expect_attrs_key=True) + group = zarr.group(store=store) + version = _parse_version(group, expect_attrs_key=True) # old spatialdata datasets don't have format metadata for raster elements; this line ensure backwards compatibility, # interpreting the lack of such information as the presence of the format v01 format = RasterFormatV01() if version is None else RasterFormats[version] - f.store.close() + store.close() nodes: list[Node] = [] image_loc = ZarrLocation(store) From c674281bdbbb206abaa3fd1e5fe728bff78b5a75 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Sat, 1 Feb 2025 09:45:06 +0100 Subject: [PATCH 12/62] clean up remote mock tests, focus only on reading raster elements --- tests/io/test_remote_mock.py | 265 +++++++++++------------------------ 1 file changed, 83 insertions(+), 182 deletions(-) diff --git a/tests/io/test_remote_mock.py b/tests/io/test_remote_mock.py index 208ec5cfc..266c34973 100644 --- a/tests/io/test_remote_mock.py +++ b/tests/io/test_remote_mock.py @@ -8,47 +8,18 @@ import fsspec import pytest -from fsspec.implementations.local import LocalFileSystem, make_path_posix -from fsspec.registry import _registry, register_implementation -from fsspec.utils import stringify_path from upath import UPath from upath.implementations.cloud import S3Path from spatialdata import SpatialData from spatialdata.testing import assert_spatial_data_objects_are_identical - -## Mock setup from https://github.com/fsspec/universal_pathlib/blob/main/upath/tests/conftest.py -def posixify(path): - return str(path).replace("\\", "/") - - -class DummyTestFS(LocalFileSystem): - protocol = "mock" - root_marker = "/" - - @classmethod - def _strip_protocol(cls, path): - path = stringify_path(path) - if path.startswith("mock://"): - path = path[7:] - elif path.startswith("mock:"): - path = path[5:] - return make_path_posix(path).rstrip("/") or cls.root_marker - - -@pytest.fixture(scope="session") -def clear_registry(): - register_implementation("mock", DummyTestFS) - try: - yield - finally: - _registry.clear() +# This mock setup was inspired by https://github.com/fsspec/universal_pathlib/blob/main/upath/tests/conftest.py @pytest.fixture(scope="session") def s3_server(): - # writable local S3 system + # create a writable local S3 system via moto if "BOTO_CONFIG" not in os.environ: # pragma: no cover os.environ["BOTO_CONFIG"] = "/dev/null" if "AWS_ACCESS_KEY_ID" not in os.environ: # pragma: no cover @@ -94,184 +65,114 @@ def s3_server(): proc.wait() -@pytest.fixture(scope="function") -def s3_fixture(s3_server, full_sdata): - pytest.importorskip("s3fs") +def clear_s3(s3_server, location=None): + # clear an s3 bucket of all contents anon, s3so = s3_server - s3 = fsspec.filesystem("s3", anon=False, **s3so) - random_name = uuid.uuid4().hex - bucket_name = f"test_{random_name}" - if s3.exists(bucket_name): - for dir, _, keys in s3.walk(bucket_name): + s3 = fsspec.filesystem("s3", anon=anon, **s3so) + if location and s3.exists(location): + for d, _, keys in s3.walk(location): for key in keys: - s3.rm(f"{dir}/{key}") - else: - 
s3.mkdir(bucket_name) + s3.rm(f"{d}/{key}") + s3.invalidate_cache() + - # write a full spatialdata object to the bucket +def upload_to_upath(upath, sdata): + # write the object to disk via a regular path, then copy it to the UPath byte-by-byte + # useful for testing the read and write functionality separately with tempfile.TemporaryDirectory() as tempdir: - sdata_path = Path(tempdir) / "full_sdata.zarr" - full_sdata.write(sdata_path) + sdata_path = Path(tempdir) / "temp.zarr" + sdata.write(sdata_path) + # for every file in the sdata_path, copy it to the upath for x in sdata_path.glob("**/*"): - target_path = f"{bucket_name}/full_sdata.zarr/{posixify(x.relative_to(sdata_path))}" if x.is_file(): - s3.upload(str(x), target_path) + data = x.read_bytes() + destination = upath / x.relative_to(sdata_path) + destination.write_bytes(data) + +@pytest.fixture(scope="function") +def s3_fixture(s3_server): + # make a mock bucket available for testing + pytest.importorskip("s3fs") + anon, s3so = s3_server + s3 = fsspec.filesystem("s3", anon=anon, **s3so) + random_name = uuid.uuid4().hex + bucket_name = f"test_{random_name}" + clear_s3(s3_server, bucket_name) + s3.mkdir(bucket_name) + # here you could write existing test files to s3.upload if needed s3.invalidate_cache() yield f"s3://{bucket_name}", anon, s3so -# TODO: delete these 2 fixtures since they are not used -# @pytest.fixture(scope="session") -# def http_server(tmp_path_factory): -# http_tempdir = tmp_path_factory.mktemp("http") -# -# requests = pytest.importorskip("requests") -# pytest.importorskip("http.server") -# proc = subprocess.Popen(shlex.split(f"python -m http.server --directory {http_tempdir} 8080")) -# try: -# url = "http://127.0.0.1:8080/folder" -# path = Path(http_tempdir) / "folder" -# path.mkdir() -# timeout = 10 -# while True: -# try: -# r = requests.get(url, timeout=10) -# if r.ok: -# yield path, url -# break -# except requests.exceptions.RequestException as e: # noqa: E722 -# timeout -= 1 -# if timeout < 0: -# raise SystemError from e -# time.sleep(1) -# finally: -# proc.terminate() -# proc.wait() - - -# @pytest.fixture -# def http_fixture(local_testdir, http_server): -# http_path, http_url = http_server -# shutil.rmtree(http_path) -# shutil.copytree(local_testdir, http_path) -# yield http_url - - class TestRemoteMock: @pytest.fixture(scope="function") def upath(self, s3_fixture): + # create a UPath object for the mock s3 bucket path, anon, s3so = s3_fixture return UPath(path, anon=anon, **s3so) def test_is_S3Path(self, upath): assert isinstance(upath, S3Path) - # # Test UPath with Moto Mocking - def test_creating_file(self, upath: UPath) -> None: + def test_upload_sdata(self, upath, full_sdata): + tmpdir = upath / "tmp.zarr" + upload_to_upath(tmpdir, full_sdata) + assert tmpdir.exists() + assert len(list(tmpdir.glob("*"))) == 8 + + def test_creating_file(self, upath) -> None: file_name = "file1" p1 = upath / file_name p1.touch() contents = [p.name for p in upath.iterdir()] assert file_name in contents - # TODO: fix this test - @pytest.mark.xfail(reason="Fails because remote support for ImageElement not yet implemented") - def test_images(self, upath: UPath, images: SpatialData) -> None: - tmpdir = upath / "tmp.zarr" - images.write(tmpdir) - sdata = SpatialData.read(tmpdir) - assert_spatial_data_objects_are_identical(images, sdata) - - # TODO: fix this test - @pytest.mark.xfail(reason="Fails because remote support for LabelsElement not yet implemented") - def test_labels(self, upath: UPath, labels: SpatialData) -> None: - 
tmpdir = upath / "tmp.zarr" - labels.write(tmpdir) - sdata = SpatialData.read(tmpdir) - assert_spatial_data_objects_are_identical(labels, sdata) - - # TODO: fix this test - @pytest.mark.xfail(reason="Fails because remote support for ShapesElement not yet implemented") - def test_shapes(self, upath: UPath, shapes: SpatialData) -> None: - import numpy as np - - tmpdir = upath / "tmp.zarr" - - # check the index is correctly written and then read - shapes["circles"].index = np.arange(1, len(shapes["circles"]) + 1) - - shapes.write(tmpdir) - sdata = SpatialData.read(tmpdir) - assert_spatial_data_objects_are_identical(shapes, sdata) - - # TODO: fix this test - @pytest.mark.xfail(reason="Fails because remote support for PointsElement not yet implemented") - def test_points(self, upath: UPath, points: SpatialData) -> None: - import dask.dataframe as dd - import numpy as np - - tmpdir = upath / "tmp.zarr" - - # check the index is correctly written and then read - new_index = dd.from_array(np.arange(1, len(points["points_0"]) + 1)) - points["points_0"] = points["points_0"].set_index(new_index) - - points.write(tmpdir) - sdata = SpatialData.read(tmpdir) - assert_spatial_data_objects_are_identical(points, sdata) - - def _test_table(self, upath: UPath, table: SpatialData) -> None: - tmpdir = upath / "tmp.zarr" - table.write(tmpdir) - sdata = SpatialData.read(tmpdir) - assert_spatial_data_objects_are_identical(table, sdata) - - def _test_read_elem(self, upath: UPath, table: SpatialData) -> None: - pass - # TODO: fix - # tmpdir = upath / "tmp.zarr" - # store = zarr.open() - # table.write(tmpdir) - # # location of table - # - # sdata = SpatialData.read(tmpdir) - # assert_spatial_data_objects_are_identical(elem, sdata) - - # TODO: fix this test - @pytest.mark.xfail(reason="Fails because remote support for TableElement not yet implemented") - def test_single_table_single_annotation(self, upath: UPath, table_single_annotation: SpatialData) -> None: - self._test_table(upath, table_single_annotation) - - # TODO: fix this test - @pytest.mark.xfail(reason="Fails because remote support for TableElement not yet implemented") - def test_single_table_multiple_annotations(self, upath: UPath, table_multiple_annotations: SpatialData) -> None: - self._test_table(upath, table_multiple_annotations) - - # TODO: fix this test - @pytest.mark.xfail(reason="Fails because remote support for SpatialData not yet implemented") - def test_full_sdata(self, upath: UPath, full_sdata: SpatialData) -> None: - tmpdir = upath / "tmp.zarr" - full_sdata.write(tmpdir) - sdata = SpatialData.read(tmpdir) - assert_spatial_data_objects_are_identical(full_sdata, sdata) - - def test_local_sdata_remote_image(self, upath: UPath, images: SpatialData) -> None: + @pytest.mark.parametrize( + "sdata_type", + [ + "images", + "labels", + # TODO: fix remote reading support points + # "points", + # TODO: fix remote reading support shapes + # "shapes", + ], + ) + def test_reading_mocked_elements(self, upath: UPath, sdata_type: str, request) -> None: + sdata = request.getfixturevalue(sdata_type) with tempfile.TemporaryDirectory() as tmpdir: - sdata_path = Path(tmpdir) / "full_sdata.zarr" - images.write(sdata_path) - local_sdata = SpatialData.read(sdata_path) # noqa: F841 - remote_path = upath / "full_sdata.zarr" # noqa: F841 - + local_path = Path(tmpdir) / "tmp.zarr" + sdata.write(local_path) + local_sdata = SpatialData.read(local_path) + local_len = len(list(local_sdata.gen_elements())) + assert local_len > 0 + remote_path = upath / "tmp.zarr" + 
upload_to_upath(remote_path, sdata) remote_sdata = SpatialData.read(remote_path) + assert len(list(remote_sdata.gen_elements())) == local_len assert_spatial_data_objects_are_identical(local_sdata, remote_sdata) - # TODO: read a single remote image from the S3 data and add it to the local SpatialData object - # for a in remote_path.glob('**/*'): - # print(a) - # - # import zarr.storage - # from zarr.storage import FSStore - # store = zarr.storage.FSStore(fs=upath.fs, url=remote_path) - # list(store.keys()) - # store.close() + @pytest.mark.parametrize( + "sdata_type", + [ + # TODO: fix remote writing support images + # "images", + # TODO: fix remote writing support labels + # "labels", + # TODO: fix remote writing support points + # "points", + # TODO: fix remote writing support shapes + # "shapes", + ], + ) + def test_writing_mocked_elements(self, upath: UPath, sdata_type: str, request) -> None: + sdata = request.getfixturevalue(sdata_type) + n_elements = len(list(sdata.gen_elements())) + # test writing to a remote path + remote_path = upath / "tmp.zarr" + sdata.write(remote_path) + assert len(upath.glob("*")) == n_elements + # test reading the remotely written object + remote_sdata = SpatialData.read(remote_path) + assert_spatial_data_objects_are_identical(sdata, remote_sdata) From fb953a0908b863f0f7bfa5e89985a85760ab3b95 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Sat, 1 Feb 2025 09:45:34 +0100 Subject: [PATCH 13/62] improve remote http test, add alternative --- tests/io/test_remote.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/io/test_remote.py b/tests/io/test_remote.py index e5ee41eff..dd47d464f 100644 --- a/tests/io/test_remote.py +++ b/tests/io/test_remote.py @@ -7,17 +7,19 @@ class TestRemote: # Test actual remote datasets from https://spatialdata.scverse.org/en/latest/tutorials/notebooks/datasets/README.html # These tests are disabled by default because they require internet access - @pytest.fixture(params=["merfish", "mibitof"]) + @pytest.fixture(params=["merfish", "mibitof", "mibitof_alt"]) def s3_address(self, request): urls = { "merfish": "https://s3.embl.de/spatialdata/spatialdata-sandbox/merfish.zarr/", "mibitof": "https://s3.embl.de/spatialdata/spatialdata-sandbox/mibitof.zarr/", + "mibitof_alt": "https://dl01.irc.ugent.be/spatial/mibitof/data.zarr/", } return urls[request.param] - # TODO: does not work, problem with opening remote parquet - @pytest.mark.xfail(reason="Problem with opening remote parquet") + # TODO: does not work, problem with opening version 0.6-dev + @pytest.mark.skip(reason="Problem with ome_zarr on test datasets: ValueError: Version 0.6-dev not recognized") def test_remote(self, s3_address): root = zarr.open_consolidated(s3_address, mode="r", metadata_key="zmetadata") - sdata = SpatialData.read(root) + # TODO: remove selection once support for points, shapes and tables is added + sdata = SpatialData.read(root, selection=("images", "labels")) assert len(list(sdata.gen_elements())) > 0 From 52bb5fcb41b83f8d38272b9fcc1095181ee60c63 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Sat, 1 Feb 2025 09:46:07 +0100 Subject: [PATCH 14/62] add support for consolidated metadata store in util function, add _create_upath function --- src/spatialdata/_io/_utils.py | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index f7e6f0466..4faf085a2 100644 --- a/src/spatialdata/_io/_utils.py +++ 
b/src/spatialdata/_io/_utils.py @@ -10,7 +10,7 @@ from contextlib import contextmanager from functools import singledispatch from pathlib import Path -from typing import Any +from typing import Any, TypeAlias import zarr.storage from anndata import AnnData @@ -385,16 +385,30 @@ def save_transformations(sdata: SpatialData) -> None: sdata.write_transformations() -type StoreLike = str | Path | UPath | zarr.storage.StoreLike +StoreLike: TypeAlias = str | Path | UPath | zarr.storage.StoreLike | zarr.Group def _open_zarr_store(path: StoreLike, **kwargs) -> zarr.storage.BaseStore: + # TODO: ensure kwargs like mode are enforced everywhere and passed correctly to the store if isinstance(path, str | Path): # if the input is str or Path, map it to UPath path = UPath(path) if isinstance(path, PosixUPath | WindowsUPath): # if the input is a local path, use DirectoryStore return zarr.storage.DirectoryStore(path.path) + if isinstance(path, zarr.Group): + # if the input is a zarr.Group, wrap it with a store + if isinstance(path.store, zarr.storage.DirectoryStore): + # create a simple FSStore if the store is a DirectoryStore with just the path + return FSStore(os.path.join(path.store.path, path.path), **kwargs) + if isinstance(path.store, FSStore): + # if the store within the zarr.Group is an FSStore, return it + # but extend the path of the store with that of the zarr.Group + return FSStore(path.store.path + "/" + path.path, fs=path.store.fs, **kwargs) + if isinstance(path.store, zarr.storage.ConsolidatedMetadataStore): + # if the store is a ConsolidatedMetadataStore, just return it + return path.store + raise ValueError(f"Unsupported store type or zarr.Group: {type(path.store)}") if isinstance(path, zarr.storage.StoreLike): # if the input already a store, wrap it in an FSStore return FSStore(path, **kwargs) @@ -402,3 +416,14 @@ def _open_zarr_store(path: StoreLike, **kwargs) -> zarr.storage.BaseStore: # if input is a remote UPath, map it to an FSStore return FSStore(path.path, fs=path.fs, **kwargs) raise TypeError(f"Unsupported type: {type(path)}") + + +def _create_upath(path: StoreLike) -> UPath: + # try to create a UPath from the input + if isinstance(path, str | Path): + return Path(path) + if hasattr(path, "store") and isinstance(path.store, zarr.storage.ConsolidatedMetadataStore): + # create a url from the ConsolidatedMetadataStore and append it with the path from the Group StoreLike object + return UPath(path.store.store.path) / path.path + # best effort to create a UPath + return UPath(path) From ca824937ffdf0265f45091f4d6b64f061b12214e Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Sat, 1 Feb 2025 09:56:13 +0100 Subject: [PATCH 15/62] allow for groups as store input --- src/spatialdata/_io/io_points.py | 4 +--- src/spatialdata/_io/io_raster.py | 15 +++++++++++---- src/spatialdata/_io/io_shapes.py | 16 +++------------- 3 files changed, 15 insertions(+), 20 deletions(-) diff --git a/src/spatialdata/_io/io_points.py b/src/spatialdata/_io/io_points.py index 3106c8470..2c3193a41 100644 --- a/src/spatialdata/_io/io_points.py +++ b/src/spatialdata/_io/io_points.py @@ -24,9 +24,7 @@ def _read_points( store: str | Path | MutableMapping | zarr.Group, # type: ignore[type-arg] ) -> DaskDataFrame: """Read points from a zarr store.""" - assert isinstance(store, str | Path) - f = zarr.open(store, mode="r") - + f = zarr.open(store, mode="r") if isinstance(store, str | Path | MutableMapping) else store version = _parse_version(f, expect_attrs_key=True) assert version is not None format = 
PointsFormats[version] diff --git a/src/spatialdata/_io/io_raster.py b/src/spatialdata/_io/io_raster.py index a7b06e26c..345619d37 100644 --- a/src/spatialdata/_io/io_raster.py +++ b/src/spatialdata/_io/io_raster.py @@ -13,9 +13,14 @@ from ome_zarr.writer import write_labels as write_labels_ngff from ome_zarr.writer import write_multiscale as write_multiscale_ngff from ome_zarr.writer import write_multiscale_labels as write_multiscale_labels_ngff +from upath import UPath from xarray import DataArray, Dataset, DataTree -from spatialdata._io._utils import _get_transformations_from_ngff_dict, overwrite_coordinate_transformations_raster +from spatialdata._io._utils import ( + _get_transformations_from_ngff_dict, + _open_zarr_store, + overwrite_coordinate_transformations_raster, +) from spatialdata._io.format import CurrentRasterFormat, RasterFormats, RasterFormatV01, _parse_version from spatialdata._utils import get_pyramid_levels from spatialdata.models._utils import get_channel_names @@ -28,10 +33,12 @@ ) -def _read_multiscale(store: zarr.storage.BaseStore, raster_type: Literal["image", "labels"]) -> DataArray | DataTree: - assert isinstance(store, zarr.storage.BaseStore) +def _read_multiscale( + store: str | UPath | zarr.storage.BaseStore, raster_type: Literal["image", "labels"] +) -> DataArray | DataTree: assert raster_type in ["image", "labels"] - + if isinstance(store, str | UPath): + store = _open_zarr_store(store) group = zarr.group(store=store) version = _parse_version(group, expect_attrs_key=True) # old spatialdata datasets don't have format metadata for raster elements; this line ensure backwards compatibility, diff --git a/src/spatialdata/_io/io_shapes.py b/src/spatialdata/_io/io_shapes.py index c32ce1f34..a7d52a51c 100644 --- a/src/spatialdata/_io/io_shapes.py +++ b/src/spatialdata/_io/io_shapes.py @@ -12,26 +12,16 @@ _write_metadata, overwrite_coordinate_transformations_non_raster, ) -from spatialdata._io.format import ( - CurrentShapesFormat, - ShapesFormats, - ShapesFormatV01, - ShapesFormatV02, - _parse_version, -) +from spatialdata._io.format import CurrentShapesFormat, ShapesFormats, ShapesFormatV01, ShapesFormatV02, _parse_version from spatialdata.models import ShapesModel, get_axes_names -from spatialdata.transformations._utils import ( - _get_transformations, - _set_transformations, -) +from spatialdata.transformations._utils import _get_transformations, _set_transformations def _read_shapes( store: str | Path | MutableMapping | zarr.Group, # type: ignore[type-arg] ) -> GeoDataFrame: """Read shapes from a zarr store.""" - assert isinstance(store, str | Path) - f = zarr.open(store, mode="r") + f = zarr.open(store, mode="r") if isinstance(store, str | Path | MutableMapping) else store version = _parse_version(f, expect_attrs_key=True) assert version is not None format = ShapesFormats[version] From ecea0e62a3ac6f25499cbd8eca08577b92d05958 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Sat, 1 Feb 2025 10:10:50 +0100 Subject: [PATCH 16/62] handle consolidated metadata with upath --- src/spatialdata/_io/_utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index 4faf085a2..eae5aef67 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -406,8 +406,12 @@ def _open_zarr_store(path: StoreLike, **kwargs) -> zarr.storage.BaseStore: # but extend the path of the store with that of the zarr.Group return FSStore(path.store.path + "/" + path.path, fs=path.store.fs, 
**kwargs) if isinstance(path.store, zarr.storage.ConsolidatedMetadataStore): + # TODO: find a way to check if the consolidated metadata is still used. Probably best to wait for Zarr v3. # if the store is a ConsolidatedMetadataStore, just return it - return path.store + # get the FSStore url path from store and append it with the path from the Group StoreLike object + url = UPath(path.store.store.path + path.path) + # same as UPath option + return FSStore(url.path, fs=url.fs, **kwargs) raise ValueError(f"Unsupported store type or zarr.Group: {type(path.store)}") if isinstance(path, zarr.storage.StoreLike): # if the input already a store, wrap it in an FSStore From 734eb45a2a43b5f22b9f951a59e936474cb39902 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Sat, 1 Feb 2025 10:11:18 +0100 Subject: [PATCH 17/62] split remote reading tests between http and http with consolidated metadata --- tests/io/test_remote.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/io/test_remote.py b/tests/io/test_remote.py index dd47d464f..4611a5587 100644 --- a/tests/io/test_remote.py +++ b/tests/io/test_remote.py @@ -6,7 +6,7 @@ class TestRemote: # Test actual remote datasets from https://spatialdata.scverse.org/en/latest/tutorials/notebooks/datasets/README.html - # These tests are disabled by default because they require internet access + # TODO: mark these tests so they are disabled by default because they require internet access @pytest.fixture(params=["merfish", "mibitof", "mibitof_alt"]) def s3_address(self, request): urls = { @@ -16,9 +16,16 @@ def s3_address(self, request): } return urls[request.param] - # TODO: does not work, problem with opening version 0.6-dev + # TODO: does not work for EMBL datasets, problem with opening version 0.6-dev @pytest.mark.skip(reason="Problem with ome_zarr on test datasets: ValueError: Version 0.6-dev not recognized") def test_remote(self, s3_address): + # TODO: remove selection once support for points, shapes and tables is added + sdata = SpatialData.read(s3_address, selection=("images", "labels")) + assert len(list(sdata.gen_elements())) > 0 + + @pytest.mark.skip(reason="Problem with ome_zarr on test datasets: ValueError: Version 0.6-dev not recognized") + def test_remote_consolidated(self, s3_address): + # TODO: find a way to check if the consolidated metadata is actually used. Probably best to wait for Zarr v3. 
root = zarr.open_consolidated(s3_address, mode="r", metadata_key="zmetadata") # TODO: remove selection once support for points, shapes and tables is added sdata = SpatialData.read(root, selection=("images", "labels")) From c0ffb1c13118f23d3e14b7aa4d6510cb07e2ee49 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Sat, 1 Feb 2025 10:14:13 +0100 Subject: [PATCH 18/62] remove f_store_path, support remote raster types fully and keep local support for other elements --- src/spatialdata/_io/io_zarr.py | 100 ++++++++++++++++++++------------- 1 file changed, 60 insertions(+), 40 deletions(-) diff --git a/src/spatialdata/_io/io_zarr.py b/src/spatialdata/_io/io_zarr.py index 0be7c8f4f..a9d2cf986 100644 --- a/src/spatialdata/_io/io_zarr.py +++ b/src/spatialdata/_io/io_zarr.py @@ -1,13 +1,15 @@ import logging -import os import warnings -from pathlib import Path import zarr +import zarr.storage from anndata import AnnData +from dask.dataframe import DataFrame as DaskDataFrame +from geopandas import GeoDataFrame +from xarray import DataArray, DataTree from spatialdata._core.spatialdata import SpatialData -from spatialdata._io._utils import ome_zarr_logger +from spatialdata._io._utils import StoreLike, _create_upath, _open_zarr_store, ome_zarr_logger from spatialdata._io.io_points import _read_points from spatialdata._io.io_raster import _read_multiscale from spatialdata._io.io_shapes import _read_shapes @@ -15,28 +17,51 @@ from spatialdata._logging import logger -def _open_zarr_store(store: str | Path | zarr.Group) -> tuple[zarr.Group, str]: +def read_image_element(path: StoreLike) -> DataArray | DataTree: + """Read a single image element from a store location. + + Parameters + ---------- + path + Path to the zarr store. + + Returns + ------- + A DataArray or DataTree object. """ - Open a zarr store (on-disk or remote) and return the zarr.Group object and the path to the store. + store = _open_zarr_store(path) + return _read_multiscale(store, raster_type="image") + + +def read_labels_element(path: StoreLike) -> DataArray | DataTree: + """Read a single image element from a store location. Parameters ---------- - store - Path to the zarr store (on-disk or remote) or a zarr.Group object. + path + Path to the zarr store. Returns ------- - A tuple of the zarr.Group object and the path to the store. + A DataArray or DataTree object. """ - f = store if isinstance(store, zarr.Group) else zarr.open(store, mode="r") - # workaround: .zmetadata is being written as zmetadata (https://github.com/zarr-developers/zarr-python/issues/1121) - if isinstance(store, str | Path) and str(store).startswith("http") and len(f) == 0: - f = zarr.open_consolidated(store, mode="r", metadata_key="zmetadata") - f_store_path = f.store.store.path if isinstance(f.store, zarr.storage.ConsolidatedMetadataStore) else f.store.path - return f, f_store_path + store = _open_zarr_store(path) + return _read_multiscale(store, raster_type="labels") + + +def read_points_element() -> DaskDataFrame: + pass -def read_zarr(store: str | Path | zarr.Group, selection: None | tuple[str] = None) -> SpatialData: +def read_shapes_element() -> GeoDataFrame: + pass + + +def read_table_element() -> AnnData: + pass + + +def read_zarr(store_like: StoreLike, selection: None | tuple[str] = None) -> SpatialData: """ Read a SpatialData dataset from a zarr store (on-disk or remote). @@ -53,7 +78,10 @@ def read_zarr(store: str | Path | zarr.Group, selection: None | tuple[str] = Non ------- A SpatialData object. 
""" - f, f_store_path = _open_zarr_store(store) + store = _open_zarr_store(store_like) + f = zarr.group(store) + # TODO: remove table once deprecated. + selector = {"images", "labels", "points", "shapes", "tables", "table"} if not selection else set(selection or []) images = {} labels = {} @@ -61,21 +89,18 @@ def read_zarr(store: str | Path | zarr.Group, selection: None | tuple[str] = Non tables: dict[str, AnnData] = {} shapes = {} - # TODO: remove table once deprecated. - selector = {"images", "labels", "points", "shapes", "tables", "table"} if not selection else set(selection or []) logger.debug(f"Reading selection {selector}") # read multiscale images if "images" in selector and "images" in f: - group = f["images"] + group = f.images count = 0 for subgroup_name in group: - if Path(subgroup_name).name.startswith("."): + if subgroup_name.startswith("."): # skip hidden files like .zgroup or .zmetadata continue f_elem = group[subgroup_name] - f_elem_store = os.path.join(f_store_path, f_elem.path) - element = _read_multiscale(f_elem_store, raster_type="image") + element = read_image_element(f_elem) images[subgroup_name] = element count += 1 logger.debug(f"Found {count} elements in {group}") @@ -83,15 +108,14 @@ def read_zarr(store: str | Path | zarr.Group, selection: None | tuple[str] = Non # read multiscale labels with ome_zarr_logger(logging.ERROR): if "labels" in selector and "labels" in f: - group = f["labels"] + group = f.labels count = 0 for subgroup_name in group: - if Path(subgroup_name).name.startswith("."): + if subgroup_name.startswith("."): # skip hidden files like .zgroup or .zmetadata continue f_elem = group[subgroup_name] - f_elem_store = os.path.join(f_store_path, f_elem.path) - labels[subgroup_name] = _read_multiscale(f_elem_store, raster_type="labels") + labels[subgroup_name] = read_labels_element(f_elem) count += 1 logger.debug(f"Found {count} elements in {group}") @@ -100,12 +124,11 @@ def read_zarr(store: str | Path | zarr.Group, selection: None | tuple[str] = Non group = f["points"] count = 0 for subgroup_name in group: - f_elem = group[subgroup_name] - if Path(subgroup_name).name.startswith("."): + if subgroup_name.startswith("."): # skip hidden files like .zgroup or .zmetadata continue - f_elem_store = os.path.join(f_store_path, f_elem.path) - points[subgroup_name] = _read_points(f_elem_store) + f_elem = group[subgroup_name] + points[subgroup_name] = _read_points(f_elem) count += 1 logger.debug(f"Found {count} elements in {group}") @@ -113,30 +136,26 @@ def read_zarr(store: str | Path | zarr.Group, selection: None | tuple[str] = Non group = f["shapes"] count = 0 for subgroup_name in group: - if Path(subgroup_name).name.startswith("."): + if subgroup_name.startswith("."): # skip hidden files like .zgroup or .zmetadata continue f_elem = group[subgroup_name] - f_elem_store = os.path.join(f_store_path, f_elem.path) - shapes[subgroup_name] = _read_shapes(f_elem_store) + shapes[subgroup_name] = _read_shapes(f_elem) count += 1 logger.debug(f"Found {count} elements in {group}") if "tables" in selector and "tables" in f: group = f["tables"] - tables = _read_table(f_store_path, f, group, tables) + tables = _read_table(zarr_store_path=str(store_like), group=f, subgroup=group, tables=tables) if "table" in selector and "table" in f: warnings.warn( - f"Table group found in zarr store at location {f_store_path}. Please update the zarr store to use tables " - f"instead.", + f"Table group found in zarr store at location {f}. 
Please update the zarr store to use tables instead.", DeprecationWarning, stacklevel=2, ) subgroup_name = "table" group = f[subgroup_name] - tables = _read_table(f_store_path, f, group, tables) - - logger.debug(f"Found {count} elements in {group}") + tables = _read_table(zarr_store_path=store_like, group=f, subgroup=group, tables=tables) # read attrs metadata attrs = f.attrs.asdict() @@ -155,5 +174,6 @@ def read_zarr(store: str | Path | zarr.Group, selection: None | tuple[str] = Non tables=tables, attrs=attrs, ) - sdata.path = Path(store) + # TODO: create a UPath object from any StoreLike object + sdata.path = _create_upath(store_like) return sdata From d60bd8529552af2eb05e4c75322b4938a9689aec Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Sat, 1 Feb 2025 10:35:36 +0100 Subject: [PATCH 19/62] Fix metadata_key bug now that store is not always FSStore. Add extra comments. --- src/spatialdata/_core/spatialdata.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/spatialdata/_core/spatialdata.py b/src/spatialdata/_core/spatialdata.py index 7ad0dd9ad..eedc50315 100644 --- a/src/spatialdata/_core/spatialdata.py +++ b/src/spatialdata/_core/spatialdata.py @@ -1437,14 +1437,21 @@ def write_consolidated_metadata(self) -> None: from spatialdata._io._utils import _open_zarr_store store = _open_zarr_store(self.path) - # consolidate metadata to more easily support remote reading bug in zarr. In reality, 'zmetadata' is written - # instead of '.zmetadata' see discussion https://github.com/zarr-developers/zarr-python/issues/1121 - zarr.consolidate_metadata(store, metadata_key=".zmetadata") + # Note that the store can be local (which does not have the zmetadata bug) + # or a remote FSStore (which has the bug). + # Consolidate metadata to more easily support remote reading bug in zarr. + # We write 'zmetadata' instead of the standard '.zmetadata' to avoid the FSStore bug. + # See discussion https://github.com/zarr-developers/zarr-python/issues/1121 + zarr.consolidate_metadata(store, metadata_key="zmetadata") store.close() def has_consolidated_metadata(self) -> bool: + from spatialdata._io._utils import _open_zarr_store + return_value = False - store = parse_url(self.path, mode="r").store + store = _open_zarr_store(self.path) + # Note that the store can be local (which does not have the zmetadata bug) + # or a remote FSStore (which has the bug). 
if "zmetadata" in store: return_value = True store.close() From c3fa8cf0b1caf3a418da2582bd1cfb4614d0ac42 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Sat, 1 Feb 2025 10:37:41 +0100 Subject: [PATCH 20/62] add mypy fixes --- src/spatialdata/_io/_utils.py | 2 +- src/spatialdata/_io/io_zarr.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index eae5aef67..d4247ccef 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -388,7 +388,7 @@ def save_transformations(sdata: SpatialData) -> None: StoreLike: TypeAlias = str | Path | UPath | zarr.storage.StoreLike | zarr.Group -def _open_zarr_store(path: StoreLike, **kwargs) -> zarr.storage.BaseStore: +def _open_zarr_store(path: StoreLike, **kwargs: Any) -> zarr.storage.BaseStore: # TODO: ensure kwargs like mode are enforced everywhere and passed correctly to the store if isinstance(path, str | Path): # if the input is str or Path, map it to UPath diff --git a/src/spatialdata/_io/io_zarr.py b/src/spatialdata/_io/io_zarr.py index a9d2cf986..e8a208a49 100644 --- a/src/spatialdata/_io/io_zarr.py +++ b/src/spatialdata/_io/io_zarr.py @@ -155,7 +155,7 @@ def read_zarr(store_like: StoreLike, selection: None | tuple[str] = None) -> Spa ) subgroup_name = "table" group = f[subgroup_name] - tables = _read_table(zarr_store_path=store_like, group=f, subgroup=group, tables=tables) + tables = _read_table(zarr_store_path=str(store_like), group=f, subgroup=group, tables=tables) # read attrs metadata attrs = f.attrs.asdict() From 23f4a89689fcc291def86352778f95eb3c8efa05 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 17 Mar 2025 10:42:40 +0000 Subject: [PATCH 21/62] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/spatialdata/_io/io_zarr.py | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/src/spatialdata/_io/io_zarr.py b/src/spatialdata/_io/io_zarr.py index 760b97655..c9ffa16e2 100644 --- a/src/spatialdata/_io/io_zarr.py +++ b/src/spatialdata/_io/io_zarr.py @@ -1,10 +1,7 @@ import logging -import os import warnings from json import JSONDecodeError -from pathlib import Path -from typing import Literal -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal import zarr from anndata import AnnData @@ -12,25 +9,25 @@ from zarr.errors import ArrayNotFoundError, MetadataError from spatialdata._core.spatialdata import SpatialData -from spatialdata._io._utils import BadFileHandleMethod, handle_read_errors, ome_zarr_logger -from spatialdata._io._utils import StoreLike -from spatialdata._io._utils import _create_upath -from spatialdata._io._utils import _open_zarr_store -from spatialdata._io.io_points import _read_points +from spatialdata._io._utils import ( + BadFileHandleMethod, + StoreLike, + _create_upath, + _open_zarr_store, + handle_read_errors, + ome_zarr_logger, +) from spatialdata._io.io_raster import _read_multiscale -from spatialdata._io.io_shapes import _read_shapes -from spatialdata._io.io_table import _read_table from spatialdata._logging import logger if TYPE_CHECKING: - from xarray import DataArray - from xarray import DataTree from dask.dataframe import DataFrame as DaskDataFrame from geopandas import GeoDataFrame + from xarray import DataArray, DataTree def is_hidden_zarr_entry(name: str) -> bool: - """skip hidden files like .zgroup or .zmetadata""" 
+ """Skip hidden files like .zgroup or .zmetadata""" return name.rpartition("/")[2].startswith(".") @@ -238,8 +235,7 @@ def read_zarr( if "table" in selector and "table" in f: warnings.warn( - f"Table group found in zarr store at location {store}. Please update the zarr store to use tables " - f"instead.", + f"Table group found in zarr store at location {store}. Please update the zarr store to use tables instead.", DeprecationWarning, stacklevel=2, ) From a80588c021ba3e3e4e313c53478b464c43136462 Mon Sep 17 00:00:00 2001 From: Andreas Poehlmann Date: Mon, 17 Mar 2025 11:47:07 +0100 Subject: [PATCH 22/62] Fix linting errors --- src/spatialdata/_io/io_zarr.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/spatialdata/_io/io_zarr.py b/src/spatialdata/_io/io_zarr.py index c9ffa16e2..a0377c360 100644 --- a/src/spatialdata/_io/io_zarr.py +++ b/src/spatialdata/_io/io_zarr.py @@ -27,7 +27,7 @@ def is_hidden_zarr_entry(name: str) -> bool: - """Skip hidden files like .zgroup or .zmetadata""" + """Skip hidden files like '.zgroup' or '.zmetadata'.""" return name.rpartition("/")[2].startswith(".") @@ -64,11 +64,11 @@ def read_labels_element(path: StoreLike) -> DataArray | DataTree: def read_points_element(path: StoreLike) -> DaskDataFrame: - pass + raise NotImplementedError def read_shapes_element(path: StoreLike) -> GeoDataFrame: - pass + raise NotImplementedError def read_tables_element( @@ -78,7 +78,7 @@ def read_tables_element( tables: dict[str, AnnData], on_bad_files: Literal[BadFileHandleMethod.ERROR, BadFileHandleMethod.WARN] = BadFileHandleMethod.ERROR, ) -> dict[str, AnnData]: - pass + raise NotImplementedError def read_zarr( From 020810bb6f9d4270980567aa5afbb3551960862f Mon Sep 17 00:00:00 2001 From: Andreas Poehlmann Date: Mon, 17 Mar 2025 14:21:36 +0100 Subject: [PATCH 23/62] fixed majority of tests --- pyproject.toml | 2 +- src/spatialdata/_io/_utils.py | 2 ++ src/spatialdata/_io/io_table.py | 8 ++------ src/spatialdata/_io/io_zarr.py | 20 +++++++++++++++++--- 4 files changed, 22 insertions(+), 10 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2163c7c5a..4ef7af057 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,7 @@ dependencies = [ "numba>=0.55.0", "numpy", "ome_zarr>=0.8.4", - "universal_pathlib>=0.2.0", + "universal_pathlib>=0.2.6", "pandas", "pooch", "pyarrow", diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index 49e400e9b..35a803e15 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -432,6 +432,8 @@ def _create_upath(path: StoreLike) -> UPath: if hasattr(path, "store") and isinstance(path.store, zarr.storage.ConsolidatedMetadataStore): # create a url from the ConsolidatedMetadataStore and append it with the path from the Group StoreLike object return UPath(path.store.store.path) / path.path + if isinstance(path, zarr.storage.BaseStore): + return UPath(path.path) # best effort to create a UPath return UPath(path) diff --git a/src/spatialdata/_io/io_table.py b/src/spatialdata/_io/io_table.py index 023129d50..904eeba08 100644 --- a/src/spatialdata/_io/io_table.py +++ b/src/spatialdata/_io/io_table.py @@ -1,6 +1,5 @@ from __future__ import annotations -import os from json import JSONDecodeError from typing import Literal @@ -48,22 +47,19 @@ def _read_table( count = 0 for table_name in subgroup: f_elem = subgroup[table_name] - f_elem_store = os.path.join(zarr_store_path, f_elem.path) with handle_read_errors( on_bad_files=on_bad_files, 
location=f"{subgroup.path}/{table_name}", exc_types=(JSONDecodeError, KeyError, ValueError, ArrayNotFoundError), ): - tables[table_name] = read_anndata_zarr(f_elem_store) + tables[table_name] = read_anndata_zarr(f_elem) - f = zarr.open(f_elem_store, mode="r") - version = _parse_version(f, expect_attrs_key=False) + version = _parse_version(f_elem, expect_attrs_key=False) assert version is not None # since have just one table format, we currently read it but do not use it; if we ever change the format # we can rename the two _ to format and implement the per-format read logic (as we do for shapes) _ = TablesFormats[version] - f.store.close() # # replace with format from above # version = "0.1" diff --git a/src/spatialdata/_io/io_zarr.py b/src/spatialdata/_io/io_zarr.py index a0377c360..351e1dec2 100644 --- a/src/spatialdata/_io/io_zarr.py +++ b/src/spatialdata/_io/io_zarr.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import logging import warnings from json import JSONDecodeError @@ -17,7 +19,10 @@ handle_read_errors, ome_zarr_logger, ) +from spatialdata._io.io_points import _read_points from spatialdata._io.io_raster import _read_multiscale +from spatialdata._io.io_shapes import _read_shapes +from spatialdata._io.io_table import _read_table from spatialdata._logging import logger if TYPE_CHECKING: @@ -64,11 +69,13 @@ def read_labels_element(path: StoreLike) -> DataArray | DataTree: def read_points_element(path: StoreLike) -> DaskDataFrame: - raise NotImplementedError + store = _open_zarr_store(path) + return _read_points(store) def read_shapes_element(path: StoreLike) -> GeoDataFrame: - raise NotImplementedError + store = _open_zarr_store(path) + return _read_shapes(store) def read_tables_element( @@ -78,7 +85,14 @@ def read_tables_element( tables: dict[str, AnnData], on_bad_files: Literal[BadFileHandleMethod.ERROR, BadFileHandleMethod.WARN] = BadFileHandleMethod.ERROR, ) -> dict[str, AnnData]: - raise NotImplementedError + store = _open_zarr_store(zarr_store_path) + return _read_table( + store, + group, + subgroup, + tables, + on_bad_files, + ) def read_zarr( From ba25564353a5a2516922cd3faef2737764d0c96b Mon Sep 17 00:00:00 2001 From: Andreas Poehlmann Date: Mon, 17 Mar 2025 15:29:21 +0100 Subject: [PATCH 24/62] spatialdata._io._utils: _open_zarr_store has to set dimension_separator to emulate previous ome_zarr.io.parse_url --- src/spatialdata/_io/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index 35a803e15..45f048da4 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -398,7 +398,7 @@ def _open_zarr_store(path: StoreLike, **kwargs: Any) -> zarr.storage.BaseStore: path = UPath(path) if isinstance(path, PosixUPath | WindowsUPath): # if the input is a local path, use DirectoryStore - return zarr.storage.DirectoryStore(path.path) + return zarr.storage.DirectoryStore(path.path, dimension_separator="/") if isinstance(path, zarr.Group): # if the input is a zarr.Group, wrap it with a store if isinstance(path.store, zarr.storage.DirectoryStore): From b2ff8f847b6fe374788369ef624c4898f92661c3 Mon Sep 17 00:00:00 2001 From: Andreas Poehlmann Date: Mon, 17 Mar 2025 16:07:58 +0100 Subject: [PATCH 25/62] stay in sync with ome zarr format --- src/spatialdata/_io/io_zarr.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/spatialdata/_io/io_zarr.py b/src/spatialdata/_io/io_zarr.py index 351e1dec2..157599ba9 100644 --- a/src/spatialdata/_io/io_zarr.py 
+++ b/src/spatialdata/_io/io_zarr.py @@ -48,7 +48,9 @@ def read_image_element(path: StoreLike) -> DataArray | DataTree: ------- A DataArray or DataTree object. """ - store = _open_zarr_store(path) + # stay in sync with ome v4 format spec: + # https://github.com/ome/ome-zarr-py/blob/7d1ae35c97/ome_zarr/format.py#L189-L192 + store = _open_zarr_store(path, dimension_separator="/", normalize_keys=False) return _read_multiscale(store, raster_type="image") From 70480cea04e84858314421244af6dbb15d53b775 Mon Sep 17 00:00:00 2001 From: Andreas Poehlmann Date: Mon, 17 Mar 2025 17:02:24 +0100 Subject: [PATCH 26/62] spatialdata._io.io_raster: support remote stores --- src/spatialdata/_io/io_raster.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/spatialdata/_io/io_raster.py b/src/spatialdata/_io/io_raster.py index 345619d37..824b17ff0 100644 --- a/src/spatialdata/_io/io_raster.py +++ b/src/spatialdata/_io/io_raster.py @@ -18,7 +18,6 @@ from spatialdata._io._utils import ( _get_transformations_from_ngff_dict, - _open_zarr_store, overwrite_coordinate_transformations_raster, ) from spatialdata._io.format import CurrentRasterFormat, RasterFormats, RasterFormatV01, _parse_version @@ -33,12 +32,10 @@ ) -def _read_multiscale( - store: str | UPath | zarr.storage.BaseStore, raster_type: Literal["image", "labels"] -) -> DataArray | DataTree: +def _read_multiscale(store: zarr.storage.BaseStore, raster_type: Literal["image", "labels"]) -> DataArray | DataTree: assert raster_type in ["image", "labels"] if isinstance(store, str | UPath): - store = _open_zarr_store(store) + raise NotImplementedError("removed in this PR") group = zarr.group(store=store) version = _parse_version(group, expect_attrs_key=True) # old spatialdata datasets don't have format metadata for raster elements; this line ensure backwards compatibility, @@ -47,7 +44,7 @@ def _read_multiscale( store.close() nodes: list[Node] = [] - image_loc = ZarrLocation(store) + image_loc = ZarrLocation(store, fmt=format) if image_loc.exists(): image_reader = Reader(image_loc)() image_nodes = list(image_reader) From 10cef3f6081d6988f1e0ae11acce60cd924b06b0 Mon Sep 17 00:00:00 2001 From: Andreas Poehlmann Date: Mon, 17 Mar 2025 17:22:31 +0100 Subject: [PATCH 27/62] prevent crashing tests on 3.10 --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 17d33bb16..f7b5ba3e4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,8 @@ dependencies = [ "rich", "setuptools", "shapely>=2.0.1", - "spatial_image>=1.1.0", + "spatial_image>=1.2.1", + "xarray-dataclasses>=1.9.1", "scikit-image", "scipy", "typing_extensions>=4.8.0", From ac95ecf5e436fa296afc40b9122f3590e5732694 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Fri, 24 Jan 2025 10:34:26 +0100 Subject: [PATCH 28/62] add basic support and remote SpatialData tests --- pyproject.toml | 11 +- tests/io/test_remote.py | 23 ++++ tests/io/test_remote_mock.py | 249 +++++++++++++++++++++++++++++++++++ 3 files changed, 281 insertions(+), 2 deletions(-) create mode 100644 tests/io/test_remote.py create mode 100644 tests/io/test_remote_mock.py diff --git a/pyproject.toml b/pyproject.toml index 1f16398d9..48f220124 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,14 +26,14 @@ dependencies = [ "click", "dask-image", "dask>=2024.4.1,<=2024.11.2", - "datashader", - "fsspec", + "fsspec[s3,http]", "geopandas>=0.14", "multiscale_spatial_image>=2.0.3", "networkx", "numba>=0.55.0", "numpy", "ome_zarr>=0.8.4", + 
"universal_pathlib>=0.2.0", "pandas", "pooch", "pyarrow", @@ -73,6 +73,13 @@ docs = [ "sphinx-copybutton", "sphinx-pytest", ] +test = [ + "pytest", + "pytest-cov", + "pytest-mock", + "torch", + "moto[s3,server]" +] benchmark = [ "asv", ] diff --git a/tests/io/test_remote.py b/tests/io/test_remote.py new file mode 100644 index 000000000..e5ee41eff --- /dev/null +++ b/tests/io/test_remote.py @@ -0,0 +1,23 @@ +import pytest +import zarr + +from spatialdata import SpatialData + + +class TestRemote: + # Test actual remote datasets from https://spatialdata.scverse.org/en/latest/tutorials/notebooks/datasets/README.html + # These tests are disabled by default because they require internet access + @pytest.fixture(params=["merfish", "mibitof"]) + def s3_address(self, request): + urls = { + "merfish": "https://s3.embl.de/spatialdata/spatialdata-sandbox/merfish.zarr/", + "mibitof": "https://s3.embl.de/spatialdata/spatialdata-sandbox/mibitof.zarr/", + } + return urls[request.param] + + # TODO: does not work, problem with opening remote parquet + @pytest.mark.xfail(reason="Problem with opening remote parquet") + def test_remote(self, s3_address): + root = zarr.open_consolidated(s3_address, mode="r", metadata_key="zmetadata") + sdata = SpatialData.read(root) + assert len(list(sdata.gen_elements())) > 0 diff --git a/tests/io/test_remote_mock.py b/tests/io/test_remote_mock.py new file mode 100644 index 000000000..a7a304c55 --- /dev/null +++ b/tests/io/test_remote_mock.py @@ -0,0 +1,249 @@ +import os +import shlex +import shutil +import subprocess +import time +import uuid +from pathlib import Path + +import fsspec +import pytest +from fsspec.implementations.local import LocalFileSystem, make_path_posix +from fsspec.registry import _registry, register_implementation +from fsspec.utils import stringify_path +from upath import UPath +from upath.implementations.cloud import S3Path + +from spatialdata import SpatialData +from spatialdata.testing import assert_spatial_data_objects_are_identical + + +## Mock setup from https://github.com/fsspec/universal_pathlib/blob/main/upath/tests/conftest.py +def posixify(path): + return str(path).replace("\\", "/") + + +class DummyTestFS(LocalFileSystem): + protocol = "mock" + root_marker = "/" + + @classmethod + def _strip_protocol(cls, path): + path = stringify_path(path) + if path.startswith("mock://"): + path = path[7:] + elif path.startswith("mock:"): + path = path[5:] + return make_path_posix(path).rstrip("/") or cls.root_marker + + +@pytest.fixture(scope="session") +def clear_registry(): + register_implementation("mock", DummyTestFS) + try: + yield + finally: + _registry.clear() + + +@pytest.fixture(scope="session") +def s3_server(): + # writable local S3 system + if "BOTO_CONFIG" not in os.environ: # pragma: no cover + os.environ["BOTO_CONFIG"] = "/dev/null" + if "AWS_ACCESS_KEY_ID" not in os.environ: # pragma: no cover + os.environ["AWS_ACCESS_KEY_ID"] = "testing" + if "AWS_SECRET_ACCESS_KEY" not in os.environ: # pragma: no cover + os.environ["AWS_SECRET_ACCESS_KEY"] = "testing" + if "AWS_SECURITY_TOKEN" not in os.environ: # pragma: no cover + os.environ["AWS_SECURITY_TOKEN"] = "testing" + if "AWS_SESSION_TOKEN" not in os.environ: # pragma: no cover + os.environ["AWS_SESSION_TOKEN"] = "testing" + if "AWS_DEFAULT_REGION" not in os.environ: # pragma: no cover + os.environ["AWS_DEFAULT_REGION"] = "us-east-1" + requests = pytest.importorskip("requests") + + pytest.importorskip("moto") + + port = 5555 + endpoint_uri = f"http://127.0.0.1:{port}/" + proc = 
subprocess.Popen( + shlex.split(f"moto_server -p {port}"), + stderr=subprocess.DEVNULL, + stdout=subprocess.DEVNULL, + ) + try: + timeout = 5 + while timeout > 0: + try: + r = requests.get(endpoint_uri, timeout=10) + if r.ok: + break + except requests.exceptions.RequestException: # pragma: no cover + pass + timeout -= 0.1 # pragma: no cover + time.sleep(0.1) # pragma: no cover + anon = False + s3so = { + "client_kwargs": {"endpoint_url": endpoint_uri}, + "use_listings_cache": True, + } + yield anon, s3so + finally: + proc.terminate() + proc.wait() + + +@pytest.fixture(scope="function") +def s3_fixture(s3_server): + pytest.importorskip("s3fs") + anon, s3so = s3_server + s3 = fsspec.filesystem("s3", anon=False, **s3so) + random_name = uuid.uuid4().hex + bucket_name = f"test_{random_name}" + if s3.exists(bucket_name): + for dir, _, keys in s3.walk(bucket_name): + for key in keys: + s3.rm(f"{dir}/{key}") + else: + s3.mkdir(bucket_name) + # for x in Path(local_testdir).glob("**/*"): + # target_path = f"{bucket_name}/{posixify(x.relative_to(local_testdir))}" + # if x.is_file(): + # s3.upload(str(x), target_path) + s3.invalidate_cache() + yield f"s3://{bucket_name}", anon, s3so + + +@pytest.fixture(scope="session") +def http_server(tmp_path_factory): + http_tempdir = tmp_path_factory.mktemp("http") + + requests = pytest.importorskip("requests") + pytest.importorskip("http.server") + proc = subprocess.Popen(shlex.split(f"python -m http.server --directory {http_tempdir} 8080")) + try: + url = "http://127.0.0.1:8080/folder" + path = Path(http_tempdir) / "folder" + path.mkdir() + timeout = 10 + while True: + try: + r = requests.get(url, timeout=10) + if r.ok: + yield path, url + break + except requests.exceptions.RequestException as e: # noqa: E722 + timeout -= 1 + if timeout < 0: + raise SystemError from e + time.sleep(1) + finally: + proc.terminate() + proc.wait() + + +@pytest.fixture +def http_fixture(local_testdir, http_server): + http_path, http_url = http_server + shutil.rmtree(http_path) + shutil.copytree(local_testdir, http_path) + yield http_url + + +class TestRemoteMock: + @pytest.fixture(scope="function") + def upath(self, s3_fixture): + path, anon, s3so = s3_fixture + return UPath(path, anon=anon, **s3so) + + def test_is_S3Path(self, upath): + assert isinstance(upath, S3Path) + + # # Test UPath with Moto Mocking + def test_creating_file(self, upath): + file_name = "file1" + p1 = upath / file_name + p1.touch() + contents = [p.name for p in upath.iterdir()] + assert file_name in contents + + # TODO: fix this test + @pytest.mark.xfail(reason="Fails because remote support for ImageElement not yet implemented") + def test_images(self, upath: UPath, images: SpatialData) -> None: + tmpdir = upath / "tmp.zarr" + images.write(tmpdir) + sdata = SpatialData.read(tmpdir) + assert_spatial_data_objects_are_identical(images, sdata) + + # TODO: fix this test + @pytest.mark.xfail(reason="Fails because remote support for LabelsElement not yet implemented") + def test_labels(self, upath: UPath, labels: SpatialData) -> None: + tmpdir = upath / "tmp.zarr" + labels.write(tmpdir) + sdata = SpatialData.read(tmpdir) + assert_spatial_data_objects_are_identical(labels, sdata) + + # TODO: fix this test + @pytest.mark.xfail(reason="Fails because remote support for ShapesElement not yet implemented") + def test_shapes(self, upath: UPath, shapes: SpatialData) -> None: + import numpy as np + + tmpdir = upath / "tmp.zarr" + + # check the index is correctly written and then read + shapes["circles"].index = np.arange(1, 
len(shapes["circles"]) + 1) + + shapes.write(tmpdir) + sdata = SpatialData.read(tmpdir) + assert_spatial_data_objects_are_identical(shapes, sdata) + + # TODO: fix this test + @pytest.mark.xfail(reason="Fails because remote support for PointsElement not yet implemented") + def test_points(self, upath: UPath, points: SpatialData) -> None: + import dask.dataframe as dd + import numpy as np + + tmpdir = upath / "tmp.zarr" + + # check the index is correctly written and then read + new_index = dd.from_array(np.arange(1, len(points["points_0"]) + 1)) + points["points_0"] = points["points_0"].set_index(new_index) + + points.write(tmpdir) + sdata = SpatialData.read(tmpdir) + assert_spatial_data_objects_are_identical(points, sdata) + + def _test_table(self, upath: UPath, table: SpatialData) -> None: + tmpdir = upath / "tmp.zarr" + table.write(tmpdir) + sdata = SpatialData.read(tmpdir) + assert_spatial_data_objects_are_identical(table, sdata) + + def _test_read_elem(self, upath: UPath, table: SpatialData) -> None: + tmpdir = upath / "tmp.zarr" + store = zarr.open() + table.write(tmpdir) + # location of table + + + sdata = SpatialData.read(tmpdir) + assert_spatial_data_objects_are_identical(elem, sdata) + + # TODO: fix this test + @pytest.mark.xfail(reason="Fails because remote support for TableElement not yet implemented") + def test_single_table_single_annotation(self, upath: UPath, table_single_annotation: SpatialData) -> None: + self._test_table(upath, table_single_annotation) + + # TODO: fix this test + @pytest.mark.xfail(reason="Fails because remote support for TableElement not yet implemented") + def test_single_table_multiple_annotations(self, upath: UPath, table_multiple_annotations: SpatialData) -> None: + self._test_table(upath, table_multiple_annotations) + + # TODO: fix this test + @pytest.mark.xfail(reason="Fails because remote support for SpatialData not yet implemented") + def test_full_sdata(self, upath: UPath, full_sdata: SpatialData) -> None: + tmpdir = upath / "tmp.zarr" + full_sdata.write(tmpdir) + sdata = SpatialData.read(tmpdir) + assert_spatial_data_objects_are_identical(full_sdata, sdata) From 45375fab766456a5b5cfae6d87d2f51f55efa8c7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 24 Jan 2025 09:43:46 +0000 Subject: [PATCH 29/62] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/io/test_remote_mock.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/io/test_remote_mock.py b/tests/io/test_remote_mock.py index a7a304c55..1966e631a 100644 --- a/tests/io/test_remote_mock.py +++ b/tests/io/test_remote_mock.py @@ -226,7 +226,6 @@ def _test_read_elem(self, upath: UPath, table: SpatialData) -> None: table.write(tmpdir) # location of table - sdata = SpatialData.read(tmpdir) assert_spatial_data_objects_are_identical(elem, sdata) From ab1b0f23dd93466c2ee2c1dc1a9bbe1d9b51d09f Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Fri, 31 Jan 2025 10:48:26 +0100 Subject: [PATCH 30/62] fix pre-commit --- tests/io/test_remote_mock.py | 37 +++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/tests/io/test_remote_mock.py b/tests/io/test_remote_mock.py index 1966e631a..7eb76307f 100644 --- a/tests/io/test_remote_mock.py +++ b/tests/io/test_remote_mock.py @@ -1,6 +1,5 @@ import os import shlex -import shutil import subprocess import time import uuid @@ -17,10 +16,11 @@ from spatialdata import SpatialData from 
spatialdata.testing import assert_spatial_data_objects_are_identical - ## Mock setup from https://github.com/fsspec/universal_pathlib/blob/main/upath/tests/conftest.py -def posixify(path): - return str(path).replace("\\", "/") +# TODO: check if the tests work on Windows. If yes, we can remove this commented function and the already commented +# place where it is called +# def posixify(path): +# return str(path).replace("\\", "/") class DummyTestFS(LocalFileSystem): @@ -143,12 +143,13 @@ def http_server(tmp_path_factory): proc.wait() -@pytest.fixture -def http_fixture(local_testdir, http_server): - http_path, http_url = http_server - shutil.rmtree(http_path) - shutil.copytree(local_testdir, http_path) - yield http_url +# TODO: delete this fixture if it is not used +# @pytest.fixture +# def http_fixture(local_testdir, http_server): +# http_path, http_url = http_server +# shutil.rmtree(http_path) +# shutil.copytree(local_testdir, http_path) +# yield http_url class TestRemoteMock: @@ -221,13 +222,15 @@ def _test_table(self, upath: UPath, table: SpatialData) -> None: assert_spatial_data_objects_are_identical(table, sdata) def _test_read_elem(self, upath: UPath, table: SpatialData) -> None: - tmpdir = upath / "tmp.zarr" - store = zarr.open() - table.write(tmpdir) - # location of table - - sdata = SpatialData.read(tmpdir) - assert_spatial_data_objects_are_identical(elem, sdata) + pass + # TODO: fix + # tmpdir = upath / "tmp.zarr" + # store = zarr.open() + # table.write(tmpdir) + # # location of table + # + # sdata = SpatialData.read(tmpdir) + # assert_spatial_data_objects_are_identical(elem, sdata) # TODO: fix this test @pytest.mark.xfail(reason="Fails because remote support for TableElement not yet implemented") From c88b3a56ef7e8612bb03041b7fb575567b65b404 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Fri, 31 Jan 2025 10:52:20 +0100 Subject: [PATCH 31/62] Revert "Update pyproject.toml" This reverts commit 1f8a01cf97dee96a3fa957234b2f5e8a0732d7e0. 
--- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 48f220124..ab9f52bdb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,7 @@ maintainers = [ urls.Documentation = "https://spatialdata.scverse.org/en/latest" urls.Source = "https://github.com/scverse/spatialdata.git" urls.Home-page = "https://github.com/scverse/spatialdata.git" -requires-python = ">=3.10" +requires-python = ">=3.10, <3.13" # include 3.13 once multiscale-spatial-image conflicts are resolved dynamic= [ "version" # allow version to be set by git tags ] From 5d016ed8a7e72ee370b10590c3b4467a22bd4a86 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Fri, 31 Jan 2025 11:20:25 +0100 Subject: [PATCH 32/62] fix --- tests/io/test_remote_mock.py | 69 +++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 28 deletions(-) diff --git a/tests/io/test_remote_mock.py b/tests/io/test_remote_mock.py index 7eb76307f..5ddf70500 100644 --- a/tests/io/test_remote_mock.py +++ b/tests/io/test_remote_mock.py @@ -3,7 +3,6 @@ import subprocess import time import uuid -from pathlib import Path import fsspec import pytest @@ -115,35 +114,35 @@ def s3_fixture(s3_server): yield f"s3://{bucket_name}", anon, s3so -@pytest.fixture(scope="session") -def http_server(tmp_path_factory): - http_tempdir = tmp_path_factory.mktemp("http") - - requests = pytest.importorskip("requests") - pytest.importorskip("http.server") - proc = subprocess.Popen(shlex.split(f"python -m http.server --directory {http_tempdir} 8080")) - try: - url = "http://127.0.0.1:8080/folder" - path = Path(http_tempdir) / "folder" - path.mkdir() - timeout = 10 - while True: - try: - r = requests.get(url, timeout=10) - if r.ok: - yield path, url - break - except requests.exceptions.RequestException as e: # noqa: E722 - timeout -= 1 - if timeout < 0: - raise SystemError from e - time.sleep(1) - finally: - proc.terminate() - proc.wait() +# TODO: delete these 2 fixtures since they are not used +# @pytest.fixture(scope="session") +# def http_server(tmp_path_factory): +# http_tempdir = tmp_path_factory.mktemp("http") +# +# requests = pytest.importorskip("requests") +# pytest.importorskip("http.server") +# proc = subprocess.Popen(shlex.split(f"python -m http.server --directory {http_tempdir} 8080")) +# try: +# url = "http://127.0.0.1:8080/folder" +# path = Path(http_tempdir) / "folder" +# path.mkdir() +# timeout = 10 +# while True: +# try: +# r = requests.get(url, timeout=10) +# if r.ok: +# yield path, url +# break +# except requests.exceptions.RequestException as e: # noqa: E722 +# timeout -= 1 +# if timeout < 0: +# raise SystemError from e +# time.sleep(1) +# finally: +# proc.terminate() +# proc.wait() -# TODO: delete this fixture if it is not used # @pytest.fixture # def http_fixture(local_testdir, http_server): # http_path, http_url = http_server @@ -249,3 +248,17 @@ def test_full_sdata(self, upath: UPath, full_sdata: SpatialData) -> None: full_sdata.write(tmpdir) sdata = SpatialData.read(tmpdir) assert_spatial_data_objects_are_identical(full_sdata, sdata) + + def test_local_sdata_remote_image(self, upath: UPath, images: SpatialData) -> None: + pass + tmpdir = upath / "tmp.zarr" + images.write(tmpdir) + # with tempfile.TemporaryDirectory() as local_tmpdir: + # pass + # # images.write(local_tmpdir) + # # local_sdata = SpatialData.read(local_tmpdir) + # + # pass + # + # # sdata = SpatialData.read(tmpdir) + # # assert_spatial_data_objects_are_identical(local_sdata, sdata) From 
3549e16f60cbc80d8690c9ac24c27e18649476d1 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Fri, 31 Jan 2025 12:27:46 +0100 Subject: [PATCH 33/62] uploading sdata to local s3 storage --- tests/io/test_remote_mock.py | 54 +++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/tests/io/test_remote_mock.py b/tests/io/test_remote_mock.py index 5ddf70500..30b3a5feb 100644 --- a/tests/io/test_remote_mock.py +++ b/tests/io/test_remote_mock.py @@ -1,8 +1,10 @@ import os import shlex import subprocess +import tempfile import time import uuid +from pathlib import Path import fsspec import pytest @@ -15,11 +17,10 @@ from spatialdata import SpatialData from spatialdata.testing import assert_spatial_data_objects_are_identical + ## Mock setup from https://github.com/fsspec/universal_pathlib/blob/main/upath/tests/conftest.py -# TODO: check if the tests work on Windows. If yes, we can remove this commented function and the already commented -# place where it is called -# def posixify(path): -# return str(path).replace("\\", "/") +def posixify(path): + return str(path).replace("\\", "/") class DummyTestFS(LocalFileSystem): @@ -94,7 +95,7 @@ def s3_server(): @pytest.fixture(scope="function") -def s3_fixture(s3_server): +def s3_fixture(s3_server, full_sdata): pytest.importorskip("s3fs") anon, s3so = s3_server s3 = fsspec.filesystem("s3", anon=False, **s3so) @@ -106,10 +107,16 @@ def s3_fixture(s3_server): s3.rm(f"{dir}/{key}") else: s3.mkdir(bucket_name) - # for x in Path(local_testdir).glob("**/*"): - # target_path = f"{bucket_name}/{posixify(x.relative_to(local_testdir))}" - # if x.is_file(): - # s3.upload(str(x), target_path) + + # write a full spatialdata object to the bucket + with tempfile.TemporaryDirectory() as tempdir: + sdata_path = Path(tempdir) / "full_sdata.zarr" + full_sdata.write(sdata_path) + for x in sdata_path.glob("**/*"): + target_path = f"{bucket_name}/full_sdata.zarr/{posixify(x.relative_to(sdata_path))}" + if x.is_file(): + s3.upload(str(x), target_path) + s3.invalidate_cache() yield f"s3://{bucket_name}", anon, s3so @@ -161,7 +168,7 @@ def test_is_S3Path(self, upath): assert isinstance(upath, S3Path) # # Test UPath with Moto Mocking - def test_creating_file(self, upath): + def test_creating_file(self, upath: UPath) -> None: file_name = "file1" p1 = upath / file_name p1.touch() @@ -250,15 +257,18 @@ def test_full_sdata(self, upath: UPath, full_sdata: SpatialData) -> None: assert_spatial_data_objects_are_identical(full_sdata, sdata) def test_local_sdata_remote_image(self, upath: UPath, images: SpatialData) -> None: - pass - tmpdir = upath / "tmp.zarr" - images.write(tmpdir) - # with tempfile.TemporaryDirectory() as local_tmpdir: - # pass - # # images.write(local_tmpdir) - # # local_sdata = SpatialData.read(local_tmpdir) - # - # pass - # - # # sdata = SpatialData.read(tmpdir) - # # assert_spatial_data_objects_are_identical(local_sdata, sdata) + with tempfile.TemporaryDirectory() as tmpdir: + sdata_path = Path(tmpdir) / "full_sdata.zarr" + images.write(sdata_path) + local_sdata = SpatialData.read(sdata_path) # noqa: F841 + remote_path = upath / "full_sdata.zarr" # noqa: F841 + + # TODO: read a single remote image from the S3 data and add it to the local SpatialData object + # for a in remote_path.glob('**/*'): + # print(a) + # + # import zarr.storage + # from zarr.storage import FSStore + # store = zarr.storage.FSStore(fs=upath.fs, url=remote_path) + # list(store.keys()) + # store.close() From 71f8a8b7be3f82aa506d990203af602e943ff9db Mon Sep 
17 00:00:00 2001 From: Benjamin Rombaut Date: Fri, 31 Jan 2025 14:28:04 +0100 Subject: [PATCH 34/62] add _open_zarr_store --- src/spatialdata/_core/spatialdata.py | 29 ++++++------- src/spatialdata/_io/_utils.py | 64 +++++----------------------- tests/io/test_remote_mock.py | 3 ++ 3 files changed, 25 insertions(+), 71 deletions(-) diff --git a/src/spatialdata/_core/spatialdata.py b/src/spatialdata/_core/spatialdata.py index 48f6386ca..e36f98406 100644 --- a/src/spatialdata/_core/spatialdata.py +++ b/src/spatialdata/_core/spatialdata.py @@ -31,10 +31,7 @@ ) from spatialdata._logging import logger from spatialdata._types import ArrayLike, Raster_T -from spatialdata._utils import ( - _deprecation_alias, - _error_message_add_element, -) +from spatialdata._utils import _deprecation_alias, _error_message_add_element from spatialdata.models import ( Image2DModel, Image3DModel, @@ -601,7 +598,7 @@ def path(self, value: Path | None) -> None: ) def _get_groups_for_element( - self, zarr_path: Path, element_type: str, element_name: str + self, zarr_path: UPath, element_type: str, element_name: str ) -> tuple[zarr.Group, zarr.Group, zarr.Group]: """ Get the Zarr groups for the root, element_type and element for a specific element. @@ -621,9 +618,9 @@ def _get_groups_for_element( ------- either the existing Zarr subgroup or a new one. """ - if not isinstance(zarr_path, Path): - raise ValueError("zarr_path should be a Path object") - store = parse_url(zarr_path, mode="r+").store + from spatialdata._io._utils import _open_zarr_store + + store = _open_zarr_store(zarr_path, mode="r+") root = zarr.group(store=store) if element_type not in ["images", "labels", "points", "polygons", "shapes", "tables"]: raise ValueError(f"Unknown element type {element_type}") @@ -1376,7 +1373,7 @@ def delete_element_from_disk(self, element_name: str | list[str]) -> None: self.delete_element_from_disk(name) return - from spatialdata._io._utils import _backed_elements_contained_in_path + from spatialdata._io._utils import _backed_elements_contained_in_path, _open_zarr_store if self.path is None: raise ValueError("The SpatialData object is not backed by a Zarr store.") @@ -1417,7 +1414,7 @@ def delete_element_from_disk(self, element_name: str | list[str]) -> None: ) # delete the element - store = parse_url(self.path, mode="r+").store + store = _open_zarr_store(self.path) root = zarr.group(store=store) root[element_type].pop(element_name) store.close() @@ -1438,7 +1435,9 @@ def _check_element_not_on_disk_with_different_type(self, element_type: str, elem ) def write_consolidated_metadata(self) -> None: - store = parse_url(self.path, mode="r+").store + from spatialdata._io._utils import _open_zarr_store + + store = _open_zarr_store(self.path) # consolidate metadata to more easily support remote reading bug in zarr. 
In reality, 'zmetadata' is written # instead of '.zmetadata' see discussion https://github.com/zarr-developers/zarr-python/issues/1121 zarr.consolidate_metadata(store, metadata_key=".zmetadata") @@ -1575,15 +1574,11 @@ def write_transformations(self, element_name: str | None = None) -> None: ) axes = get_axes_names(element) if isinstance(element, DataArray | DataTree): - from spatialdata._io._utils import ( - overwrite_coordinate_transformations_raster, - ) + from spatialdata._io._utils import overwrite_coordinate_transformations_raster overwrite_coordinate_transformations_raster(group=element_group, axes=axes, transformations=transformations) elif isinstance(element, DaskDataFrame | GeoDataFrame | AnnData): - from spatialdata._io._utils import ( - overwrite_coordinate_transformations_non_raster, - ) + from spatialdata._io._utils import overwrite_coordinate_transformations_non_raster overwrite_coordinate_transformations_non_raster( group=element_group, axes=axes, transformations=transformations diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index 5e8eb832d..809c271b4 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -18,7 +18,10 @@ from dask.array import Array as DaskArray from dask.dataframe import DataFrame as DaskDataFrame from geopandas import GeoDataFrame +from upath import UPath +from upath.implementations.local import PosixUPath, WindowsUPath from xarray import DataArray, DataTree +from zarr.storage import FSStore from spatialdata._core.spatialdata import SpatialData from spatialdata._utils import get_pyramid_levels @@ -383,57 +386,10 @@ def save_transformations(sdata: SpatialData) -> None: sdata.write_transformations() -class BadFileHandleMethod(Enum): - ERROR = "error" - WARN = "warn" - - -@contextmanager -def handle_read_errors( - on_bad_files: Literal[BadFileHandleMethod.ERROR, BadFileHandleMethod.WARN], - location: str, - exc_types: tuple[type[Exception], ...], -) -> Generator[None, None, None]: - """ - Handle read errors according to parameter `on_bad_files`. - - Parameters - ---------- - on_bad_files - Specifies what to do upon encountering an exception. - Allowed values are : - - - 'error', let the exception be raised. - - 'warn', convert the exception into a warning if it is one of the expected exception types. - location - String identifying the function call where the exception happened - exc_types - A tuple of expected exception classes that should be converted into warnings. - - Raises - ------ - If `on_bad_files="error"`, all encountered exceptions are raised. - If `on_bad_files="warn"`, any encountered exceptions not matching the `exc_types` are raised. - """ - on_bad_files = BadFileHandleMethod(on_bad_files) # str to enum - if on_bad_files == BadFileHandleMethod.WARN: - try: - yield - except exc_types as e: - # Extract the original filename and line number from the exception and - # create a warning from it. - exc_traceback = sys.exc_info()[-1] - last_frame, lineno = list(traceback.walk_tb(exc_traceback))[-1] - filename = last_frame.f_code.co_filename - # Include the location (element path) in the warning message. 
- message = f"{location}: {e.__class__.__name__}: {e.args[0]}" - warnings.warn_explicit( - message=message, - category=UserWarning, - filename=filename, - lineno=lineno, - ) - # continue - else: # on_bad_files == BadFileHandleMethod.ERROR - # Let it raise exceptions - yield +def _open_zarr_store(path: str | UPath, **kwargs) -> zarr.storage.BaseStore: + if isinstance(path, str | Path): + path = UPath(path) + if isinstance(path, PosixUPath | WindowsUPath): + return zarr.storage.DirectoryStore(path.path) + else: + return FSStore(path.path, fs=path.fs, **kwargs) diff --git a/tests/io/test_remote_mock.py b/tests/io/test_remote_mock.py index 30b3a5feb..208ec5cfc 100644 --- a/tests/io/test_remote_mock.py +++ b/tests/io/test_remote_mock.py @@ -263,6 +263,9 @@ def test_local_sdata_remote_image(self, upath: UPath, images: SpatialData) -> No local_sdata = SpatialData.read(sdata_path) # noqa: F841 remote_path = upath / "full_sdata.zarr" # noqa: F841 + remote_sdata = SpatialData.read(remote_path) + assert_spatial_data_objects_are_identical(local_sdata, remote_sdata) + # TODO: read a single remote image from the S3 data and add it to the local SpatialData object # for a in remote_path.glob('**/*'): # print(a) From 7f358dc1bb4e495f2e0fa76ad9ae8a5e3e301336 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Fri, 31 Jan 2025 15:11:00 +0100 Subject: [PATCH 35/62] revert changing write function signature Co-authored-by: LucaMarconato --- src/spatialdata/_core/spatialdata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/spatialdata/_core/spatialdata.py b/src/spatialdata/_core/spatialdata.py index e36f98406..d700287e6 100644 --- a/src/spatialdata/_core/spatialdata.py +++ b/src/spatialdata/_core/spatialdata.py @@ -598,7 +598,7 @@ def path(self, value: Path | None) -> None: ) def _get_groups_for_element( - self, zarr_path: UPath, element_type: str, element_name: str + self, zarr_path: Path, element_type: str, element_name: str ) -> tuple[zarr.Group, zarr.Group, zarr.Group]: """ Get the Zarr groups for the root, element_type and element for a specific element. 
From 9bcdd342131afa875fa1a3064dd13971f5c91d16 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Fri, 31 Jan 2025 15:16:16 +0100 Subject: [PATCH 36/62] update _open_zarr_store with StoreLike --- src/spatialdata/_io/_utils.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index 809c271b4..82d39004c 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -13,7 +13,7 @@ from pathlib import Path from typing import Any, Literal -import zarr +import zarr.storage from anndata import AnnData from dask.array import Array as DaskArray from dask.dataframe import DataFrame as DaskDataFrame @@ -386,10 +386,20 @@ def save_transformations(sdata: SpatialData) -> None: sdata.write_transformations() -def _open_zarr_store(path: str | UPath, **kwargs) -> zarr.storage.BaseStore: +type StoreLike = str | Path | UPath | zarr.storage.StoreLike + + +def _open_zarr_store(path: StoreLike, **kwargs) -> zarr.storage.BaseStore: if isinstance(path, str | Path): + # if the input is str or Path, map it to UPath path = UPath(path) if isinstance(path, PosixUPath | WindowsUPath): + # if the input is a local path, use DirectoryStore return zarr.storage.DirectoryStore(path.path) - else: + if isinstance(path, zarr.storage.StoreLike): + # if the input already a store, wrap it in an FSStore + return FSStore(path, **kwargs) + if isinstance(path, UPath): + # if input is a remote UPath, map it to an FSStore return FSStore(path.path, fs=path.fs, **kwargs) + raise TypeError(f"Unsupported type: {type(path)}") From 822804391a490fb4b6d369212f6d21a7b38a69d6 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Fri, 31 Jan 2025 15:25:17 +0100 Subject: [PATCH 37/62] read image element from base store --- src/spatialdata/_io/io_raster.py | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/src/spatialdata/_io/io_raster.py b/src/spatialdata/_io/io_raster.py index 541be3ead..a7b06e26c 100644 --- a/src/spatialdata/_io/io_raster.py +++ b/src/spatialdata/_io/io_raster.py @@ -1,9 +1,9 @@ -from pathlib import Path from typing import Any, Literal import dask.array as da import numpy as np import zarr +import zarr.storage from ome_zarr.format import Format from ome_zarr.io import ZarrLocation from ome_zarr.reader import Label, Multiscales, Node, Reader @@ -15,16 +15,8 @@ from ome_zarr.writer import write_multiscale_labels as write_multiscale_labels_ngff from xarray import DataArray, Dataset, DataTree -from spatialdata._io._utils import ( - _get_transformations_from_ngff_dict, - overwrite_coordinate_transformations_raster, -) -from spatialdata._io.format import ( - CurrentRasterFormat, - RasterFormats, - RasterFormatV01, - _parse_version, -) +from spatialdata._io._utils import _get_transformations_from_ngff_dict, overwrite_coordinate_transformations_raster +from spatialdata._io.format import CurrentRasterFormat, RasterFormats, RasterFormatV01, _parse_version from spatialdata._utils import get_pyramid_levels from spatialdata.models._utils import get_channel_names from spatialdata.models.models import ATTRS_KEY @@ -36,16 +28,16 @@ ) -def _read_multiscale(store: str | Path, raster_type: Literal["image", "labels"]) -> DataArray | DataTree: - assert isinstance(store, str | Path) +def _read_multiscale(store: zarr.storage.BaseStore, raster_type: Literal["image", "labels"]) -> DataArray | DataTree: + assert isinstance(store, zarr.storage.BaseStore) assert raster_type in ["image", "labels"] - f = 
zarr.open(store, mode="r") - version = _parse_version(f, expect_attrs_key=True) + group = zarr.group(store=store) + version = _parse_version(group, expect_attrs_key=True) # old spatialdata datasets don't have format metadata for raster elements; this line ensure backwards compatibility, # interpreting the lack of such information as the presence of the format v01 format = RasterFormatV01() if version is None else RasterFormats[version] - f.store.close() + store.close() nodes: list[Node] = [] image_loc = ZarrLocation(store) From f25b6ceab9a419ef7e2ff1963105dfe8e1328d0a Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Sat, 1 Feb 2025 09:45:06 +0100 Subject: [PATCH 38/62] clean up remote mock tests, focus only on reading raster elements --- tests/io/test_remote_mock.py | 265 +++++++++++------------------------ 1 file changed, 83 insertions(+), 182 deletions(-) diff --git a/tests/io/test_remote_mock.py b/tests/io/test_remote_mock.py index 208ec5cfc..266c34973 100644 --- a/tests/io/test_remote_mock.py +++ b/tests/io/test_remote_mock.py @@ -8,47 +8,18 @@ import fsspec import pytest -from fsspec.implementations.local import LocalFileSystem, make_path_posix -from fsspec.registry import _registry, register_implementation -from fsspec.utils import stringify_path from upath import UPath from upath.implementations.cloud import S3Path from spatialdata import SpatialData from spatialdata.testing import assert_spatial_data_objects_are_identical - -## Mock setup from https://github.com/fsspec/universal_pathlib/blob/main/upath/tests/conftest.py -def posixify(path): - return str(path).replace("\\", "/") - - -class DummyTestFS(LocalFileSystem): - protocol = "mock" - root_marker = "/" - - @classmethod - def _strip_protocol(cls, path): - path = stringify_path(path) - if path.startswith("mock://"): - path = path[7:] - elif path.startswith("mock:"): - path = path[5:] - return make_path_posix(path).rstrip("/") or cls.root_marker - - -@pytest.fixture(scope="session") -def clear_registry(): - register_implementation("mock", DummyTestFS) - try: - yield - finally: - _registry.clear() +# This mock setup was inspired by https://github.com/fsspec/universal_pathlib/blob/main/upath/tests/conftest.py @pytest.fixture(scope="session") def s3_server(): - # writable local S3 system + # create a writable local S3 system via moto if "BOTO_CONFIG" not in os.environ: # pragma: no cover os.environ["BOTO_CONFIG"] = "/dev/null" if "AWS_ACCESS_KEY_ID" not in os.environ: # pragma: no cover @@ -94,184 +65,114 @@ def s3_server(): proc.wait() -@pytest.fixture(scope="function") -def s3_fixture(s3_server, full_sdata): - pytest.importorskip("s3fs") +def clear_s3(s3_server, location=None): + # clear an s3 bucket of all contents anon, s3so = s3_server - s3 = fsspec.filesystem("s3", anon=False, **s3so) - random_name = uuid.uuid4().hex - bucket_name = f"test_{random_name}" - if s3.exists(bucket_name): - for dir, _, keys in s3.walk(bucket_name): + s3 = fsspec.filesystem("s3", anon=anon, **s3so) + if location and s3.exists(location): + for d, _, keys in s3.walk(location): for key in keys: - s3.rm(f"{dir}/{key}") - else: - s3.mkdir(bucket_name) + s3.rm(f"{d}/{key}") + s3.invalidate_cache() + - # write a full spatialdata object to the bucket +def upload_to_upath(upath, sdata): + # write the object to disk via a regular path, then copy it to the UPath byte-by-byte + # useful for testing the read and write functionality separately with tempfile.TemporaryDirectory() as tempdir: - sdata_path = Path(tempdir) / "full_sdata.zarr" - 
full_sdata.write(sdata_path) + sdata_path = Path(tempdir) / "temp.zarr" + sdata.write(sdata_path) + # for every file in the sdata_path, copy it to the upath for x in sdata_path.glob("**/*"): - target_path = f"{bucket_name}/full_sdata.zarr/{posixify(x.relative_to(sdata_path))}" if x.is_file(): - s3.upload(str(x), target_path) + data = x.read_bytes() + destination = upath / x.relative_to(sdata_path) + destination.write_bytes(data) + +@pytest.fixture(scope="function") +def s3_fixture(s3_server): + # make a mock bucket available for testing + pytest.importorskip("s3fs") + anon, s3so = s3_server + s3 = fsspec.filesystem("s3", anon=anon, **s3so) + random_name = uuid.uuid4().hex + bucket_name = f"test_{random_name}" + clear_s3(s3_server, bucket_name) + s3.mkdir(bucket_name) + # here you could write existing test files to s3.upload if needed s3.invalidate_cache() yield f"s3://{bucket_name}", anon, s3so -# TODO: delete these 2 fixtures since they are not used -# @pytest.fixture(scope="session") -# def http_server(tmp_path_factory): -# http_tempdir = tmp_path_factory.mktemp("http") -# -# requests = pytest.importorskip("requests") -# pytest.importorskip("http.server") -# proc = subprocess.Popen(shlex.split(f"python -m http.server --directory {http_tempdir} 8080")) -# try: -# url = "http://127.0.0.1:8080/folder" -# path = Path(http_tempdir) / "folder" -# path.mkdir() -# timeout = 10 -# while True: -# try: -# r = requests.get(url, timeout=10) -# if r.ok: -# yield path, url -# break -# except requests.exceptions.RequestException as e: # noqa: E722 -# timeout -= 1 -# if timeout < 0: -# raise SystemError from e -# time.sleep(1) -# finally: -# proc.terminate() -# proc.wait() - - -# @pytest.fixture -# def http_fixture(local_testdir, http_server): -# http_path, http_url = http_server -# shutil.rmtree(http_path) -# shutil.copytree(local_testdir, http_path) -# yield http_url - - class TestRemoteMock: @pytest.fixture(scope="function") def upath(self, s3_fixture): + # create a UPath object for the mock s3 bucket path, anon, s3so = s3_fixture return UPath(path, anon=anon, **s3so) def test_is_S3Path(self, upath): assert isinstance(upath, S3Path) - # # Test UPath with Moto Mocking - def test_creating_file(self, upath: UPath) -> None: + def test_upload_sdata(self, upath, full_sdata): + tmpdir = upath / "tmp.zarr" + upload_to_upath(tmpdir, full_sdata) + assert tmpdir.exists() + assert len(list(tmpdir.glob("*"))) == 8 + + def test_creating_file(self, upath) -> None: file_name = "file1" p1 = upath / file_name p1.touch() contents = [p.name for p in upath.iterdir()] assert file_name in contents - # TODO: fix this test - @pytest.mark.xfail(reason="Fails because remote support for ImageElement not yet implemented") - def test_images(self, upath: UPath, images: SpatialData) -> None: - tmpdir = upath / "tmp.zarr" - images.write(tmpdir) - sdata = SpatialData.read(tmpdir) - assert_spatial_data_objects_are_identical(images, sdata) - - # TODO: fix this test - @pytest.mark.xfail(reason="Fails because remote support for LabelsElement not yet implemented") - def test_labels(self, upath: UPath, labels: SpatialData) -> None: - tmpdir = upath / "tmp.zarr" - labels.write(tmpdir) - sdata = SpatialData.read(tmpdir) - assert_spatial_data_objects_are_identical(labels, sdata) - - # TODO: fix this test - @pytest.mark.xfail(reason="Fails because remote support for ShapesElement not yet implemented") - def test_shapes(self, upath: UPath, shapes: SpatialData) -> None: - import numpy as np - - tmpdir = upath / "tmp.zarr" - - # check the index is 
correctly written and then read - shapes["circles"].index = np.arange(1, len(shapes["circles"]) + 1) - - shapes.write(tmpdir) - sdata = SpatialData.read(tmpdir) - assert_spatial_data_objects_are_identical(shapes, sdata) - - # TODO: fix this test - @pytest.mark.xfail(reason="Fails because remote support for PointsElement not yet implemented") - def test_points(self, upath: UPath, points: SpatialData) -> None: - import dask.dataframe as dd - import numpy as np - - tmpdir = upath / "tmp.zarr" - - # check the index is correctly written and then read - new_index = dd.from_array(np.arange(1, len(points["points_0"]) + 1)) - points["points_0"] = points["points_0"].set_index(new_index) - - points.write(tmpdir) - sdata = SpatialData.read(tmpdir) - assert_spatial_data_objects_are_identical(points, sdata) - - def _test_table(self, upath: UPath, table: SpatialData) -> None: - tmpdir = upath / "tmp.zarr" - table.write(tmpdir) - sdata = SpatialData.read(tmpdir) - assert_spatial_data_objects_are_identical(table, sdata) - - def _test_read_elem(self, upath: UPath, table: SpatialData) -> None: - pass - # TODO: fix - # tmpdir = upath / "tmp.zarr" - # store = zarr.open() - # table.write(tmpdir) - # # location of table - # - # sdata = SpatialData.read(tmpdir) - # assert_spatial_data_objects_are_identical(elem, sdata) - - # TODO: fix this test - @pytest.mark.xfail(reason="Fails because remote support for TableElement not yet implemented") - def test_single_table_single_annotation(self, upath: UPath, table_single_annotation: SpatialData) -> None: - self._test_table(upath, table_single_annotation) - - # TODO: fix this test - @pytest.mark.xfail(reason="Fails because remote support for TableElement not yet implemented") - def test_single_table_multiple_annotations(self, upath: UPath, table_multiple_annotations: SpatialData) -> None: - self._test_table(upath, table_multiple_annotations) - - # TODO: fix this test - @pytest.mark.xfail(reason="Fails because remote support for SpatialData not yet implemented") - def test_full_sdata(self, upath: UPath, full_sdata: SpatialData) -> None: - tmpdir = upath / "tmp.zarr" - full_sdata.write(tmpdir) - sdata = SpatialData.read(tmpdir) - assert_spatial_data_objects_are_identical(full_sdata, sdata) - - def test_local_sdata_remote_image(self, upath: UPath, images: SpatialData) -> None: + @pytest.mark.parametrize( + "sdata_type", + [ + "images", + "labels", + # TODO: fix remote reading support points + # "points", + # TODO: fix remote reading support shapes + # "shapes", + ], + ) + def test_reading_mocked_elements(self, upath: UPath, sdata_type: str, request) -> None: + sdata = request.getfixturevalue(sdata_type) with tempfile.TemporaryDirectory() as tmpdir: - sdata_path = Path(tmpdir) / "full_sdata.zarr" - images.write(sdata_path) - local_sdata = SpatialData.read(sdata_path) # noqa: F841 - remote_path = upath / "full_sdata.zarr" # noqa: F841 - + local_path = Path(tmpdir) / "tmp.zarr" + sdata.write(local_path) + local_sdata = SpatialData.read(local_path) + local_len = len(list(local_sdata.gen_elements())) + assert local_len > 0 + remote_path = upath / "tmp.zarr" + upload_to_upath(remote_path, sdata) remote_sdata = SpatialData.read(remote_path) + assert len(list(remote_sdata.gen_elements())) == local_len assert_spatial_data_objects_are_identical(local_sdata, remote_sdata) - # TODO: read a single remote image from the S3 data and add it to the local SpatialData object - # for a in remote_path.glob('**/*'): - # print(a) - # - # import zarr.storage - # from zarr.storage import FSStore - # 
store = zarr.storage.FSStore(fs=upath.fs, url=remote_path) - # list(store.keys()) - # store.close() + @pytest.mark.parametrize( + "sdata_type", + [ + # TODO: fix remote writing support images + # "images", + # TODO: fix remote writing support labels + # "labels", + # TODO: fix remote writing support points + # "points", + # TODO: fix remote writing support shapes + # "shapes", + ], + ) + def test_writing_mocked_elements(self, upath: UPath, sdata_type: str, request) -> None: + sdata = request.getfixturevalue(sdata_type) + n_elements = len(list(sdata.gen_elements())) + # test writing to a remote path + remote_path = upath / "tmp.zarr" + sdata.write(remote_path) + assert len(upath.glob("*")) == n_elements + # test reading the remotely written object + remote_sdata = SpatialData.read(remote_path) + assert_spatial_data_objects_are_identical(sdata, remote_sdata) From 7561dc15c747131485423dc50d81462992ab829d Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Sat, 1 Feb 2025 09:45:34 +0100 Subject: [PATCH 39/62] improve remote http test, add alternative --- tests/io/test_remote.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/io/test_remote.py b/tests/io/test_remote.py index e5ee41eff..dd47d464f 100644 --- a/tests/io/test_remote.py +++ b/tests/io/test_remote.py @@ -7,17 +7,19 @@ class TestRemote: # Test actual remote datasets from https://spatialdata.scverse.org/en/latest/tutorials/notebooks/datasets/README.html # These tests are disabled by default because they require internet access - @pytest.fixture(params=["merfish", "mibitof"]) + @pytest.fixture(params=["merfish", "mibitof", "mibitof_alt"]) def s3_address(self, request): urls = { "merfish": "https://s3.embl.de/spatialdata/spatialdata-sandbox/merfish.zarr/", "mibitof": "https://s3.embl.de/spatialdata/spatialdata-sandbox/mibitof.zarr/", + "mibitof_alt": "https://dl01.irc.ugent.be/spatial/mibitof/data.zarr/", } return urls[request.param] - # TODO: does not work, problem with opening remote parquet - @pytest.mark.xfail(reason="Problem with opening remote parquet") + # TODO: does not work, problem with opening version 0.6-dev + @pytest.mark.skip(reason="Problem with ome_zarr on test datasets: ValueError: Version 0.6-dev not recognized") def test_remote(self, s3_address): root = zarr.open_consolidated(s3_address, mode="r", metadata_key="zmetadata") - sdata = SpatialData.read(root) + # TODO: remove selection once support for points, shapes and tables is added + sdata = SpatialData.read(root, selection=("images", "labels")) assert len(list(sdata.gen_elements())) > 0 From 469f171633f734a7215ed5df400f6f9d41796f38 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Sat, 1 Feb 2025 09:46:07 +0100 Subject: [PATCH 40/62] add support for consolidated metadata store in util function, add _create_upath function --- src/spatialdata/_io/_utils.py | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index 82d39004c..51349f0c2 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -11,7 +11,7 @@ from enum import Enum from functools import singledispatch from pathlib import Path -from typing import Any, Literal +from typing import Any, TypeAlias import zarr.storage from anndata import AnnData @@ -386,16 +386,30 @@ def save_transformations(sdata: SpatialData) -> None: sdata.write_transformations() -type StoreLike = str | Path | UPath | zarr.storage.StoreLike +StoreLike: TypeAlias = str | Path 
| UPath | zarr.storage.StoreLike | zarr.Group def _open_zarr_store(path: StoreLike, **kwargs) -> zarr.storage.BaseStore: + # TODO: ensure kwargs like mode are enforced everywhere and passed correctly to the store if isinstance(path, str | Path): # if the input is str or Path, map it to UPath path = UPath(path) if isinstance(path, PosixUPath | WindowsUPath): # if the input is a local path, use DirectoryStore return zarr.storage.DirectoryStore(path.path) + if isinstance(path, zarr.Group): + # if the input is a zarr.Group, wrap it with a store + if isinstance(path.store, zarr.storage.DirectoryStore): + # create a simple FSStore if the store is a DirectoryStore with just the path + return FSStore(os.path.join(path.store.path, path.path), **kwargs) + if isinstance(path.store, FSStore): + # if the store within the zarr.Group is an FSStore, return it + # but extend the path of the store with that of the zarr.Group + return FSStore(path.store.path + "/" + path.path, fs=path.store.fs, **kwargs) + if isinstance(path.store, zarr.storage.ConsolidatedMetadataStore): + # if the store is a ConsolidatedMetadataStore, just return it + return path.store + raise ValueError(f"Unsupported store type or zarr.Group: {type(path.store)}") if isinstance(path, zarr.storage.StoreLike): # if the input already a store, wrap it in an FSStore return FSStore(path, **kwargs) @@ -403,3 +417,14 @@ def _open_zarr_store(path: StoreLike, **kwargs) -> zarr.storage.BaseStore: # if input is a remote UPath, map it to an FSStore return FSStore(path.path, fs=path.fs, **kwargs) raise TypeError(f"Unsupported type: {type(path)}") + + +def _create_upath(path: StoreLike) -> UPath: + # try to create a UPath from the input + if isinstance(path, str | Path): + return Path(path) + if hasattr(path, "store") and isinstance(path.store, zarr.storage.ConsolidatedMetadataStore): + # create a url from the ConsolidatedMetadataStore and append it with the path from the Group StoreLike object + return UPath(path.store.store.path) / path.path + # best effort to create a UPath + return UPath(path) From ce76527a51083fd94c669cc81fbb36fb91899999 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Sat, 1 Feb 2025 09:56:13 +0100 Subject: [PATCH 41/62] allow for groups as store input --- src/spatialdata/_io/io_points.py | 4 +--- src/spatialdata/_io/io_raster.py | 15 +++++++++++---- src/spatialdata/_io/io_shapes.py | 16 +++------------- 3 files changed, 15 insertions(+), 20 deletions(-) diff --git a/src/spatialdata/_io/io_points.py b/src/spatialdata/_io/io_points.py index 3106c8470..2c3193a41 100644 --- a/src/spatialdata/_io/io_points.py +++ b/src/spatialdata/_io/io_points.py @@ -24,9 +24,7 @@ def _read_points( store: str | Path | MutableMapping | zarr.Group, # type: ignore[type-arg] ) -> DaskDataFrame: """Read points from a zarr store.""" - assert isinstance(store, str | Path) - f = zarr.open(store, mode="r") - + f = zarr.open(store, mode="r") if isinstance(store, str | Path | MutableMapping) else store version = _parse_version(f, expect_attrs_key=True) assert version is not None format = PointsFormats[version] diff --git a/src/spatialdata/_io/io_raster.py b/src/spatialdata/_io/io_raster.py index a7b06e26c..345619d37 100644 --- a/src/spatialdata/_io/io_raster.py +++ b/src/spatialdata/_io/io_raster.py @@ -13,9 +13,14 @@ from ome_zarr.writer import write_labels as write_labels_ngff from ome_zarr.writer import write_multiscale as write_multiscale_ngff from ome_zarr.writer import write_multiscale_labels as write_multiscale_labels_ngff +from upath import UPath 
from xarray import DataArray, Dataset, DataTree -from spatialdata._io._utils import _get_transformations_from_ngff_dict, overwrite_coordinate_transformations_raster +from spatialdata._io._utils import ( + _get_transformations_from_ngff_dict, + _open_zarr_store, + overwrite_coordinate_transformations_raster, +) from spatialdata._io.format import CurrentRasterFormat, RasterFormats, RasterFormatV01, _parse_version from spatialdata._utils import get_pyramid_levels from spatialdata.models._utils import get_channel_names @@ -28,10 +33,12 @@ ) -def _read_multiscale(store: zarr.storage.BaseStore, raster_type: Literal["image", "labels"]) -> DataArray | DataTree: - assert isinstance(store, zarr.storage.BaseStore) +def _read_multiscale( + store: str | UPath | zarr.storage.BaseStore, raster_type: Literal["image", "labels"] +) -> DataArray | DataTree: assert raster_type in ["image", "labels"] - + if isinstance(store, str | UPath): + store = _open_zarr_store(store) group = zarr.group(store=store) version = _parse_version(group, expect_attrs_key=True) # old spatialdata datasets don't have format metadata for raster elements; this line ensure backwards compatibility, diff --git a/src/spatialdata/_io/io_shapes.py b/src/spatialdata/_io/io_shapes.py index c32ce1f34..a7d52a51c 100644 --- a/src/spatialdata/_io/io_shapes.py +++ b/src/spatialdata/_io/io_shapes.py @@ -12,26 +12,16 @@ _write_metadata, overwrite_coordinate_transformations_non_raster, ) -from spatialdata._io.format import ( - CurrentShapesFormat, - ShapesFormats, - ShapesFormatV01, - ShapesFormatV02, - _parse_version, -) +from spatialdata._io.format import CurrentShapesFormat, ShapesFormats, ShapesFormatV01, ShapesFormatV02, _parse_version from spatialdata.models import ShapesModel, get_axes_names -from spatialdata.transformations._utils import ( - _get_transformations, - _set_transformations, -) +from spatialdata.transformations._utils import _get_transformations, _set_transformations def _read_shapes( store: str | Path | MutableMapping | zarr.Group, # type: ignore[type-arg] ) -> GeoDataFrame: """Read shapes from a zarr store.""" - assert isinstance(store, str | Path) - f = zarr.open(store, mode="r") + f = zarr.open(store, mode="r") if isinstance(store, str | Path | MutableMapping) else store version = _parse_version(f, expect_attrs_key=True) assert version is not None format = ShapesFormats[version] From df2263924b9ab722597756ebf26f7d460e2b7a78 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Sat, 1 Feb 2025 10:10:50 +0100 Subject: [PATCH 42/62] handle consolidated metadata with upath --- src/spatialdata/_io/_utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index 51349f0c2..e17d0198b 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -407,8 +407,12 @@ def _open_zarr_store(path: StoreLike, **kwargs) -> zarr.storage.BaseStore: # but extend the path of the store with that of the zarr.Group return FSStore(path.store.path + "/" + path.path, fs=path.store.fs, **kwargs) if isinstance(path.store, zarr.storage.ConsolidatedMetadataStore): + # TODO: find a way to check if the consolidated metadata is still used. Probably best to wait for Zarr v3. 
# if the store is a ConsolidatedMetadataStore, just return it - return path.store + # get the FSStore url path from store and append it with the path from the Group StoreLike object + url = UPath(path.store.store.path + path.path) + # same as UPath option + return FSStore(url.path, fs=url.fs, **kwargs) raise ValueError(f"Unsupported store type or zarr.Group: {type(path.store)}") if isinstance(path, zarr.storage.StoreLike): # if the input already a store, wrap it in an FSStore From dadc4899a35bde2d813e81a125c9422746487007 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Sat, 1 Feb 2025 10:11:18 +0100 Subject: [PATCH 43/62] split remote reading tests between http and http with consolidated metadata --- tests/io/test_remote.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/io/test_remote.py b/tests/io/test_remote.py index dd47d464f..4611a5587 100644 --- a/tests/io/test_remote.py +++ b/tests/io/test_remote.py @@ -6,7 +6,7 @@ class TestRemote: # Test actual remote datasets from https://spatialdata.scverse.org/en/latest/tutorials/notebooks/datasets/README.html - # These tests are disabled by default because they require internet access + # TODO: mark these tests so they are disabled by default because they require internet access @pytest.fixture(params=["merfish", "mibitof", "mibitof_alt"]) def s3_address(self, request): urls = { @@ -16,9 +16,16 @@ def s3_address(self, request): } return urls[request.param] - # TODO: does not work, problem with opening version 0.6-dev + # TODO: does not work for EMBL datasets, problem with opening version 0.6-dev @pytest.mark.skip(reason="Problem with ome_zarr on test datasets: ValueError: Version 0.6-dev not recognized") def test_remote(self, s3_address): + # TODO: remove selection once support for points, shapes and tables is added + sdata = SpatialData.read(s3_address, selection=("images", "labels")) + assert len(list(sdata.gen_elements())) > 0 + + @pytest.mark.skip(reason="Problem with ome_zarr on test datasets: ValueError: Version 0.6-dev not recognized") + def test_remote_consolidated(self, s3_address): + # TODO: find a way to check if the consolidated metadata is actually used. Probably best to wait for Zarr v3. 
root = zarr.open_consolidated(s3_address, mode="r", metadata_key="zmetadata") # TODO: remove selection once support for points, shapes and tables is added sdata = SpatialData.read(root, selection=("images", "labels")) From cfb03d5ea178cb019bc7f80940ba84255a8cb9e5 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Sat, 1 Feb 2025 10:14:13 +0100 Subject: [PATCH 44/62] remove f_store_path, support remote raster types fully and keep local support for other elements --- src/spatialdata/_io/io_zarr.py | 208 +++++++++++++++------------------ 1 file changed, 92 insertions(+), 116 deletions(-) diff --git a/src/spatialdata/_io/io_zarr.py b/src/spatialdata/_io/io_zarr.py index 224ef1129..0266fd9e2 100644 --- a/src/spatialdata/_io/io_zarr.py +++ b/src/spatialdata/_io/io_zarr.py @@ -1,17 +1,15 @@ import logging -import os import warnings -from json import JSONDecodeError -from pathlib import Path -from typing import Literal import zarr +import zarr.storage from anndata import AnnData -from pyarrow import ArrowInvalid -from zarr.errors import ArrayNotFoundError, MetadataError +from dask.dataframe import DataFrame as DaskDataFrame +from geopandas import GeoDataFrame +from xarray import DataArray, DataTree from spatialdata._core.spatialdata import SpatialData -from spatialdata._io._utils import BadFileHandleMethod, handle_read_errors, ome_zarr_logger +from spatialdata._io._utils import StoreLike, _create_upath, _open_zarr_store, ome_zarr_logger from spatialdata._io.io_points import _read_points from spatialdata._io.io_raster import _read_multiscale from spatialdata._io.io_shapes import _read_shapes @@ -19,32 +17,51 @@ from spatialdata._logging import logger -def _open_zarr_store(store: str | Path | zarr.Group) -> tuple[zarr.Group, str]: +def read_image_element(path: StoreLike) -> DataArray | DataTree: + """Read a single image element from a store location. + + Parameters + ---------- + path + Path to the zarr store. + + Returns + ------- + A DataArray or DataTree object. """ - Open a zarr store (on-disk or remote) and return the zarr.Group object and the path to the store. + store = _open_zarr_store(path) + return _read_multiscale(store, raster_type="image") + + +def read_labels_element(path: StoreLike) -> DataArray | DataTree: + """Read a single image element from a store location. Parameters ---------- - store - Path to the zarr store (on-disk or remote) or a zarr.Group object. + path + Path to the zarr store. Returns ------- - A tuple of the zarr.Group object and the path to the store. + A DataArray or DataTree object. 
""" - f = store if isinstance(store, zarr.Group) else zarr.open(store, mode="r") - # workaround: .zmetadata is being written as zmetadata (https://github.com/zarr-developers/zarr-python/issues/1121) - if isinstance(store, str | Path) and str(store).startswith("http") and len(f) == 0: - f = zarr.open_consolidated(store, mode="r", metadata_key="zmetadata") - f_store_path = f.store.store.path if isinstance(f.store, zarr.storage.ConsolidatedMetadataStore) else f.store.path - return f, f_store_path - - -def read_zarr( - store: str | Path | zarr.Group, - selection: None | tuple[str] = None, - on_bad_files: Literal[BadFileHandleMethod.ERROR, BadFileHandleMethod.WARN] = BadFileHandleMethod.ERROR, -) -> SpatialData: + store = _open_zarr_store(path) + return _read_multiscale(store, raster_type="labels") + + +def read_points_element() -> DaskDataFrame: + pass + + +def read_shapes_element() -> GeoDataFrame: + pass + + +def read_table_element() -> AnnData: + pass + + +def read_zarr(store_like: StoreLike, selection: None | tuple[str] = None) -> SpatialData: """ Read a SpatialData dataset from a zarr store (on-disk or remote). @@ -71,7 +88,10 @@ def read_zarr( ------- A SpatialData object. """ - f, f_store_path = _open_zarr_store(store) + store = _open_zarr_store(store_like) + f = zarr.group(store) + # TODO: remove table once deprecated. + selector = {"images", "labels", "points", "shapes", "tables", "table"} if not selection else set(selection or []) images = {} labels = {} @@ -79,39 +99,34 @@ def read_zarr( tables: dict[str, AnnData] = {} shapes = {} - # TODO: remove table once deprecated. - selector = {"images", "labels", "points", "shapes", "tables", "table"} if not selection else set(selection or []) logger.debug(f"Reading selection {selector}") # read multiscale images if "images" in selector and "images" in f: - with handle_read_errors( - on_bad_files, - location="images", - exc_types=(JSONDecodeError, MetadataError), - ): - group = f["images"] + group = f.images + count = 0 + for subgroup_name in group: + if subgroup_name.startswith("."): + # skip hidden files like .zgroup or .zmetadata + continue + f_elem = group[subgroup_name] + element = read_image_element(f_elem) + images[subgroup_name] = element + count += 1 + logger.debug(f"Found {count} elements in {group}") + + # read multiscale labels + with ome_zarr_logger(logging.ERROR): + if "labels" in selector and "labels" in f: + group = f.labels count = 0 for subgroup_name in group: - if Path(subgroup_name).name.startswith("."): + if subgroup_name.startswith("."): # skip hidden files like .zgroup or .zmetadata continue f_elem = group[subgroup_name] - f_elem_store = os.path.join(f_store_path, f_elem.path) - with handle_read_errors( - on_bad_files, - location=f"{group.path}/{subgroup_name}", - exc_types=( - JSONDecodeError, # JSON parse error - ValueError, # ome_zarr: Unable to read the NGFF file - KeyError, # Missing JSON key - ArrayNotFoundError, # Image chunks missing - TypeError, # instead of ArrayNotFoundError, with dask>=2024.10.0 zarr<=2.18.3 - ), - ): - element = _read_multiscale(f_elem_store, raster_type="image") - images[subgroup_name] = element - count += 1 + labels[subgroup_name] = read_labels_element(f_elem) + count += 1 logger.debug(f"Found {count} elements in {group}") # read multiscale labels @@ -141,81 +156,41 @@ def read_zarr( # now read rest of the data if "points" in selector and "points" in f: - with handle_read_errors( - on_bad_files, - location="points", - exc_types=(JSONDecodeError, MetadataError), - ): - group = f["points"] 
- count = 0 - for subgroup_name in group: - f_elem = group[subgroup_name] - if Path(subgroup_name).name.startswith("."): - # skip hidden files like .zgroup or .zmetadata - continue - f_elem_store = os.path.join(f_store_path, f_elem.path) - with handle_read_errors( - on_bad_files, - location=f"{group.path}/{subgroup_name}", - exc_types=(JSONDecodeError, KeyError, ArrowInvalid), - ): - points[subgroup_name] = _read_points(f_elem_store) - count += 1 - logger.debug(f"Found {count} elements in {group}") + group = f["points"] + count = 0 + for subgroup_name in group: + if subgroup_name.startswith("."): + # skip hidden files like .zgroup or .zmetadata + continue + f_elem = group[subgroup_name] + points[subgroup_name] = _read_points(f_elem) + count += 1 + logger.debug(f"Found {count} elements in {group}") if "shapes" in selector and "shapes" in f: - with handle_read_errors( - on_bad_files, - location="shapes", - exc_types=(JSONDecodeError, MetadataError), - ): - group = f["shapes"] - count = 0 - for subgroup_name in group: - if Path(subgroup_name).name.startswith("."): - # skip hidden files like .zgroup or .zmetadata - continue - f_elem = group[subgroup_name] - f_elem_store = os.path.join(f_store_path, f_elem.path) - with handle_read_errors( - on_bad_files, - location=f"{group.path}/{subgroup_name}", - exc_types=( - JSONDecodeError, - ValueError, - KeyError, - ArrayNotFoundError, - ), - ): - shapes[subgroup_name] = _read_shapes(f_elem_store) - count += 1 - logger.debug(f"Found {count} elements in {group}") + group = f["shapes"] + count = 0 + for subgroup_name in group: + if subgroup_name.startswith("."): + # skip hidden files like .zgroup or .zmetadata + continue + f_elem = group[subgroup_name] + shapes[subgroup_name] = _read_shapes(f_elem) + count += 1 + logger.debug(f"Found {count} elements in {group}") if "tables" in selector and "tables" in f: - with handle_read_errors( - on_bad_files, - location="tables", - exc_types=(JSONDecodeError, MetadataError), - ): - group = f["tables"] - tables = _read_table(f_store_path, f, group, tables, on_bad_files=on_bad_files) + group = f["tables"] + tables = _read_table(zarr_store_path=str(store_like), group=f, subgroup=group, tables=tables) if "table" in selector and "table" in f: warnings.warn( - f"Table group found in zarr store at location {f_store_path}. Please update the zarr store to use tables " - f"instead.", + f"Table group found in zarr store at location {f}. Please update the zarr store to use tables instead.", DeprecationWarning, stacklevel=2, ) subgroup_name = "table" - with handle_read_errors( - on_bad_files, - location=subgroup_name, - exc_types=(JSONDecodeError, MetadataError), - ): - group = f[subgroup_name] - tables = _read_table(f_store_path, f, group, tables, on_bad_files=on_bad_files) - - logger.debug(f"Found {count} elements in {group}") + group = f[subgroup_name] + tables = _read_table(zarr_store_path=store_like, group=f, subgroup=group, tables=tables) # read attrs metadata attrs = f.attrs.asdict() @@ -234,5 +209,6 @@ def read_zarr( tables=tables, attrs=attrs, ) - sdata.path = Path(store) + # TODO: create a UPath object from any StoreLike object + sdata.path = _create_upath(store_like) return sdata From 813ff9090f5418d1989e1a24d30a0e376b6d4b11 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Sat, 1 Feb 2025 10:35:36 +0100 Subject: [PATCH 45/62] Fix metadata_key bug now that store is not always FSStore. Add extra comments. 
--- src/spatialdata/_core/spatialdata.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/spatialdata/_core/spatialdata.py b/src/spatialdata/_core/spatialdata.py index d700287e6..4ae8bc610 100644 --- a/src/spatialdata/_core/spatialdata.py +++ b/src/spatialdata/_core/spatialdata.py @@ -1438,14 +1438,21 @@ def write_consolidated_metadata(self) -> None: from spatialdata._io._utils import _open_zarr_store store = _open_zarr_store(self.path) - # consolidate metadata to more easily support remote reading bug in zarr. In reality, 'zmetadata' is written - # instead of '.zmetadata' see discussion https://github.com/zarr-developers/zarr-python/issues/1121 - zarr.consolidate_metadata(store, metadata_key=".zmetadata") + # Note that the store can be local (which does not have the zmetadata bug) + # or a remote FSStore (which has the bug). + # Consolidate metadata to more easily support remote reading bug in zarr. + # We write 'zmetadata' instead of the standard '.zmetadata' to avoid the FSStore bug. + # See discussion https://github.com/zarr-developers/zarr-python/issues/1121 + zarr.consolidate_metadata(store, metadata_key="zmetadata") store.close() def has_consolidated_metadata(self) -> bool: + from spatialdata._io._utils import _open_zarr_store + return_value = False - store = parse_url(self.path, mode="r").store + store = _open_zarr_store(self.path) + # Note that the store can be local (which does not have the zmetadata bug) + # or a remote FSStore (which has the bug). if "zmetadata" in store: return_value = True store.close() From 94eb25f4a4c058a8bc3f908a6e3869d35a430ed3 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Sat, 1 Feb 2025 10:37:41 +0100 Subject: [PATCH 46/62] add mypy fixes --- src/spatialdata/_io/_utils.py | 2 +- src/spatialdata/_io/io_zarr.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index e17d0198b..9a70cb2fc 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -389,7 +389,7 @@ def save_transformations(sdata: SpatialData) -> None: StoreLike: TypeAlias = str | Path | UPath | zarr.storage.StoreLike | zarr.Group -def _open_zarr_store(path: StoreLike, **kwargs) -> zarr.storage.BaseStore: +def _open_zarr_store(path: StoreLike, **kwargs: Any) -> zarr.storage.BaseStore: # TODO: ensure kwargs like mode are enforced everywhere and passed correctly to the store if isinstance(path, str | Path): # if the input is str or Path, map it to UPath diff --git a/src/spatialdata/_io/io_zarr.py b/src/spatialdata/_io/io_zarr.py index 0266fd9e2..3cf604d25 100644 --- a/src/spatialdata/_io/io_zarr.py +++ b/src/spatialdata/_io/io_zarr.py @@ -190,7 +190,7 @@ def read_zarr(store_like: StoreLike, selection: None | tuple[str] = None) -> Spa ) subgroup_name = "table" group = f[subgroup_name] - tables = _read_table(zarr_store_path=store_like, group=f, subgroup=group, tables=tables) + tables = _read_table(zarr_store_path=str(store_like), group=f, subgroup=group, tables=tables) # read attrs metadata attrs = f.attrs.asdict() From 7ea6da222d48bfeaca8b061ce8b89a405a94dc46 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 17 Mar 2025 10:42:40 +0000 Subject: [PATCH 47/62] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/spatialdata/_io/io_zarr.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git 
a/src/spatialdata/_io/io_zarr.py b/src/spatialdata/_io/io_zarr.py index 3cf604d25..cb57d4b11 100644 --- a/src/spatialdata/_io/io_zarr.py +++ b/src/spatialdata/_io/io_zarr.py @@ -1,5 +1,7 @@ import logging import warnings +from json import JSONDecodeError +from typing import TYPE_CHECKING, Literal import zarr import zarr.storage @@ -9,13 +11,27 @@ from xarray import DataArray, DataTree from spatialdata._core.spatialdata import SpatialData -from spatialdata._io._utils import StoreLike, _create_upath, _open_zarr_store, ome_zarr_logger -from spatialdata._io.io_points import _read_points +from spatialdata._io._utils import ( + BadFileHandleMethod, + StoreLike, + _create_upath, + _open_zarr_store, + handle_read_errors, + ome_zarr_logger, +) from spatialdata._io.io_raster import _read_multiscale -from spatialdata._io.io_shapes import _read_shapes -from spatialdata._io.io_table import _read_table from spatialdata._logging import logger +if TYPE_CHECKING: + from dask.dataframe import DataFrame as DaskDataFrame + from geopandas import GeoDataFrame + from xarray import DataArray, DataTree + + +def is_hidden_zarr_entry(name: str) -> bool: + """Skip hidden files like .zgroup or .zmetadata""" + return name.rpartition("/")[2].startswith(".") + def read_image_element(path: StoreLike) -> DataArray | DataTree: """Read a single image element from a store location. @@ -184,7 +200,7 @@ def read_zarr(store_like: StoreLike, selection: None | tuple[str] = None) -> Spa if "table" in selector and "table" in f: warnings.warn( - f"Table group found in zarr store at location {f}. Please update the zarr store to use tables instead.", + f"Table group found in zarr store at location {store}. Please update the zarr store to use tables instead.", DeprecationWarning, stacklevel=2, ) From fc532a32a980c43291c0c4ee103295cdd1679065 Mon Sep 17 00:00:00 2001 From: Andreas Poehlmann Date: Mon, 17 Mar 2025 11:47:07 +0100 Subject: [PATCH 48/62] Fix linting errors --- src/spatialdata/_io/io_zarr.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/spatialdata/_io/io_zarr.py b/src/spatialdata/_io/io_zarr.py index cb57d4b11..7317edf0f 100644 --- a/src/spatialdata/_io/io_zarr.py +++ b/src/spatialdata/_io/io_zarr.py @@ -29,7 +29,7 @@ def is_hidden_zarr_entry(name: str) -> bool: - """Skip hidden files like .zgroup or .zmetadata""" + """Skip hidden files like '.zgroup' or '.zmetadata'.""" return name.rpartition("/")[2].startswith(".") @@ -65,16 +65,22 @@ def read_labels_element(path: StoreLike) -> DataArray | DataTree: return _read_multiscale(store, raster_type="labels") -def read_points_element() -> DaskDataFrame: - pass +def read_points_element(path: StoreLike) -> DaskDataFrame: + raise NotImplementedError -def read_shapes_element() -> GeoDataFrame: - pass +def read_shapes_element(path: StoreLike) -> GeoDataFrame: + raise NotImplementedError -def read_table_element() -> AnnData: - pass +def read_tables_element( + zarr_store_path: StoreLike, + group: zarr.Group, + subgroup: zarr.Group, + tables: dict[str, AnnData], + on_bad_files: Literal[BadFileHandleMethod.ERROR, BadFileHandleMethod.WARN] = BadFileHandleMethod.ERROR, +) -> dict[str, AnnData]: + raise NotImplementedError def read_zarr(store_like: StoreLike, selection: None | tuple[str] = None) -> SpatialData: From 46319faad65941fd777134d526211e6b2780626c Mon Sep 17 00:00:00 2001 From: Andreas Poehlmann Date: Mon, 17 Mar 2025 14:21:36 +0100 Subject: [PATCH 49/62] fixed majority of tests --- pyproject.toml | 2 +- src/spatialdata/_io/_utils.py | 2 
++ src/spatialdata/_io/io_table.py | 10 ++++------ src/spatialdata/_io/io_zarr.py | 20 +++++++++++++++++--- 4 files changed, 24 insertions(+), 10 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index ab9f52bdb..6008dfbe9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,7 @@ dependencies = [ "numba>=0.55.0", "numpy", "ome_zarr>=0.8.4", - "universal_pathlib>=0.2.0", + "universal_pathlib>=0.2.6", "pandas", "pooch", "pyarrow", diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index 9a70cb2fc..21c2bc027 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -430,5 +430,7 @@ def _create_upath(path: StoreLike) -> UPath: if hasattr(path, "store") and isinstance(path.store, zarr.storage.ConsolidatedMetadataStore): # create a url from the ConsolidatedMetadataStore and append it with the path from the Group StoreLike object return UPath(path.store.store.path) / path.path + if isinstance(path, zarr.storage.BaseStore): + return UPath(path.path) # best effort to create a UPath return UPath(path) diff --git a/src/spatialdata/_io/io_table.py b/src/spatialdata/_io/io_table.py index 92ff64b94..904eeba08 100644 --- a/src/spatialdata/_io/io_table.py +++ b/src/spatialdata/_io/io_table.py @@ -1,4 +1,5 @@ -import os +from __future__ import annotations + from json import JSONDecodeError from typing import Literal @@ -46,22 +47,19 @@ def _read_table( count = 0 for table_name in subgroup: f_elem = subgroup[table_name] - f_elem_store = os.path.join(zarr_store_path, f_elem.path) with handle_read_errors( on_bad_files=on_bad_files, location=f"{subgroup.path}/{table_name}", exc_types=(JSONDecodeError, KeyError, ValueError, ArrayNotFoundError), ): - tables[table_name] = read_anndata_zarr(f_elem_store) + tables[table_name] = read_anndata_zarr(f_elem) - f = zarr.open(f_elem_store, mode="r") - version = _parse_version(f, expect_attrs_key=False) + version = _parse_version(f_elem, expect_attrs_key=False) assert version is not None # since have just one table format, we currently read it but do not use it; if we ever change the format # we can rename the two _ to format and implement the per-format read logic (as we do for shapes) _ = TablesFormats[version] - f.store.close() # # replace with format from above # version = "0.1" diff --git a/src/spatialdata/_io/io_zarr.py b/src/spatialdata/_io/io_zarr.py index 7317edf0f..4ee57c599 100644 --- a/src/spatialdata/_io/io_zarr.py +++ b/src/spatialdata/_io/io_zarr.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import logging import warnings from json import JSONDecodeError @@ -19,7 +21,10 @@ handle_read_errors, ome_zarr_logger, ) +from spatialdata._io.io_points import _read_points from spatialdata._io.io_raster import _read_multiscale +from spatialdata._io.io_shapes import _read_shapes +from spatialdata._io.io_table import _read_table from spatialdata._logging import logger if TYPE_CHECKING: @@ -66,11 +71,13 @@ def read_labels_element(path: StoreLike) -> DataArray | DataTree: def read_points_element(path: StoreLike) -> DaskDataFrame: - raise NotImplementedError + store = _open_zarr_store(path) + return _read_points(store) def read_shapes_element(path: StoreLike) -> GeoDataFrame: - raise NotImplementedError + store = _open_zarr_store(path) + return _read_shapes(store) def read_tables_element( @@ -80,7 +87,14 @@ def read_tables_element( tables: dict[str, AnnData], on_bad_files: Literal[BadFileHandleMethod.ERROR, BadFileHandleMethod.WARN] = BadFileHandleMethod.ERROR, ) -> dict[str, AnnData]: - raise 
NotImplementedError + store = _open_zarr_store(zarr_store_path) + return _read_table( + store, + group, + subgroup, + tables, + on_bad_files, + ) def read_zarr(store_like: StoreLike, selection: None | tuple[str] = None) -> SpatialData: From 075877c3e197063e95731302df238c988541c086 Mon Sep 17 00:00:00 2001 From: Andreas Poehlmann Date: Mon, 17 Mar 2025 15:29:21 +0100 Subject: [PATCH 50/62] spatialdata._io._utils: _open_zarr_store has to set dimension_separator to emulate previous ome_zarr.io.parse_url --- src/spatialdata/_io/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index 21c2bc027..7d1eca242 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -396,7 +396,7 @@ def _open_zarr_store(path: StoreLike, **kwargs: Any) -> zarr.storage.BaseStore: path = UPath(path) if isinstance(path, PosixUPath | WindowsUPath): # if the input is a local path, use DirectoryStore - return zarr.storage.DirectoryStore(path.path) + return zarr.storage.DirectoryStore(path.path, dimension_separator="/") if isinstance(path, zarr.Group): # if the input is a zarr.Group, wrap it with a store if isinstance(path.store, zarr.storage.DirectoryStore): From 28837b1cfbbd656f0c978e93c005f09147f7cc23 Mon Sep 17 00:00:00 2001 From: Andreas Poehlmann Date: Mon, 17 Mar 2025 16:07:58 +0100 Subject: [PATCH 51/62] stay in sync with ome zarr format --- src/spatialdata/_io/io_zarr.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/spatialdata/_io/io_zarr.py b/src/spatialdata/_io/io_zarr.py index 4ee57c599..204b959f9 100644 --- a/src/spatialdata/_io/io_zarr.py +++ b/src/spatialdata/_io/io_zarr.py @@ -50,7 +50,9 @@ def read_image_element(path: StoreLike) -> DataArray | DataTree: ------- A DataArray or DataTree object. 
""" - store = _open_zarr_store(path) + # stay in sync with ome v4 format spec: + # https://github.com/ome/ome-zarr-py/blob/7d1ae35c97/ome_zarr/format.py#L189-L192 + store = _open_zarr_store(path, dimension_separator="/", normalize_keys=False) return _read_multiscale(store, raster_type="image") From 1df5be617db7395dc129dcd01204e6e2b2574b98 Mon Sep 17 00:00:00 2001 From: Andreas Poehlmann Date: Mon, 17 Mar 2025 17:02:24 +0100 Subject: [PATCH 52/62] spatialdata._io.io_raster: support remote stores --- src/spatialdata/_io/io_raster.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/spatialdata/_io/io_raster.py b/src/spatialdata/_io/io_raster.py index 345619d37..824b17ff0 100644 --- a/src/spatialdata/_io/io_raster.py +++ b/src/spatialdata/_io/io_raster.py @@ -18,7 +18,6 @@ from spatialdata._io._utils import ( _get_transformations_from_ngff_dict, - _open_zarr_store, overwrite_coordinate_transformations_raster, ) from spatialdata._io.format import CurrentRasterFormat, RasterFormats, RasterFormatV01, _parse_version @@ -33,12 +32,10 @@ ) -def _read_multiscale( - store: str | UPath | zarr.storage.BaseStore, raster_type: Literal["image", "labels"] -) -> DataArray | DataTree: +def _read_multiscale(store: zarr.storage.BaseStore, raster_type: Literal["image", "labels"]) -> DataArray | DataTree: assert raster_type in ["image", "labels"] if isinstance(store, str | UPath): - store = _open_zarr_store(store) + raise NotImplementedError("removed in this PR") group = zarr.group(store=store) version = _parse_version(group, expect_attrs_key=True) # old spatialdata datasets don't have format metadata for raster elements; this line ensure backwards compatibility, @@ -47,7 +44,7 @@ def _read_multiscale( store.close() nodes: list[Node] = [] - image_loc = ZarrLocation(store) + image_loc = ZarrLocation(store, fmt=format) if image_loc.exists(): image_reader = Reader(image_loc)() image_nodes = list(image_reader) From 3438bdab2b633297e1504a3d5f916b00b1a80c4d Mon Sep 17 00:00:00 2001 From: Andreas Poehlmann Date: Mon, 17 Mar 2025 17:22:31 +0100 Subject: [PATCH 53/62] prevent crashing tests on 3.10 --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6008dfbe9..16fc6ff11 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,8 @@ dependencies = [ "rich", "setuptools", "shapely>=2.0.1", - "spatial_image>=1.2.3", + "spatial_image>=1.2.1", + "xarray-dataclasses>=1.9.1", "scikit-image", "scipy", "typing_extensions>=4.8.0", From 6f64cedd3e6d47f02efe6ff9109b004b595b84a3 Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Wed, 20 Aug 2025 09:54:56 +0200 Subject: [PATCH 54/62] correct pyproject.toml --- pyproject.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index cf5f51a3a..77e6edecb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,7 @@ maintainers = [ urls.Documentation = "https://spatialdata.scverse.org/en/latest" urls.Source = "https://github.com/scverse/spatialdata.git" urls.Home-page = "https://github.com/scverse/spatialdata.git" -requires-python = ">=3.10, <3.13" # include 3.13 once multiscale-spatial-image conflicts are resolved +requires-python = ">=3.10" dynamic= [ "version" # allow version to be set by git tags ] @@ -26,6 +26,7 @@ dependencies = [ "click", "dask-image", "dask>=2024.4.1,<=2024.11.2", + "datashader", "fsspec[s3,http]", "geopandas>=0.14", "multiscale_spatial_image>=2.0.3", @@ -40,8 +41,7 @@ dependencies = [ "rich", 
"setuptools", "shapely>=2.0.1", - "spatial_image>=1.2.1", - "xarray-dataclasses>=1.9.1", + "spatial_image>=1.2.3", "scikit-image", "scipy", "typing_extensions>=4.8.0", From 008c0e42f732fd8c777d3f8541415872a9634d87 Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Wed, 20 Aug 2025 10:07:56 +0200 Subject: [PATCH 55/62] remove type checking --- src/spatialdata/_io/io_zarr.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/spatialdata/_io/io_zarr.py b/src/spatialdata/_io/io_zarr.py index 01c045a97..d80db8b9b 100644 --- a/src/spatialdata/_io/io_zarr.py +++ b/src/spatialdata/_io/io_zarr.py @@ -1,11 +1,14 @@ import logging import warnings from json import JSONDecodeError -from typing import TYPE_CHECKING, Literal +from typing import Literal import zarr from anndata import AnnData +from dask.dataframe import DataFrame as DaskDataFrame +from geopandas import GeoDataFrame from pyarrow import ArrowInvalid +from xarray import DataArray, DataTree from zarr.errors import ArrayNotFoundError, MetadataError from spatialdata._core.spatialdata import SpatialData @@ -23,11 +26,6 @@ from spatialdata._io.io_table import _read_table from spatialdata._logging import logger -if TYPE_CHECKING: - from dask.dataframe import DataFrame as DaskDataFrame - from geopandas import GeoDataFrame - from xarray import DataArray, DataTree - def is_hidden_zarr_entry(name: str) -> bool: """Skip hidden files like '.zgroup' or '.zmetadata'.""" From 5eacf56c442adcc4bf9dd4f620edd8a415dbc9a3 Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Wed, 20 Aug 2025 10:44:44 +0200 Subject: [PATCH 56/62] use sanitized _store --- src/spatialdata/_io/io_zarr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/spatialdata/_io/io_zarr.py b/src/spatialdata/_io/io_zarr.py index d80db8b9b..a12d51b04 100644 --- a/src/spatialdata/_io/io_zarr.py +++ b/src/spatialdata/_io/io_zarr.py @@ -125,7 +125,7 @@ def read_zarr( A SpatialData object. """ _store = _open_zarr_store(store) - f = zarr.group(store) + f = zarr.group(_store) images = {} labels = {} From 12a47e3dfa036e45f605d56fc90235349f9d6c86 Mon Sep 17 00:00:00 2001 From: Andreas Poehlmann Date: Wed, 20 Aug 2025 11:06:43 +0200 Subject: [PATCH 57/62] Cloud fix remote embl datasets (#904) * tests: update tests to use remote paths * spatialdata._io._utils: support consolidated remote stores * spatialdata._io.format: supprot remote embl datasets on s3 --------- Co-authored-by: Wouter-Michiel Vierdag --- src/spatialdata/_io/_utils.py | 8 ++------ src/spatialdata/_io/format.py | 2 +- tests/io/test_remote.py | 19 ++++++++++--------- 3 files changed, 13 insertions(+), 16 deletions(-) diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index 698cee9ed..2f6b9abc5 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -407,12 +407,8 @@ def _open_zarr_store(path: StoreLike, **kwargs: Any) -> zarr.storage.BaseStore: # but extend the path of the store with that of the zarr.Group return FSStore(path.store.path + "/" + path.path, fs=path.store.fs, **kwargs) if isinstance(path.store, zarr.storage.ConsolidatedMetadataStore): - # TODO: find a way to check if the consolidated metadata is still used. Probably best to wait for Zarr v3. 
- # if the store is a ConsolidatedMetadataStore, just return it - # get the FSStore url path from store and append it with the path from the Group StoreLike object - url = UPath(path.store.store.path + path.path) - # same as UPath option - return FSStore(url.path, fs=url.fs, **kwargs) + # if the store is a ConsolidatedMetadataStore, just return the underlying FSSpec store + return path.store.store raise ValueError(f"Unsupported store type or zarr.Group: {type(path.store)}") if isinstance(path, zarr.storage.StoreLike): # if the input already a store, wrap it in an FSStore diff --git a/src/spatialdata/_io/format.py b/src/spatialdata/_io/format.py index 5ee675be6..910cc7c8b 100644 --- a/src/spatialdata/_io/format.py +++ b/src/spatialdata/_io/format.py @@ -253,7 +253,7 @@ def validate_table( def format_implementations() -> Iterator[Format]: """Return an instance of each format implementation, newest to oldest.""" yield RasterFormatV02() - # yield RasterFormatV01() # same format string as FormatV04 + yield RasterFormatV01() # same format string as FormatV04 yield FormatV04() yield FormatV03() yield FormatV02() diff --git a/tests/io/test_remote.py b/tests/io/test_remote.py index 4611a5587..c625f5eb4 100644 --- a/tests/io/test_remote.py +++ b/tests/io/test_remote.py @@ -1,32 +1,33 @@ import pytest import zarr +from upath import UPath from spatialdata import SpatialData class TestRemote: # Test actual remote datasets from https://spatialdata.scverse.org/en/latest/tutorials/notebooks/datasets/README.html - # TODO: mark these tests so they are disabled by default because they require internet access + @pytest.fixture(params=["merfish", "mibitof", "mibitof_alt"]) def s3_address(self, request): urls = { - "merfish": "https://s3.embl.de/spatialdata/spatialdata-sandbox/merfish.zarr/", - "mibitof": "https://s3.embl.de/spatialdata/spatialdata-sandbox/mibitof.zarr/", + "merfish": UPath( + "s3://spatialdata/spatialdata-sandbox/merfish.zarr", endpoint_url="https://s3.embl.de", anon=True + ), + "mibitof": UPath( + "s3://spatialdata/spatialdata-sandbox/mibitof.zarr", endpoint_url="https://s3.embl.de", anon=True + ), "mibitof_alt": "https://dl01.irc.ugent.be/spatial/mibitof/data.zarr/", } return urls[request.param] - # TODO: does not work for EMBL datasets, problem with opening version 0.6-dev - @pytest.mark.skip(reason="Problem with ome_zarr on test datasets: ValueError: Version 0.6-dev not recognized") def test_remote(self, s3_address): # TODO: remove selection once support for points, shapes and tables is added sdata = SpatialData.read(s3_address, selection=("images", "labels")) assert len(list(sdata.gen_elements())) > 0 - @pytest.mark.skip(reason="Problem with ome_zarr on test datasets: ValueError: Version 0.6-dev not recognized") def test_remote_consolidated(self, s3_address): - # TODO: find a way to check if the consolidated metadata is actually used. Probably best to wait for Zarr v3. 
- root = zarr.open_consolidated(s3_address, mode="r", metadata_key="zmetadata") - # TODO: remove selection once support for points, shapes and tables is added + urlpath, storage_options = str(s3_address), getattr(s3_address, "storage_options", {}) + root = zarr.open_consolidated(urlpath, mode="r", metadata_key="zmetadata", storage_options=storage_options) sdata = SpatialData.read(root, selection=("images", "labels")) assert len(list(sdata.gen_elements())) > 0 From c76f8c461996f9e6917c2120bba73f3799126538 Mon Sep 17 00:00:00 2001 From: Andreas Poehlmann Date: Wed, 20 Aug 2025 11:21:41 +0200 Subject: [PATCH 58/62] spatialdata.io.io_shapes: fix support for remote shapes (#902) Co-authored-by: Wouter-Michiel Vierdag --- src/spatialdata/_io/io_shapes.py | 3 +-- tests/io/test_remote_mock.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/spatialdata/_io/io_shapes.py b/src/spatialdata/_io/io_shapes.py index a7d52a51c..d2d0edaaa 100644 --- a/src/spatialdata/_io/io_shapes.py +++ b/src/spatialdata/_io/io_shapes.py @@ -40,8 +40,7 @@ def _read_shapes( geometry = from_ragged_array(typ, coords, offsets) geo_df = GeoDataFrame({"geometry": geometry}, index=index) elif isinstance(format, ShapesFormatV02): - path = Path(f._store.path) / f.path / "shapes.parquet" - geo_df = read_parquet(path) + geo_df = read_parquet(f.store.path, filesystem=f.store.fs) else: raise ValueError( f"Unsupported shapes format {format} from version {version}. Please update the spatialdata library." diff --git a/tests/io/test_remote_mock.py b/tests/io/test_remote_mock.py index 266c34973..4e65cab61 100644 --- a/tests/io/test_remote_mock.py +++ b/tests/io/test_remote_mock.py @@ -135,8 +135,7 @@ def test_creating_file(self, upath) -> None: "labels", # TODO: fix remote reading support points # "points", - # TODO: fix remote reading support shapes - # "shapes", + "shapes", ], ) def test_reading_mocked_elements(self, upath: UPath, sdata_type: str, request) -> None: From c11752db12c498f1de725fcc67f636b27636607c Mon Sep 17 00:00:00 2001 From: Andreas Poehlmann Date: Wed, 20 Aug 2025 11:36:40 +0200 Subject: [PATCH 59/62] spatialdata.io.io_points: fix support for remote points (#903) Co-authored-by: Wouter-Michiel Vierdag --- src/spatialdata/_io/io_points.py | 6 +----- tests/io/test_remote_mock.py | 3 +-- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/src/spatialdata/_io/io_points.py b/src/spatialdata/_io/io_points.py index 2c3193a41..30ddebfab 100644 --- a/src/spatialdata/_io/io_points.py +++ b/src/spatialdata/_io/io_points.py @@ -1,4 +1,3 @@ -import os from collections.abc import MutableMapping from pathlib import Path @@ -29,10 +28,7 @@ def _read_points( assert version is not None format = PointsFormats[version] - path = os.path.join(f._store.path, f.path, "points.parquet") - # cache on remote file needed for parquet reader to work - # TODO: allow reading in the metadata without caching all the data - points = read_parquet("simplecache::" + path if path.startswith("http") else path) + points = read_parquet(f.store.path, filesystem=f.store.fs) assert isinstance(points, DaskDataFrame) transformations = _get_transformations_from_ngff_dict(f.attrs.asdict()["coordinateTransformations"]) diff --git a/tests/io/test_remote_mock.py b/tests/io/test_remote_mock.py index 4e65cab61..ce4cb1a92 100644 --- a/tests/io/test_remote_mock.py +++ b/tests/io/test_remote_mock.py @@ -133,8 +133,7 @@ def test_creating_file(self, upath) -> None: [ "images", "labels", - # TODO: fix remote reading support points - # "points", + 
"points", "shapes", ], ) From f0a0c04020527bda46c25bb57822537f5d91b829 Mon Sep 17 00:00:00 2001 From: Andreas Poehlmann Date: Wed, 20 Aug 2025 11:51:23 +0200 Subject: [PATCH 60/62] spatialdata._io: test remote tables support and fix repr (#905) Co-authored-by: Wouter-Michiel Vierdag --- src/spatialdata/_core/spatialdata.py | 5 ++++- src/spatialdata/_io/_utils.py | 17 ++++++++--------- tests/io/test_remote_mock.py | 2 ++ 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/src/spatialdata/_core/spatialdata.py b/src/spatialdata/_core/spatialdata.py index 4ae8bc610..31fa82edb 100644 --- a/src/spatialdata/_core/spatialdata.py +++ b/src/spatialdata/_core/spatialdata.py @@ -1065,9 +1065,12 @@ def elements_paths_on_disk(self) -> list[str]: ------- A list of paths of the elements saved in the Zarr store. """ + from spatialdata._io._utils import _open_zarr_store + if self.path is None: raise ValueError("The SpatialData object is not backed by a Zarr store.") - store = parse_url(self.path, mode="r").store + + store = _open_zarr_store(self.path) root = zarr.group(store=store) elements_in_zarr = [] diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index 2f6b9abc5..5297e626b 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -419,17 +419,16 @@ def _open_zarr_store(path: StoreLike, **kwargs: Any) -> zarr.storage.BaseStore: raise TypeError(f"Unsupported type: {type(path)}") -def _create_upath(path: StoreLike) -> UPath: +def _create_upath(path: StoreLike) -> UPath | None: # try to create a UPath from the input - if isinstance(path, str | Path): - return Path(path) - if hasattr(path, "store") and isinstance(path.store, zarr.storage.ConsolidatedMetadataStore): - # create a url from the ConsolidatedMetadataStore and append it with the path from the Group StoreLike object - return UPath(path.store.store.path) / path.path - if isinstance(path, zarr.storage.BaseStore): + if isinstance(path, zarr.storage.ConsolidatedMetadataStore): + path = path.store # get the fsstore from the consolidated store + if isinstance(path, FSStore): + protocol = path.fs.protocol if isinstance(path.fs.protocol, str) else path.fs.protocol[0] + return UPath(path.path, protocol=protocol, **path.fs.storage_options) + if isinstance(path, zarr.storage.DirectoryStore): return UPath(path.path) - # best effort to create a UPath - return UPath(path) + return None class BadFileHandleMethod(Enum): diff --git a/tests/io/test_remote_mock.py b/tests/io/test_remote_mock.py index ce4cb1a92..7ac9930ae 100644 --- a/tests/io/test_remote_mock.py +++ b/tests/io/test_remote_mock.py @@ -133,6 +133,8 @@ def test_creating_file(self, upath) -> None: [ "images", "labels", + "table_single_annotation", + "table_multiple_annotations", "points", "shapes", ], From a0ec5be2c56f28a4d6696ba5cc229563a8d086eb Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Wed, 20 Aug 2025 13:46:18 +0200 Subject: [PATCH 61/62] small refactor --- tests/io/test_remote_mock.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/tests/io/test_remote_mock.py b/tests/io/test_remote_mock.py index 7ac9930ae..ff8272655 100644 --- a/tests/io/test_remote_mock.py +++ b/tests/io/test_remote_mock.py @@ -91,7 +91,7 @@ def upload_to_upath(upath, sdata): @pytest.fixture(scope="function") -def s3_fixture(s3_server): +def upath(s3_server): # make a mock bucket available for testing pytest.importorskip("s3fs") anon, s3so = s3_server @@ -102,16 +102,12 @@ def s3_fixture(s3_server): s3.mkdir(bucket_name) # 
here you could write existing test files to s3.upload if needed s3.invalidate_cache() - yield f"s3://{bucket_name}", anon, s3so + upath = UPath(f"s3://{bucket_name}", anon=anon, **s3so) + yield upath -class TestRemoteMock: - @pytest.fixture(scope="function") - def upath(self, s3_fixture): - # create a UPath object for the mock s3 bucket - path, anon, s3so = s3_fixture - return UPath(path, anon=anon, **s3so) +class TestRemoteMock: def test_is_S3Path(self, upath): assert isinstance(upath, S3Path) From c514a0b4b81d8e33be470eb20ab4f321c8c50c35 Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Fri, 22 Aug 2025 16:39:34 +0200 Subject: [PATCH 62/62] add remote write support + cleanup (#973) * refactor StoreLike to types * support remote write points and shapes * cleanup * add additional remote write support * update dependency ome-zarr * revert update dependency * support remote write tables --- .../_core/query/relational_query.py | 10 +- src/spatialdata/_core/spatialdata.py | 119 ++++++------------ src/spatialdata/_io/_utils.py | 6 +- src/spatialdata/_io/io_points.py | 7 +- src/spatialdata/_io/io_shapes.py | 9 +- src/spatialdata/_io/io_zarr.py | 4 +- src/spatialdata/_types.py | 9 +- tests/conftest.py | 2 +- .../operations/test_spatialdata_operations.py | 2 +- tests/core/query/test_relational_query.py | 8 +- tests/io/test_multi_table.py | 24 ---- tests/io/test_readwrite.py | 29 +---- tests/io/test_remote_mock.py | 20 +-- 13 files changed, 86 insertions(+), 163 deletions(-) diff --git a/src/spatialdata/_core/query/relational_query.py b/src/spatialdata/_core/query/relational_query.py index 0803158ca..96b774d8f 100644 --- a/src/spatialdata/_core/query/relational_query.py +++ b/src/spatialdata/_core/query/relational_query.py @@ -716,7 +716,7 @@ def _call_join( return elements_dict, table -def match_table_to_element(sdata: SpatialData, element_name: str, table_name: str = "table") -> AnnData: +def match_table_to_element(sdata: SpatialData, element_name: str, table_name: str) -> AnnData: """ Filter the table and reorders the rows to match the instances (rows/labels) of the specified SpatialElement. @@ -738,14 +738,6 @@ def match_table_to_element(sdata: SpatialData, element_name: str, table_name: st match_element_to_table : Function to match a spatial element to a table. join_spatialelement_table : General function, to join spatial elements with a table with more control. """ - if table_name is None: - warnings.warn( - "Assumption of table with name `table` being present is being deprecated in SpatialData v0.1. 
" - "Please provide the name of the table as argument to table_name.", - DeprecationWarning, - stacklevel=2, - ) - table_name = "table" _, table = join_spatialelement_table( sdata=sdata, spatial_element_names=element_name, table_name=table_name, how="left", match_rows="left" ) diff --git a/src/spatialdata/_core/spatialdata.py b/src/spatialdata/_core/spatialdata.py index 31fa82edb..bcd993d70 100644 --- a/src/spatialdata/_core/spatialdata.py +++ b/src/spatialdata/_core/spatialdata.py @@ -17,7 +17,6 @@ from dask.delayed import Delayed from geopandas import GeoDataFrame from ome_zarr.io import parse_url -from ome_zarr.types import JSONDict from shapely import MultiPolygon, Polygon from xarray import DataArray, DataTree @@ -30,8 +29,8 @@ validate_table_attr_keys, ) from spatialdata._logging import logger -from spatialdata._types import ArrayLike, Raster_T -from spatialdata._utils import _deprecation_alias, _error_message_add_element +from spatialdata._types import ArrayLike, Raster_T, StoreLike +from spatialdata._utils import _deprecation_alias from spatialdata.models import ( Image2DModel, Image3DModel, @@ -598,7 +597,7 @@ def path(self, value: Path | None) -> None: ) def _get_groups_for_element( - self, zarr_path: Path, element_type: str, element_name: str + self, zarr_path: StoreLike, element_type: str, element_name: str ) -> tuple[zarr.Group, zarr.Group, zarr.Group]: """ Get the Zarr groups for the root, element_type and element for a specific element. @@ -1205,12 +1204,16 @@ def write( :class:`~spatialdata._io.format.CurrentRasterFormat`, :class:`~spatialdata._io.format.CurrentShapesFormat`, :class:`~spatialdata._io.format.CurrentPointsFormat`, :class:`~spatialdata._io.format.CurrentTablesFormat`. """ + from spatialdata._io._utils import _open_zarr_store + if isinstance(file_path, str): file_path = Path(file_path) - self._validate_can_safely_write_to_path(file_path, overwrite=overwrite) - self._validate_all_elements() + if isinstance(file_path, Path): + # TODO: also validate remote paths + self._validate_can_safely_write_to_path(file_path, overwrite=overwrite) + self._validate_all_elements() - store = parse_url(file_path, mode="w").store + store = _open_zarr_store(file_path, mode="w") zarr_group = zarr.group(store=store, overwrite=overwrite) self.write_attrs(zarr_group=zarr_group) store.close() @@ -1236,20 +1239,22 @@ def write( def _write_element( self, element: SpatialElement | AnnData, - zarr_container_path: Path, + zarr_container_path: StoreLike, element_type: str, element_name: str, overwrite: bool, format: SpatialDataFormat | list[SpatialDataFormat] | None = None, ) -> None: - if not isinstance(zarr_container_path, Path): + if not isinstance(zarr_container_path, StoreLike): raise ValueError( - f"zarr_container_path must be a Path object, type(zarr_container_path) = {type(zarr_container_path)}." + f"zarr_container_path must be a 'StoreLike' object " + f"(str | Path | UPath | zarr.storage.StoreLike | zarr.Group), got: {type(zarr_container_path)}." 
diff --git a/src/spatialdata/_core/spatialdata.py b/src/spatialdata/_core/spatialdata.py
index 31fa82edb..bcd993d70 100644
--- a/src/spatialdata/_core/spatialdata.py
+++ b/src/spatialdata/_core/spatialdata.py
@@ -17,7 +17,6 @@
 from dask.delayed import Delayed
 from geopandas import GeoDataFrame
 from ome_zarr.io import parse_url
-from ome_zarr.types import JSONDict
 from shapely import MultiPolygon, Polygon
 from xarray import DataArray, DataTree
 
@@ -30,8 +29,8 @@
     validate_table_attr_keys,
 )
 from spatialdata._logging import logger
-from spatialdata._types import ArrayLike, Raster_T
-from spatialdata._utils import _deprecation_alias, _error_message_add_element
+from spatialdata._types import ArrayLike, Raster_T, StoreLike
+from spatialdata._utils import _deprecation_alias
 from spatialdata.models import (
     Image2DModel,
     Image3DModel,
@@ -598,7 +597,7 @@ def path(self, value: Path | None) -> None:
         )
 
     def _get_groups_for_element(
-        self, zarr_path: Path, element_type: str, element_name: str
+        self, zarr_path: StoreLike, element_type: str, element_name: str
     ) -> tuple[zarr.Group, zarr.Group, zarr.Group]:
         """
         Get the Zarr groups for the root, element_type and element for a specific element.
@@ -1205,12 +1204,16 @@ def write(
         :class:`~spatialdata._io.format.CurrentRasterFormat`, :class:`~spatialdata._io.format.CurrentShapesFormat`,
         :class:`~spatialdata._io.format.CurrentPointsFormat`, :class:`~spatialdata._io.format.CurrentTablesFormat`.
         """
+        from spatialdata._io._utils import _open_zarr_store
+
         if isinstance(file_path, str):
             file_path = Path(file_path)
-        self._validate_can_safely_write_to_path(file_path, overwrite=overwrite)
-        self._validate_all_elements()
+        if isinstance(file_path, Path):
+            # TODO: also validate remote paths
+            self._validate_can_safely_write_to_path(file_path, overwrite=overwrite)
+            self._validate_all_elements()
 
-        store = parse_url(file_path, mode="w").store
+        store = _open_zarr_store(file_path, mode="w")
         zarr_group = zarr.group(store=store, overwrite=overwrite)
         self.write_attrs(zarr_group=zarr_group)
         store.close()
@@ -1236,20 +1239,22 @@ def write(
     def _write_element(
         self,
         element: SpatialElement | AnnData,
-        zarr_container_path: Path,
+        zarr_container_path: StoreLike,
         element_type: str,
         element_name: str,
         overwrite: bool,
         format: SpatialDataFormat | list[SpatialDataFormat] | None = None,
     ) -> None:
-        if not isinstance(zarr_container_path, Path):
+        if not isinstance(zarr_container_path, StoreLike):
             raise ValueError(
-                f"zarr_container_path must be a Path object, type(zarr_container_path) = {type(zarr_container_path)}."
+                f"zarr_container_path must be a 'StoreLike' object "
+                f"(str | Path | UPath | zarr.storage.StoreLike | zarr.Group), got: {type(zarr_container_path)}."
+            )
+        if isinstance(zarr_container_path, Path):
+            file_path_of_element = zarr_container_path / element_type / element_name
+            self._validate_can_safely_write_to_path(
+                file_path=file_path_of_element, overwrite=overwrite, saving_an_element=True
             )
-        file_path_of_element = zarr_container_path / element_type / element_name
-        self._validate_can_safely_write_to_path(
-            file_path=file_path_of_element, overwrite=overwrite, saving_an_element=True
-        )
 
         root_group, element_type_group, _ = self._get_groups_for_element(
             zarr_path=zarr_container_path, element_type=element_type, element_name=element_name
@@ -1259,14 +1264,27 @@ def _write_element(
 
         parsed = _parse_formats(formats=format)
 
+        # We pass on zarr_container_path to ensure proper paths when writing to a remote system, even on Windows.
         if element_type == "images":
             write_image(image=element, group=element_type_group, name=element_name, format=parsed["raster"])
         elif element_type == "labels":
             write_labels(labels=element, group=root_group, name=element_name, format=parsed["raster"])
         elif element_type == "points":
-            write_points(points=element, group=element_type_group, name=element_name, format=parsed["points"])
+            write_points(
+                points=element,
+                group=element_type_group,
+                name=element_name,
+                zarr_container_path=zarr_container_path,
+                format=parsed["points"],
+            )
         elif element_type == "shapes":
-            write_shapes(shapes=element, group=element_type_group, name=element_name, format=parsed["shapes"])
+            write_shapes(
+                shapes=element,
+                group=element_type_group,
+                name=element_name,
+                zarr_container_path=zarr_container_path,
+                format=parsed["shapes"],
+            )
         elif element_type == "tables":
             write_table(table=element, group=element_type_group, name=element_name, format=parsed["tables"])
         else:
@@ -1797,41 +1815,16 @@ def table(self) -> None | AnnData:
         -------
         The table.
         """
-        warnings.warn(
-            "Table accessor will be deprecated with SpatialData version 0.1, use sdata.tables instead.",
-            DeprecationWarning,
-            stacklevel=2,
-        )
-        # Isinstance will still return table if anndata has 0 rows.
-        if isinstance(self.tables.get("table"), AnnData):
-            return self.tables["table"]
-        return None
+        raise AttributeError("The property 'table' is deprecated. Use '.tables' instead.")
 
     @table.setter
     def table(self, table: AnnData) -> None:
-        warnings.warn(
-            "Table setter will be deprecated with SpatialData version 0.1, use tables instead.",
-            DeprecationWarning,
-            stacklevel=2,
-        )
-        TableModel().validate(table)
-        if self.tables.get("table") is not None:
-            raise ValueError("The table already exists. Use del sdata.tables['table'] to remove it first.")
-        self.tables["table"] = table
+        raise AttributeError("The property 'table' is deprecated. Use '.tables' instead.")
 
     @table.deleter
     def table(self) -> None:
         """Delete the table."""
-        warnings.warn(
-            "del sdata.table will be deprecated with SpatialData version 0.1, use del sdata.tables['table'] instead.",
-            DeprecationWarning,
-            stacklevel=2,
-        )
-        if self.tables.get("table"):
-            del self.tables["table"]
-        else:
-            # More informative than the error in the zarr library.
-            raise KeyError("table with name 'table' not present in the SpatialData object.")
+        raise AttributeError("The property 'table' is deprecated. Use '.tables' instead.")
 
     @staticmethod
     def read(file_path: Path | str, selection: tuple[str] | None = None) -> SpatialData:
@@ -1853,44 +1846,6 @@ def read(file_path: Path | str, selection: tuple[str] | None = None) -> SpatialD
 
         return read_zarr(file_path, selection=selection)
 
-    def add_image(
-        self,
-        name: str,
-        image: DataArray | DataTree,
-        storage_options: JSONDict | list[JSONDict] | None = None,
-        overwrite: bool = False,
-    ) -> None:
-        """Deprecated. Use `sdata[name] = image` instead."""  # noqa: D401
-        _error_message_add_element()
-
-    def add_labels(
-        self,
-        name: str,
-        labels: DataArray | DataTree,
-        storage_options: JSONDict | list[JSONDict] | None = None,
-        overwrite: bool = False,
-    ) -> None:
-        """Deprecated. Use `sdata[name] = labels` instead."""  # noqa: D401
-        _error_message_add_element()
-
-    def add_points(
-        self,
-        name: str,
-        points: DaskDataFrame,
-        overwrite: bool = False,
-    ) -> None:
-        """Deprecated. Use `sdata[name] = points` instead."""  # noqa: D401
-        _error_message_add_element()
-
-    def add_shapes(
-        self,
-        name: str,
-        shapes: GeoDataFrame,
-        overwrite: bool = False,
-    ) -> None:
-        """Deprecated. Use `sdata[name] = shapes` instead."""  # noqa: D401
-        _error_message_add_element()
-
     @property
     def images(self) -> Images:
         """Return images as a Dict of name to image data."""
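Since `write` now accepts any `StoreLike`, a `SpatialData` object can be written straight to object storage; the local-only validation is skipped for non-`Path` inputs (remote validation remains a TODO above). A sketch against a mock S3 endpoint, with the bucket name and endpoint URL as placeholders:

    from upath import UPath

    from spatialdata import SpatialData
    from spatialdata.datasets import blobs

    sdata = blobs()
    # endpoint_url would point at a moto server, as in tests/io/test_remote_mock.py
    remote = UPath(
        "s3://example-bucket/blobs.zarr",
        anon=False,
        client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"},
    )
    sdata.write(remote)
    round_tripped = SpatialData.read(remote)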
diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py
index 5297e626b..e1756d6bd 100644
--- a/src/spatialdata/_io/_utils.py
+++ b/src/spatialdata/_io/_utils.py
@@ -11,7 +11,7 @@
 from enum import Enum
 from functools import singledispatch
 from pathlib import Path
-from typing import Any, Literal, TypeAlias
+from typing import Any, Literal
 
 import zarr.storage
 from anndata import AnnData
@@ -24,6 +24,7 @@
 from zarr.storage import FSStore
 
 from spatialdata._core.spatialdata import SpatialData
+from spatialdata._types import StoreLike
 from spatialdata._utils import get_pyramid_levels
 from spatialdata.models._utils import (
     MappingToCoordinateSystem_t,
@@ -386,9 +387,6 @@ def save_transformations(sdata: SpatialData) -> None:
     sdata.write_transformations()
 
 
-StoreLike: TypeAlias = str | Path | UPath | zarr.storage.StoreLike | zarr.Group
-
-
 def _open_zarr_store(path: StoreLike, **kwargs: Any) -> zarr.storage.BaseStore:
     # TODO: ensure kwargs like mode are enforced everywhere and passed correctly to the store
     if isinstance(path, str | Path):
diff --git a/src/spatialdata/_io/io_points.py b/src/spatialdata/_io/io_points.py
index 30ddebfab..feed13e3e 100644
--- a/src/spatialdata/_io/io_points.py
+++ b/src/spatialdata/_io/io_points.py
@@ -12,6 +12,7 @@
     overwrite_coordinate_transformations_non_raster,
 )
 from spatialdata._io.format import CurrentPointsFormat, PointsFormats, _parse_version
+from spatialdata._types import StoreLike
 from spatialdata.models import get_axes_names
 from spatialdata.transformations._utils import (
     _get_transformations,
@@ -44,6 +45,7 @@ def write_points(
     points: DaskDataFrame,
     group: zarr.Group,
     name: str,
+    zarr_container_path: StoreLike,
     group_type: str = "ngff:points",
     format: Format = CurrentPointsFormat(),
 ) -> None:
@@ -51,7 +53,8 @@
     t = _get_transformations(points)
 
     points_groups = group.require_group(name)
-    path = Path(points_groups._store.path) / points_groups.path / "points.parquet"
+    store = points_groups._store
+    path = zarr_container_path / points_groups.path / "points.parquet"
 
     # The following code iterates through all columns in the 'points' DataFrame. If the column's datatype is
     # 'category', it checks whether the categories of this column are known. If not, it explicitly converts the
@@ -64,7 +67,7 @@
             c = c.cat.as_known()
             points[column_name] = c
 
-    points.to_parquet(path)
+    points.to_parquet(path, filesystem=getattr(store, "fs", None))
 
     attrs = format.attrs_to_dict(points.attrs)
     attrs["version"] = format.spatialdata_format_version
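`write_points` now hands the store's fsspec filesystem to Dask instead of building a local `Path`, which keeps key handling on the fsspec side (and avoids Windows path normalization mangling remote URLs). The same pattern in isolation, a sketch that uses fsspec's in-memory filesystem as a stand-in for S3:

    import dask.dataframe as dd
    import fsspec
    import pandas as pd

    fs = fsspec.filesystem("memory")  # stand-in for an s3fs filesystem
    ddf = dd.from_pandas(pd.DataFrame({"x": [0.0, 1.0], "y": [2.0, 3.0]}), npartitions=1)

    # with filesystem= given, path resolution stays on the fsspec side
    ddf.to_parquet("memory://points.parquet", filesystem=fs)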
diff --git a/src/spatialdata/_io/io_shapes.py b/src/spatialdata/_io/io_shapes.py
index d2d0edaaa..514912c78 100644
--- a/src/spatialdata/_io/io_shapes.py
+++ b/src/spatialdata/_io/io_shapes.py
@@ -13,6 +13,7 @@
     overwrite_coordinate_transformations_non_raster,
 )
 from spatialdata._io.format import CurrentShapesFormat, ShapesFormats, ShapesFormatV01, ShapesFormatV02, _parse_version
+from spatialdata._types import StoreLike
 from spatialdata.models import ShapesModel, get_axes_names
 from spatialdata.transformations._utils import _get_transformations, _set_transformations
 
@@ -55,6 +56,7 @@ def write_shapes(
     shapes: GeoDataFrame,
     group: zarr.Group,
     name: str,
+    zarr_container_path: StoreLike,
     group_type: str = "ngff:shapes",
     format: Format = CurrentShapesFormat(),
 ) -> None:
@@ -82,8 +84,11 @@ def write_shapes(
         attrs = format.attrs_to_dict(geometry)
         attrs["version"] = format.spatialdata_format_version
     elif isinstance(format, ShapesFormatV02):
-        path = Path(shapes_group._store.path) / shapes_group.path / "shapes.parquet"
-        shapes.to_parquet(path)
+        store = shapes_group._store
+        path = zarr_container_path / shapes_group.path / "shapes.parquet"
+
+        # GeoPandas only allows path-like objects for local filesystems, not for remote ones.
+        shapes.to_parquet(str(path), filesystem=getattr(store, "fs", None))
 
         attrs = format.attrs_to_dict(shapes.attrs)
         attrs["version"] = format.spatialdata_format_version
diff --git a/src/spatialdata/_io/io_zarr.py b/src/spatialdata/_io/io_zarr.py
index a12d51b04..e94f69eb6 100644
--- a/src/spatialdata/_io/io_zarr.py
+++ b/src/spatialdata/_io/io_zarr.py
@@ -14,7 +14,6 @@
 from spatialdata._core.spatialdata import SpatialData
 from spatialdata._io._utils import (
     BadFileHandleMethod,
-    StoreLike,
     _create_upath,
     _open_zarr_store,
     handle_read_errors,
@@ -25,6 +24,7 @@
 from spatialdata._io.io_shapes import _read_shapes
 from spatialdata._io.io_table import _read_table
 from spatialdata._logging import logger
+from spatialdata._types import StoreLike
 
 
 def is_hidden_zarr_entry(name: str) -> bool:
@@ -46,7 +46,7 @@ def read_image_element(path: StoreLike) -> DataArray | DataTree:
     """
     # stay in sync with ome v4 format spec:
     # https://github.com/ome/ome-zarr-py/blob/7d1ae35c97/ome_zarr/format.py#L189-L192
-    store = _open_zarr_store(path, dimension_separator="/", normalize_keys=False)
+    store = _open_zarr_store(path, normalize_keys=False)
     return _read_multiscale(store, raster_type="image")
diff --git a/src/spatialdata/_types.py b/src/spatialdata/_types.py
index 30d623a57..4b6b351d3 100644
--- a/src/spatialdata/_types.py
+++ b/src/spatialdata/_types.py
@@ -1,9 +1,12 @@
-from typing import Any
+from pathlib import Path
+from typing import Any, TypeAlias
 
 import numpy as np
+import zarr
+from upath import UPath
 from xarray import DataArray, DataTree
 
-__all__ = ["ArrayLike", "ColorLike", "DTypeLike", "Raster_T"]
+__all__ = ["ArrayLike", "ColorLike", "DTypeLike", "Raster_T", "StoreLike"]
 
 from numpy.typing import DTypeLike, NDArray
 
@@ -12,3 +15,5 @@
 
 Raster_T = DataArray | DataTree
 ColorLike = tuple[float, ...] | str
+
+StoreLike: TypeAlias = str | Path | UPath | zarr.storage.StoreLike | zarr.Group
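Hosting `StoreLike` in `spatialdata._types` lets `_core` and `_io` share the alias without circular imports. A sketch of a function typed against it; `describe_store` is a hypothetical helper, not part of the library:

    from pathlib import Path

    import zarr
    from upath import UPath

    from spatialdata._types import StoreLike

    def describe_store(path: StoreLike) -> str:
        # hypothetical helper: show how each StoreLike variant might be dispatched
        if isinstance(path, str | Path | UPath):
            return f"filesystem path: {path}"
        if isinstance(path, zarr.Group):
            return f"open zarr group at: {path.path}"
        return f"zarr store: {type(path).__name__}"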
diff --git a/tests/conftest.py b/tests/conftest.py
index cc86f9777..38b2fbb1a 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -440,7 +440,7 @@ def _make_sdata_for_testing_querying_and_aggretation() -> SpatialData:
     table = TableModel.parse(
         table, region=["values_circles", "values_polygons"], region_key="region", instance_key="instance_id"
     )
-    sdata.table = table
+    sdata["table"] = table
 
     return sdata
diff --git a/tests/core/operations/test_spatialdata_operations.py b/tests/core/operations/test_spatialdata_operations.py
index db413af31..3c225dbe1 100644
--- a/tests/core/operations/test_spatialdata_operations.py
+++ b/tests/core/operations/test_spatialdata_operations.py
@@ -174,7 +174,7 @@ def test_filter_by_coordinate_system_also_table(full_sdata: SpatialData) -> None
     adata = full_sdata["table"]
     del adata.uns[TableModel.ATTRS_KEY]
     del full_sdata.tables["table"]
-    full_sdata.table = TableModel.parse(
+    full_sdata["table"] = TableModel.parse(
         adata,
         region=["circles", "poly"],
         region_key="annotated_shapes",
diff --git a/tests/core/query/test_relational_query.py b/tests/core/query/test_relational_query.py
index f0b4da7e0..1b47606b3 100644
--- a/tests/core/query/test_relational_query.py
+++ b/tests/core/query/test_relational_query.py
@@ -15,10 +15,14 @@
 
 
 def test_match_table_to_element(sdata_query_aggregation):
-    matched_table = match_table_to_element(sdata=sdata_query_aggregation, element_name="values_circles")
+    matched_table = match_table_to_element(
+        sdata=sdata_query_aggregation, element_name="values_circles", table_name="table"
+    )
     arr = np.array(list(reversed(sdata_query_aggregation["values_circles"].index)))
     sdata_query_aggregation["values_circles"].index = arr
-    matched_table_reversed = match_table_to_element(sdata=sdata_query_aggregation, element_name="values_circles")
+    matched_table_reversed = match_table_to_element(
+        sdata=sdata_query_aggregation, element_name="values_circles", table_name="table"
+    )
     assert matched_table.obs.index.tolist() == list(reversed(matched_table_reversed.obs.index.tolist()))
 
     # TODO: add tests for labels
diff --git a/tests/io/test_multi_table.py b/tests/io/test_multi_table.py
index dd43cfa8d..7191f2f86 100644
--- a/tests/io/test_multi_table.py
+++ b/tests/io/test_multi_table.py
@@ -127,30 +127,6 @@ def test_set_table_annotates_spatialelement(self, full_sdata, tmp_path):
         )
         full_sdata.write(tmpdir)
 
-    def test_old_accessor_deprecation(self, full_sdata, tmp_path):
-        # To test self._backed
-        tmpdir = Path(tmp_path) / "tmp.zarr"
-        full_sdata.write(tmpdir)
-        adata0 = _get_table(region="polygon")
-
-        with pytest.warns(DeprecationWarning):
-            _ = full_sdata.table
-        with pytest.raises(ValueError):
-            full_sdata.table = adata0
-        with pytest.warns(DeprecationWarning):
-            del full_sdata.table
-        with pytest.raises(KeyError):
-            del full_sdata["table"]
-        with pytest.warns(DeprecationWarning):
-            full_sdata.table = adata0  # this gets placed in sdata['table']
-
-        assert_equal(adata0, full_sdata["table"])
-
-        del full_sdata["table"]
-
-        full_sdata.tables["my_new_table0"] = adata0
-        assert full_sdata.get("table") is None
-
     @pytest.mark.parametrize("region", ["test_shapes", "non_existing"])
     def test_single_table(self, tmp_path: str, region: str):
         tmpdir = Path(tmp_path) / "tmp.zarr"
diff --git a/tests/io/test_readwrite.py b/tests/io/test_readwrite.py
index ad8c66b4c..9778b529d 100644
--- a/tests/io/test_readwrite.py
+++ b/tests/io/test_readwrite.py
@@ -1,4 +1,5 @@
 import os
+import sys
 import tempfile
 from collections.abc import Callable
 from pathlib import Path
@@ -252,27 +253,6 @@ def test_incremental_io_on_disk(
                 sdata.delete_element_from_disk(name)
                 sdata.write_element(name)
 
-    def test_incremental_io_table_legacy(self, table_single_annotation: SpatialData) -> None:
-        s = table_single_annotation
-        t = s["table"][:10, :].copy()
-        with pytest.raises(ValueError):
-            s.table = t
-        del s["table"]
-        s.table = t
-
-        with tempfile.TemporaryDirectory() as td:
-            f = os.path.join(td, "data.zarr")
-            s.write(f)
-            s2 = SpatialData.read(f)
-            assert len(s2["table"]) == len(t)
-            del s2["table"]
-            s2.table = s["table"]
-            assert len(s2["table"]) == len(s["table"])
-            f2 = os.path.join(td, "data2.zarr")
-            s2.write(f2)
-            s3 = SpatialData.read(f2)
-            assert len(s3["table"]) == len(s2["table"])
-
     def test_io_and_lazy_loading_points(self, points):
         with tempfile.TemporaryDirectory() as td:
             f = os.path.join(td, "data.zarr")
@@ -774,6 +754,7 @@ def test_incremental_writing_valid_table_name_invalid_table(tmp_path: Path):
         invalid_sdata.write_element("valid_name")
 
 
+@pytest.mark.skipif(sys.platform == "win32", reason="Renaming fails as the Windows path already sees the name as invalid.")
 def test_reading_invalid_name(tmp_path: Path):
     image_name, image = next(iter(_get_images().items()))
     labels_name, labels = next(iter(_get_labels().items()))
@@ -789,8 +770,10 @@ def test_reading_invalid_name(tmp_path: Path):
     )
     valid_sdata.write(tmp_path / "data.zarr")
     # Circumvent validation at construction time and check validation happens again at writing time.
-    (tmp_path / "data.zarr/points" / points_name).rename(tmp_path / "data.zarr/points" / "has whitespace")
-    (tmp_path / "data.zarr/shapes" / shapes_name).rename(tmp_path / "data.zarr/shapes" / "non-alnum_#$%&()*+,?@")
+    (tmp_path / "data.zarr" / "points" / points_name).rename(tmp_path / "data.zarr" / "points" / "has whitespace")
+    (tmp_path / "data.zarr" / "shapes" / shapes_name).rename(
+        tmp_path / "data.zarr" / "shapes" / "non-alnum_#$%&()*+,?@"
+    )
 
     with pytest.raises(ValidationError, match="Cannot construct SpatialData") as exc_info:
         read_zarr(tmp_path / "data.zarr")
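The deleted legacy tests covered the `.table` accessor that patch 62 turns into a hard error; the replacement is plain dictionary-style access. A sketch of the migration (the stand-in table below is minimal):

    from anndata import AnnData

    from spatialdata import SpatialData
    from spatialdata.models import TableModel

    adata = TableModel.parse(AnnData(shape=(3, 0)))  # minimal stand-in table
    sdata = SpatialData()

    # before (now raises AttributeError): sdata.table = adata
    sdata["table"] = adata  # or: sdata.tables["table"] = adata
    t = sdata["table"]      # before: t = sdata.table
    del sdata["table"]      # before: del sdata.table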
diff --git a/tests/io/test_remote_mock.py b/tests/io/test_remote_mock.py
index ff8272655..d83eff9a5 100644
--- a/tests/io/test_remote_mock.py
+++ b/tests/io/test_remote_mock.py
@@ -152,14 +152,12 @@ def test_reading_mocked_elements(self, upath: UPath, sdata_type: str, request) -
     @pytest.mark.parametrize(
         "sdata_type",
         [
-            # TODO: fix remote writing support images
-            # "images",
-            # TODO: fix remote writing support labels
-            # "labels",
-            # TODO: fix remote writing support points
-            # "points",
-            # TODO: fix remote writing support shapes
-            # "shapes",
+            "images",
+            "labels",
+            "table_single_annotation",
+            "table_multiple_annotations",
+            "points",
+            "shapes",
         ],
     )
     def test_writing_mocked_elements(self, upath: UPath, sdata_type: str, request) -> None:
@@ -168,7 +166,11 @@ def test_writing_mocked_elements(self, upath: UPath, sdata_type: str, request) -
         # test writing to a remote path
         remote_path = upath / "tmp.zarr"
         sdata.write(remote_path)
-        assert len(upath.glob("*")) == n_elements
+        if not sdata_type.startswith("table"):
+            assert len(list((remote_path / sdata_type).glob("[a-zA-Z]*"))) == n_elements
+        else:
+            assert len(list((remote_path / "tables").glob("[a-zA-Z]*"))) == n_elements
+
         # test reading the remotely written object
         remote_sdata = SpatialData.read(remote_path)
         assert_spatial_data_objects_are_identical(sdata, remote_sdata)
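Taken together, the mock-S3 suite now exercises a full remote write/read round trip for every element type. A condensed sketch of what `test_writing_mocked_elements` verifies, assuming the `upath` fixture from above and the bundled `blobs` example dataset:

    from spatialdata import SpatialData
    from spatialdata.datasets import blobs
    from spatialdata.testing import assert_spatial_data_objects_are_identical

    def test_roundtrip(upath) -> None:
        sdata = blobs()
        remote_path = upath / "blobs.zarr"
        sdata.write(remote_path)
        remote_sdata = SpatialData.read(remote_path)
        assert_spatial_data_objects_are_identical(sdata, remote_sdata)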