Skip to content

Commit 2b32174

Browse files
Merge branch 'main' into ab/fix-fill-value-dmrpp
2 parents 01a7010 + bd010c4 commit 2b32174

File tree

14 files changed

+278
-121
lines changed

14 files changed

+278
-121
lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ repos:
1111

1212
- repo: https://github.com/astral-sh/ruff-pre-commit
1313
# Ruff version.
14-
rev: "v0.8.1"
14+
rev: "v0.8.6"
1515
hooks:
1616
# Run the linter.
1717
- id: ruff

ci/upstream.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,6 @@ dependencies:
2828
- fsspec
2929
- pip
3030
- pip:
31-
- icechunk>=0.1.0a7 # Installs zarr v3 as dependency
31+
- icechunk>=0.1.0a8 # Installs zarr v3 as dependency
3232
# - git+https://github.com/fsspec/kerchunk@main # kerchunk is currently incompatible with zarr-python v3 (https://github.com/fsspec/kerchunk/pull/516)
3333
- imagecodecs-numcodecs==2024.6.1

docs/releases.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,10 @@ New Features
1111

1212
- Added a ``.nbytes`` accessor method which displays the bytes needed to hold the virtual references in memory.
1313
(:issue:`167`, :pull:`227`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
14+
- Sync with Icechunk v0.1.0a8 (:pull:`368`) By `Matthew Iannucci <https://github.com/mpiannucci>`_. This also adds support
15+
for the `to_icechunk` method to add timestamps as checksums when writing virtual references to an icechunk store. This
16+
is useful for ensuring that virtual references are not stale when reading from an icechunk store, which can happen if the
17+
underlying data has changed since the virtual references were written.
1418

1519
Breaking changes
1620
~~~~~~~~~~~~~~~~
@@ -33,6 +37,8 @@ Bug fixes
3337
(:issue:`336`, :pull:`338`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
3438
- Fix bug in dmrpp reader so _FillValue is passed to variables' encodings.
3539
(:pull:`369`) By `Aimee Barciauskas <https://github.com/abarciauskas-bgse>`_.
40+
- Fix bug passing arguments to FITS reader, and test it on Hubble Space Telescope data.
41+
(:pull:`363`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
3642

3743
Documentation
3844
~~~~~~~~~~~~~

docs/usage.md

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -421,14 +421,16 @@ By default references are placed in separate parquet file when the total number
421421
We can also write these references out as an [IcechunkStore](https://icechunk.io/). `Icechunk` is an open-source, cloud-native transactional tensor storage engine that is compatible with zarr version 3. To export our virtual dataset to an `Icechunk` Store, we simply use the {py:meth}`vds.virtualize.to_icechunk <virtualizarr.VirtualiZarrDatasetAccessor.to_icechunk>` accessor method.
422422

423423
```python
424-
# create an icechunk store
425-
from icechunk import IcechunkStore, StorageConfig, StoreConfig, VirtualRefConfig
426-
storage = StorageConfig.filesystem(str('combined'))
427-
store = IcechunkStore.create(storage=storage, mode="w", config=StoreConfig(
428-
virtual_ref_config=VirtualRefConfig.s3_anonymous(region='us-east-1'),
429-
))
430-
431-
combined_vds.virtualize.to_icechunk(store)
424+
# create an icechunk repository, session and write the virtual dataset to the session
425+
from icechunk import Repository, Storage, VirtualChunkContainer, local_filesystem_storage
426+
storage = local_filesystem_storage(str('combined'))
427+
428+
# By default, local virtual references and public remote virtual references can be read without extra configuration.
429+
repo = Repository.create(storage=storage)
430+
session = repo.writeable_session("main")
431+
432+
# write the virtual dataset to the session with the IcechunkStore
433+
combined_vds.virtualize.to_icechunk(session.store)
432434
```
433435

434436
See the [Icechunk documentation](https://icechunk.io/icechunk-python/virtual/#creating-a-virtual-dataset-with-virtualizarr) for more details.

pyproject.toml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ hdf_reader = [
3939
"numcodecs"
4040
]
4141
icechunk = [
42-
"icechunk>=0.1.0a7",
42+
"icechunk>=0.1.0a8",
4343
]
4444
test = [
4545
"codecov",
@@ -103,6 +103,10 @@ ignore_missing_imports = true
103103
module = "ujson.*"
104104
ignore_missing_imports = true
105105

106+
[[tool.mypy.overrides]]
107+
module = "zarr.*"
108+
ignore_missing_imports = true
109+
106110
[tool.ruff]
107111
# Same as Black.
108112
line-length = 88

virtualizarr/accessor.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from datetime import datetime
12
from pathlib import Path
23
from typing import TYPE_CHECKING, Callable, Literal, Optional, overload
34

@@ -39,7 +40,10 @@ def to_zarr(self, storepath: str) -> None:
3940
dataset_to_zarr(self.ds, storepath)
4041

4142
def to_icechunk(
42-
self, store: "IcechunkStore", append_dim: Optional[str] = None
43+
self,
44+
store: "IcechunkStore",
45+
append_dim: Optional[str] = None,
46+
last_updated_at: Optional[datetime] = None,
4347
) -> None:
4448
"""
4549
Write an xarray dataset to an Icechunk store.
@@ -48,10 +52,30 @@ def to_icechunk(
4852
4953
If `append_dim` is provided, the virtual dataset will be appended to the existing IcechunkStore along the `append_dim` dimension.
5054
55+
If `last_updated_at` is provided, it will be used as a checksum for any virtual chunks written to the store with this operation.
56+
At read time, if any of the virtual chunks have been updated since this provided datetime, an error will be raised.
57+
This protects against reading outdated virtual chunks that have been updated since the last read. When not provided, no check is performed.
58+
This value is stored in Icechunk with seconds precision, so be sure to take that into account when providing this value.
59+
5160
Parameters
5261
----------
5362
store: IcechunkStore
5463
append_dim: str, optional
64+
When provided, specifies the dimension along which to append the virtual dataset.
65+
last_updated_at: datetime, optional
66+
When provided, uses provided datetime as a checksum for any virtual chunks written to the store with this operation.
67+
When not provided (default), no check is performed.
68+
69+
Examples
70+
--------
71+
To ensure an error is raised if the files containing referenced virtual chunks are modified at any time from now on, pass the current time to ``last_updated_at``.
72+
73+
>>> from datetime import datetime
74+
>>>
75+
>>> vds.virtualize.to_icechunk(
76+
... icechunkstore,
77+
... last_updated_at=datetime.now(),
78+
... )
5579
"""
5680
from virtualizarr.writers.icechunk import dataset_to_icechunk
5781

virtualizarr/backend.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ def open_virtual_dataset(
128128
----------
129129
filepath : str, default None
130130
File path to open as a set of virtualized zarr arrays.
131-
filetype : FileType, default None
131+
filetype : FileType or str, default None
132132
Type of file to be opened. Used to determine which kerchunk file format backend to use.
133133
Can be one of {'netCDF3', 'netCDF4', 'HDF', 'TIFF', 'GRIB', 'FITS', 'dmrpp', 'zarr_v3', 'kerchunk'}.
134134
If not provided will attempt to automatically infer the correct filetype from header bytes.
@@ -182,13 +182,16 @@ def open_virtual_dataset(
182182
if backend and filetype:
183183
raise ValueError("Cannot pass both a filetype and an explicit VirtualBackend")
184184

185-
if filetype is not None:
186-
# if filetype is user defined, convert to FileType
187-
filetype = FileType(filetype)
188-
else:
185+
if filetype is None:
189186
filetype = automatically_determine_filetype(
190187
filepath=filepath, reader_options=reader_options
191188
)
189+
elif isinstance(filetype, str):
190+
# if filetype is a user defined string, convert to FileType
191+
filetype = FileType(filetype.lower())
192+
elif not isinstance(filetype, FileType):
193+
raise ValueError("Filetype must be a valid string or FileType")
194+
192195
if backend:
193196
backend_cls = backend
194197
else:

virtualizarr/readers/fits.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def open_virtual_dataset(
4242

4343
# TODO This wouldn't work until either you had an xarray backend for FITS installed, or issue #124 is implemented to load data from ManifestArrays directly
4444
# TODO Once we have one of those we can use ``maybe_open_loadable_vars_and_indexes`` here
45-
if loadable_variables != [] or indexes != {} or decode_times:
45+
if loadable_variables or indexes:
4646
raise NotImplementedError(
4747
"Cannot load variables or indexes from FITS files as there is no xarray backend engine for FITS"
4848
)

virtualizarr/tests/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from virtualizarr.manifests.manifest import join
1010
from virtualizarr.zarr import ZArray, ceildiv
1111

12-
network = pytest.mark.network
12+
requires_network = pytest.mark.network
1313

1414

1515
def _importorskip(

virtualizarr/tests/test_backend.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@
1515
from virtualizarr.readers.hdf import HDFVirtualBackend
1616
from virtualizarr.tests import (
1717
has_astropy,
18-
network,
1918
requires_kerchunk,
19+
requires_network,
2020
requires_s3fs,
2121
requires_scipy,
2222
)
@@ -193,7 +193,7 @@ def test_var_attr_coords(self, netcdf4_file_with_2d_coords):
193193
assert set(vds.coords) == set(expected_coords)
194194

195195

196-
@network
196+
@requires_network
197197
@requires_s3fs
198198
class TestReadFromS3:
199199
@pytest.mark.parametrize(
@@ -216,7 +216,7 @@ def test_anon_read_s3(self, indexes, hdf_backend):
216216
assert isinstance(vds[var].data, ManifestArray), var
217217

218218

219-
@network
219+
@requires_network
220220
@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
221221
class TestReadFromURL:
222222
@pytest.mark.parametrize(
@@ -383,9 +383,14 @@ def test_explicit_filetype(self, netcdf4_file):
383383
with pytest.raises(ValueError):
384384
open_virtual_dataset(netcdf4_file, filetype="unknown")
385385

386+
with pytest.raises(ValueError):
387+
open_virtual_dataset(netcdf4_file, filetype=ManifestArray)
388+
386389
with pytest.raises(NotImplementedError):
387390
open_virtual_dataset(netcdf4_file, filetype="grib")
388391

392+
open_virtual_dataset(netcdf4_file, filetype="netCDF4")
393+
389394
def test_explicit_filetype_and_backend(self, netcdf4_file):
390395
with pytest.raises(ValueError):
391396
open_virtual_dataset(

0 commit comments

Comments
 (0)