Skip to content

Commit a2fe1a7

Browse files
authored
Merge branch 'main' into fix_subgroup_dims_HDF
2 parents 2f1f637 + 0d2d6ab commit a2fe1a7

File tree

16 files changed

+323
-148
lines changed

16 files changed

+323
-148
lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ repos:
1111

1212
- repo: https://github.com/astral-sh/ruff-pre-commit
1313
# Ruff version.
14-
rev: "v0.8.1"
14+
rev: "v0.8.6"
1515
hooks:
1616
# Run the linter.
1717
- id: ruff

ci/upstream.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ channels:
33
- conda-forge
44
- nodefaults
55
dependencies:
6-
- xarray>=2024.10.0
6+
- xarray>=2024.10.0,<2025.0.0
77
- h5netcdf
88
- h5py
99
- hdf5
@@ -28,6 +28,6 @@ dependencies:
2828
- fsspec
2929
- pip
3030
- pip:
31-
- icechunk>=0.1.0a7 # Installs zarr v3 as dependency
31+
- icechunk==0.1.0a8 # Installs zarr v3 beta 3 as dependency
3232
# - git+https://github.com/fsspec/kerchunk@main # kerchunk is currently incompatible with zarr-python v3 (https://github.com/fsspec/kerchunk/pull/516)
3333
- imagecodecs-numcodecs==2024.6.1

docs/releases.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,10 @@ New Features
1111

1212
- Added a ``.nbytes`` accessor method which displays the bytes needed to hold the virtual references in memory.
1313
(:issue:`167`, :pull:`227`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
14+
- Sync with Icechunk v0.1.0a8 (:pull:`368`) By `Matthew Iannucci <https://github.com/mpiannucci>`_. This also adds support
15+
for the `to_icechunk` method to add timestamps as checksums when writing virtual references to an icechunk store. This
16+
is useful for ensuring that virtual references are not stale when reading from an icechunk store, which can happen if the
17+
underlying data has changed since the virtual references were written.
1418

1519
Breaking changes
1620
~~~~~~~~~~~~~~~~
@@ -33,6 +37,10 @@ Bug fixes
3337
(:issue:`336`, :pull:`338`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
3438
- Fix bug in HDF reader where dimension names of dimensions in a subgroup would be incorrect.
3539
(:issue:`364`, :pull:`366`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
40+
- Fix bug in dmrpp reader so _FillValue is included in variables' encodings.
41+
(:pull:`369`) By `Aimee Barciauskas <https://github.com/abarciauskas-bgse>`_.
42+
- Fix bug passing arguments to FITS reader, and test it on Hubble Space Telescope data.
43+
(:pull:`363`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
3644

3745
Documentation
3846
~~~~~~~~~~~~~

docs/usage.md

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -421,14 +421,16 @@ By default references are placed in separate parquet file when the total number
421421
We can also write these references out as an [IcechunkStore](https://icechunk.io/). `Icechunk` is a Open-source, cloud-native transactional tensor storage engine that is compatible with zarr version 3. To export our virtual dataset to an `Icechunk` Store, we simply use the {py:meth}`vds.virtualize.to_icechunk <virtualizarr.VirtualiZarrDatasetAccessor.to_icechunk>` accessor method.
422422

423423
```python
424-
# create an icechunk store
425-
from icechunk import IcechunkStore, StorageConfig, StoreConfig, VirtualRefConfig
426-
storage = StorageConfig.filesystem(str('combined'))
427-
store = IcechunkStore.create(storage=storage, mode="w", config=StoreConfig(
428-
virtual_ref_config=VirtualRefConfig.s3_anonymous(region='us-east-1'),
429-
))
430-
431-
combined_vds.virtualize.to_icechunk(store)
424+
# create an icechunk repository, session and write the virtual dataset to the session
425+
from icechunk import Repository, Storage, VirtualChunkContainer, local_filesystem_storage
426+
storage = local_filesystem_storage(str('combined'))
427+
428+
# By default, local virtual references and public remote virtual references can be read without extra configuration.
429+
repo = Repository.create(storage=storage)
430+
session = repo.writeable_session("main")
431+
432+
# write the virtual dataset to the session with the IcechunkStore
433+
combined_vds.virtualize.to_icechunk(session.store)
432434
```
433435

434436
See the [Icechunk documentation](https://icechunk.io/icechunk-python/virtual/#creating-a-virtual-dataset-with-virtualizarr) for more details.

pyproject.toml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ classifiers = [
2121
requires-python = ">=3.10"
2222
dynamic = ["version"]
2323
dependencies = [
24-
"xarray>=2024.10.0",
24+
"xarray>=2024.10.0,<2025.0.0",
2525
"numpy>=2.0.0",
2626
"packaging",
2727
"universal-pathlib",
@@ -39,7 +39,7 @@ hdf_reader = [
3939
"numcodecs"
4040
]
4141
icechunk = [
42-
"icechunk>=0.1.0a7",
42+
"icechunk==0.1.0a8",
4343
]
4444
test = [
4545
"codecov",
@@ -103,6 +103,10 @@ ignore_missing_imports = true
103103
module = "ujson.*"
104104
ignore_missing_imports = true
105105

106+
[[tool.mypy.overrides]]
107+
module = "zarr.*"
108+
ignore_missing_imports = true
109+
106110
[tool.ruff]
107111
# Same as Black.
108112
line-length = 88

virtualizarr/accessor.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from datetime import datetime
12
from pathlib import Path
23
from typing import TYPE_CHECKING, Callable, Literal, Optional, overload
34

@@ -39,7 +40,10 @@ def to_zarr(self, storepath: str) -> None:
3940
dataset_to_zarr(self.ds, storepath)
4041

4142
def to_icechunk(
42-
self, store: "IcechunkStore", append_dim: Optional[str] = None
43+
self,
44+
store: "IcechunkStore",
45+
append_dim: Optional[str] = None,
46+
last_updated_at: Optional[datetime] = None,
4347
) -> None:
4448
"""
4549
Write an xarray dataset to an Icechunk store.
@@ -48,10 +52,30 @@ def to_icechunk(
4852

4953
If `append_dim` is provided, the virtual dataset will be appended to the existing IcechunkStore along the `append_dim` dimension.
5054

55+
If `last_updated_at` is provided, it will be used as a checksum for any virtual chunks written to the store with this operation.
56+
At read time, if any of the virtual chunks have been updated since this provided datetime, an error will be raised.
57+
This protects against reading outdated virtual chunks that have been updated since the last read. When not provided, no check is performed.
58+
This value is stored in Icechunk with seconds precision, so be sure to take that into account when providing this value.
59+
5160
Parameters
5261
----------
5362
store: IcechunkStore
5463
append_dim: str, optional
64+
When provided, specifies the dimension along which to append the virtual dataset.
65+
last_updated_at: datetime, optional
66+
When provided, uses provided datetime as a checksum for any virtual chunks written to the store with this operation.
67+
When not provided (default), no check is performed.
68+
69+
Examples
70+
--------
71+
To ensure an error is raised if the files containing referenced virtual chunks are modified at any time from now on, pass the current time to ``last_updated_at``.
72+
73+
>>> from datetime import datetime
74+
>>>
75+
>>> vds.virtualize.to_icechunk(
76+
... icechunkstore,
77+
... last_updated_at=datetime.now(),
78+
... )
5579
"""
5680
from virtualizarr.writers.icechunk import dataset_to_icechunk
5781

virtualizarr/backend.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ def open_virtual_dataset(
128128
----------
129129
filepath : str, default None
130130
File path to open as a set of virtualized zarr arrays.
131-
filetype : FileType, default None
131+
filetype : FileType or str, default None
132132
Type of file to be opened. Used to determine which kerchunk file format backend to use.
133133
Can be one of {'netCDF3', 'netCDF4', 'HDF', 'TIFF', 'GRIB', 'FITS', 'dmrpp', 'zarr_v3', 'kerchunk'}.
134134
If not provided will attempt to automatically infer the correct filetype from header bytes.
@@ -182,13 +182,16 @@ def open_virtual_dataset(
182182
if backend and filetype:
183183
raise ValueError("Cannot pass both a filetype and an explicit VirtualBackend")
184184

185-
if filetype is not None:
186-
# if filetype is user defined, convert to FileType
187-
filetype = FileType(filetype)
188-
else:
185+
if filetype is None:
189186
filetype = automatically_determine_filetype(
190187
filepath=filepath, reader_options=reader_options
191188
)
189+
elif isinstance(filetype, str):
190+
# if filetype is a user defined string, convert to FileType
191+
filetype = FileType(filetype.lower())
192+
elif not isinstance(filetype, FileType):
193+
raise ValueError("Filetype must be a valid string or FileType")
194+
192195
if backend:
193196
backend_cls = backend
194197
else:

virtualizarr/readers/dmrpp.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -411,7 +411,8 @@ def _parse_variable(self, var_tag: ET.Element) -> Variable:
411411
attrs: dict[str, Any] = {}
412412
for attr_tag in var_tag.iterfind("dap:Attribute", self._NS):
413413
attrs.update(self._parse_attribute(attr_tag))
414-
# Fill value is placed in encoding and thus removed from attributes
414+
# Fill value is placed in zarr array's fill_value and variable encoding and removed from attributes
415+
encoding = {k: attrs.get(k) for k in self._ENCODING_KEYS if k in attrs}
415416
fill_value = attrs.pop("_FillValue", None)
416417
# create ManifestArray and ZArray
417418
zarray = ZArray(
@@ -423,7 +424,6 @@ def _parse_variable(self, var_tag: ET.Element) -> Variable:
423424
shape=shape,
424425
)
425426
marr = ManifestArray(zarray=zarray, chunkmanifest=chunkmanifest)
426-
encoding = {k: attrs.get(k) for k in self._ENCODING_KEYS if k in attrs}
427427
return Variable(dims=dims.keys(), data=marr, attrs=attrs, encoding=encoding)
428428

429429
def _parse_attribute(self, attr_tag: ET.Element) -> dict[str, Any]:

virtualizarr/readers/fits.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def open_virtual_dataset(
4242

4343
# TODO This wouldn't work until either you had an xarray backend for FITS installed, or issue #124 is implemented to load data from ManifestArrays directly
4444
# TODO Once we have one of those we can use ``maybe_open_loadable_vars_and_indexes`` here
45-
if loadable_variables != [] or indexes != {} or decode_times:
45+
if loadable_variables or indexes:
4646
raise NotImplementedError(
4747
"Cannot load variables or indexes from FITS files as there is no xarray backend engine for FITS"
4848
)

virtualizarr/readers/hdf/hdf.py

Lines changed: 30 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,18 @@
11
import math
22
from pathlib import Path
3-
from typing import TYPE_CHECKING, Dict, Iterable, List, Mapping, Optional, Union
3+
from typing import (
4+
TYPE_CHECKING,
5+
Any,
6+
Dict,
7+
Iterable,
8+
List,
9+
Mapping,
10+
Optional,
11+
Union,
12+
)
413

514
import numpy as np
615
import xarray as xr
7-
from xarray import Dataset, Index, Variable
816

917
from virtualizarr.manifests import (
1018
ChunkEntry,
@@ -22,17 +30,15 @@
2230
from virtualizarr.utils import _FsspecFSFromFilepath, check_for_collisions, soft_import
2331
from virtualizarr.zarr import ZArray
2432

25-
if TYPE_CHECKING:
26-
import h5py # type: ignore
27-
from h5py import Dataset, Group # type: ignore
28-
2933
h5py = soft_import("h5py", "For reading hdf files", strict=False)
30-
if h5py:
31-
Dataset = h5py.Dataset # type: ignore
32-
Group = h5py.Group # type: ignore
34+
35+
36+
if TYPE_CHECKING:
37+
from h5py import Dataset as H5Dataset # type: ignore[import-untyped]
38+
from h5py import Group as H5Group # type: ignore[import-untyped]
3339
else:
34-
Dataset = dict() # type: ignore
35-
Group = dict() # type: ignore
40+
H5Dataset: Any = None
41+
H5Group: Any = None
3642

3743

3844
class HDFVirtualBackend(VirtualBackend):
@@ -43,7 +49,7 @@ def open_virtual_dataset(
4349
drop_variables: Iterable[str] | None = None,
4450
loadable_variables: Iterable[str] | None = None,
4551
decode_times: bool | None = None,
46-
indexes: Mapping[str, Index] | None = None,
52+
indexes: Mapping[str, xr.Index] | None = None,
4753
virtual_backend_kwargs: Optional[dict] = None,
4854
reader_options: Optional[dict] = None,
4955
) -> xr.Dataset:
@@ -92,7 +98,10 @@ def open_virtual_dataset(
9298
)
9399

94100
@staticmethod
95-
def _dataset_chunk_manifest(path: str, dataset: Dataset) -> Optional[ChunkManifest]:
101+
def _dataset_chunk_manifest(
102+
path: str,
103+
dataset: H5Dataset,
104+
) -> Optional[ChunkManifest]:
96105
"""
97106
Generate ChunkManifest for HDF5 dataset.
98107

@@ -116,7 +125,7 @@ def _dataset_chunk_manifest(path: str, dataset: Dataset) -> Optional[ChunkManife
116125
key_list = [0] * (len(dataset.shape) or 1)
117126
key = ".".join(map(str, key_list))
118127

119-
chunk_entry = ChunkEntry.with_validation(
128+
chunk_entry: ChunkEntry = ChunkEntry.with_validation( # type: ignore[attr-defined]
120129
path=path, offset=dsid.get_offset(), length=dsid.get_storage_size()
121130
)
122131
chunk_key = ChunkKey(key)
@@ -160,7 +169,7 @@ def add_chunk_info(blob):
160169
return chunk_manifest
161170

162171
@staticmethod
163-
def _dataset_dims(dataset: Dataset, group: str = "") -> List[str]:
172+
def _dataset_dims(dataset: H5Dataset, group: str = "") -> List[str]:
164173
"""
165174
Get a list of dimension scale names attached to input HDF5 dataset.
166175

@@ -208,7 +217,7 @@ def _dataset_dims(dataset: Dataset, group: str = "") -> List[str]:
208217
return [dim.removeprefix(group) for dim in dims]
209218

210219
@staticmethod
211-
def _extract_attrs(h5obj: Union[Dataset, Group]):
220+
def _extract_attrs(h5obj: Union[H5Dataset, H5Group]):
212221
"""
213222
Extract attributes from an HDF5 group or dataset.
214223

@@ -256,7 +265,7 @@ def _extract_attrs(h5obj: Union[Dataset, Group]):
256265
@staticmethod
257266
def _dataset_to_variable(
258267
path: str,
259-
dataset: Dataset,
268+
dataset: H5Dataset,
260269
group: str,
261270
) -> Optional[Variable]:
262271
"""
@@ -311,9 +320,9 @@ def _dataset_to_variable(
311320
manifest = HDFVirtualBackend._dataset_chunk_manifest(path, dataset)
312321
if manifest:
313322
marray = ManifestArray(zarray=zarray, chunkmanifest=manifest)
314-
variable = Variable(data=marray, dims=dims, attrs=attrs)
323+
variable = xr.Variable(data=marray, dims=dims, attrs=attrs)
315324
else:
316-
variable = Variable(data=np.empty(dataset.shape), dims=dims, attrs=attrs)
325+
variable = xr.Variable(data=np.empty(dataset.shape), dims=dims, attrs=attrs)
317326
return variable
318327

319328
@staticmethod
@@ -324,7 +333,7 @@ def _virtual_vars_from_hdf(
324333
reader_options: Optional[dict] = {
325334
"storage_options": {"key": "", "secret": "", "anon": True}
326335
},
327-
) -> Dict[str, Variable]:
336+
) -> Dict[str, xr.Variable]:
328337
"""
329338
Extract xarray Variables with ManifestArray data from an HDF file or group
330339

@@ -364,7 +373,7 @@ def _virtual_vars_from_hdf(
364373
variables = {}
365374
for key in g.keys():
366375
if key not in drop_variables:
367-
if isinstance(g[key], Dataset):
376+
if isinstance(g[key], h5py.Dataset):
368377
variable = HDFVirtualBackend._dataset_to_variable(
369378
path=path,
370379
dataset=g[key],

0 commit comments

Comments
 (0)