diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d079bc6..c8c991c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v4.6.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer @@ -9,17 +9,17 @@ repos: - id: check-yaml - id: debug-statements - id: mixed-line-ending - - repo: https://github.com/charliermarsh/ruff-pre-commit - rev: 'v0.0.291' + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: 'v0.5.0' hooks: - id: ruff args: [ "--fix" ] - repo: https://github.com/psf/black - rev: 23.9.1 + rev: 24.4.2 hooks: - id: black - repo: https://github.com/adamchainz/blacken-docs - rev: "1.16.0" + rev: "1.18.0" hooks: - id: blacken-docs additional_dependencies: diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 969ff17..0ff24a3 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,52 @@ Change Log ---------- +Version 1.6.1 (March 7th, 2025): + +- Let Variable.chunks return None for scalar variables, independent of what the underlying + h5ds object returns ({pull}`259`). + By `Rickard Holmberg `_ + +Version 1.6.0 (March 7th, 2025): + +- Allow specifying `h5netcdf.File(driver="h5pyd")` to force the use of h5pyd ({issue}`255`, {pull}`256`). + By `Rickard Holmberg `_ +- Add pytest-mypy-plugins for xarray nightly test ({pull}`257`). + By `Kai Mühlbauer `_ + +Version 1.5.0 (January 26th, 2025): + +- Update CI to new versions (Python 3.13, 3.14 alpha), remove numpy 1 from h5pyd runs ({pull}`250`). + By `Kai Mühlbauer `_ +- Update CI and reinstate h5pyd/hsds test runs ({pull}`247`). + By `John Readey `_ +- Allow ``zlib`` to be used as an alias for ``gzip`` for enhanced compatibility with h5netcdf's API and xarray. + By `Mark Harfouche `_ + +Version 1.4.1 (November 13th, 2024): + +- Add CI run for hdf5 1.10.6, fix complex tests, fix enum/user type tests ({pull}`244`). + By `Kai Mühlbauer `_ + + +Version 1.4.0 (October 7th, 2024): + +- Add UserType class, add EnumType ({pull}`229`). + By `Kai Mühlbauer `_ +- Refactor fillvalue and dtype handling for user types, enhance sanity checks and tests ({pull}`230`). + By `Kai Mühlbauer `_ +- Add VLType and CompoundType, commit complex compound type to file. Align with nc-complex ({pull}`227`). + By `Kai Mühlbauer `_ +- Update h5pyd testing. + By `Kai Mühlbauer `_ +- CI and lint maintenance ({pull}`235`). + By `Kai Mühlbauer `_ +- Support wrapping an h5py ``File`` object. Closing the h5netcdf file object + does not close the h5py file ({pull}`238`). + By `Thomas Kluyver `_ +- CI and lint maintenance (format README.rst, use more f-strings, change Python 3.9 to 3.10 in CI) ({pull}`239`). + By `Kai Mühlbauer `_ + Version 1.3.0 (November 7th, 2023): - Add ros3 support by checking `driver`-kwarg. 
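The features named in the new changelog entries above can be exercised roughly as sketched below. This is an illustrative snippet, not part of the patch; it only uses behaviour documented elsewhere in this diff (``compression="zlib"`` as an alias for ``gzip``, wrapping an already-open ``h5py.File``, and ``driver="h5pyd"``), and the file names are placeholders.

.. code-block:: python

    import h5py
    import numpy as np

    import h5netcdf

    # "zlib" is accepted as an alias for "gzip" compression (1.5.0)
    with h5netcdf.File("mydata.nc", "w") as f:
        f.dimensions = {"x": 100}
        v = f.create_variable("data", ("x",), float, compression="zlib")
        v[:] = np.arange(100)

    # wrap an already-open h5py File object (1.4.0); closing the h5netcdf
    # wrapper does not close the underlying h5py file
    h5file = h5py.File("mydata.nc", "r")
    with h5netcdf.File(h5file, "r") as f:
        print(f["data"])  # h5netcdf Variable backed by the open h5py file
    h5file.close()

    # force the h5pyd backend even for a non-URL path (1.6.0);
    # this needs a running HSDS endpoint, so it is shown commented out
    # f = h5netcdf.File("/home/user/mydata.nc", "r", driver="h5pyd")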
diff --git a/PKG-INFO b/PKG-INFO index f93f473..bbccf9b 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,6 +1,6 @@ -Metadata-Version: 2.1 +Metadata-Version: 2.2 Name: h5netcdf -Version: 1.3.0 +Version: 1.6.1 Summary: netCDF4 via h5py Author-email: Stephan Hoyer , Kai Mühlbauer Maintainer-email: h5netcdf developers @@ -45,6 +45,7 @@ Classifier: Programming Language :: Python :: 3 Classifier: Programming Language :: Python :: 3.9 Classifier: Programming Language :: Python :: 3.10 Classifier: Programming Language :: Python :: 3.11 +Classifier: Programming Language :: Python :: 3.12 Classifier: Topic :: Scientific/Engineering Requires-Python: >=3.9 Description-Content-Type: text/x-rst @@ -138,32 +139,32 @@ design is an adaptation of h5py to the netCDF data model. For example: import h5netcdf import numpy as np - with h5netcdf.File('mydata.nc', 'w') as f: + with h5netcdf.File("mydata.nc", "w") as f: # set dimensions with a dictionary - f.dimensions = {'x': 5} + f.dimensions = {"x": 5} # and update them with a dict-like interface # f.dimensions['x'] = 5 # f.dimensions.update({'x': 5}) - v = f.create_variable('hello', ('x',), float) + v = f.create_variable("hello", ("x",), float) v[:] = np.ones(5) # you don't need to create groups first # you also don't need to create dimensions first if you supply data # with the new variable - v = f.create_variable('/grouped/data', ('y',), data=np.arange(10)) + v = f.create_variable("/grouped/data", ("y",), data=np.arange(10)) # access and modify attributes with a dict-like interface - v.attrs['foo'] = 'bar' + v.attrs["foo"] = "bar" # you can access variables and groups directly using a hierarchical # keys like h5py - print(f['/grouped/data']) + print(f["/grouped/data"]) # add an unlimited dimension - f.dimensions['z'] = None + f.dimensions["z"] = None # explicitly resize a dimension and all variables using it - f.resize_dimension('z', 3) + f.resize_dimension("z", 3) Notes: @@ -184,22 +185,23 @@ The legacy API is designed for compatibility with `netCDF4-python`_. To use it, .. code-block:: python import h5netcdf.legacyapi as netCDF4 + # everything here would also work with this instead: # import netCDF4 import numpy as np - with netCDF4.Dataset('mydata.nc', 'w') as ds: - ds.createDimension('x', 5) - v = ds.createVariable('hello', float, ('x',)) + with netCDF4.Dataset("mydata.nc", "w") as ds: + ds.createDimension("x", 5) + v = ds.createVariable("hello", float, ("x",)) v[:] = np.ones(5) - g = ds.createGroup('grouped') - g.createDimension('y', 10) - g.createVariable('data', 'i8', ('y',)) - v = g['data'] + g = ds.createGroup("grouped") + g.createDimension("y", 10) + g.createVariable("data", "i8", ("y",)) + v = g["data"] v[:] = np.arange(10) - v.foo = 'bar' - print(ds.groups['grouped'].variables['data']) + v.foo = "bar" + print(ds.groups["grouped"].variables["data"]) The legacy API is designed to be easy to try-out for netCDF4-python users, but it is not an exact match. Here is an incomplete list of functionality we don't include: @@ -222,9 +224,6 @@ h5py implements some features that do not (yet) result in valid netCDF files: - Data types: - Booleans - - Complex values - - Non-string variable length types - - Enum types - Reference types - Arbitrary filters: - Scale-offset filters @@ -239,11 +238,11 @@ when creating a file: .. code-block:: python # avoid the .nc extension for non-netcdf files - f = h5netcdf.File('mydata.h5', invalid_netcdf=True) + f = h5netcdf.File("mydata.h5", invalid_netcdf=True) ... 
# works with the legacy API, too, though compression options are not exposed - ds = h5netcdf.legacyapi.Dataset('mydata.h5', invalid_netcdf=True) + ds = h5netcdf.legacyapi.Dataset("mydata.h5", invalid_netcdf=True) ... In such cases the `_NCProperties` attribute will not be saved to the file or be removed @@ -281,7 +280,7 @@ phony dimensions according to `netCDF`_ behaviour. .. code-block:: python # mimic netCDF-behaviour for non-netcdf files - f = h5netcdf.File('mydata.h5', mode='r', phony_dims='sort') + f = h5netcdf.File("mydata.h5", mode="r", phony_dims="sort") ... Note, that this iterates once over the whole group-hierarchy. This has affects @@ -292,7 +291,7 @@ to group access time. The created phony dimension naming will differ from .. code-block:: python - f = h5netcdf.File('mydata.h5', mode='r', phony_dims='access') + f = h5netcdf.File("mydata.h5", mode="r", phony_dims="access") ... .. rubric:: Footnotes diff --git a/README.rst b/README.rst index e93dddb..2df3cca 100644 --- a/README.rst +++ b/README.rst @@ -80,32 +80,32 @@ design is an adaptation of h5py to the netCDF data model. For example: import h5netcdf import numpy as np - with h5netcdf.File('mydata.nc', 'w') as f: + with h5netcdf.File("mydata.nc", "w") as f: # set dimensions with a dictionary - f.dimensions = {'x': 5} + f.dimensions = {"x": 5} # and update them with a dict-like interface # f.dimensions['x'] = 5 # f.dimensions.update({'x': 5}) - v = f.create_variable('hello', ('x',), float) + v = f.create_variable("hello", ("x",), float) v[:] = np.ones(5) # you don't need to create groups first # you also don't need to create dimensions first if you supply data # with the new variable - v = f.create_variable('/grouped/data', ('y',), data=np.arange(10)) + v = f.create_variable("/grouped/data", ("y",), data=np.arange(10)) # access and modify attributes with a dict-like interface - v.attrs['foo'] = 'bar' + v.attrs["foo"] = "bar" # you can access variables and groups directly using a hierarchical # keys like h5py - print(f['/grouped/data']) + print(f["/grouped/data"]) # add an unlimited dimension - f.dimensions['z'] = None + f.dimensions["z"] = None # explicitly resize a dimension and all variables using it - f.resize_dimension('z', 3) + f.resize_dimension("z", 3) Notes: @@ -126,22 +126,23 @@ The legacy API is designed for compatibility with `netCDF4-python`_. To use it, .. code-block:: python import h5netcdf.legacyapi as netCDF4 + # everything here would also work with this instead: # import netCDF4 import numpy as np - with netCDF4.Dataset('mydata.nc', 'w') as ds: - ds.createDimension('x', 5) - v = ds.createVariable('hello', float, ('x',)) + with netCDF4.Dataset("mydata.nc", "w") as ds: + ds.createDimension("x", 5) + v = ds.createVariable("hello", float, ("x",)) v[:] = np.ones(5) - g = ds.createGroup('grouped') - g.createDimension('y', 10) - g.createVariable('data', 'i8', ('y',)) - v = g['data'] + g = ds.createGroup("grouped") + g.createDimension("y", 10) + g.createVariable("data", "i8", ("y",)) + v = g["data"] v[:] = np.arange(10) - v.foo = 'bar' - print(ds.groups['grouped'].variables['data']) + v.foo = "bar" + print(ds.groups["grouped"].variables["data"]) The legacy API is designed to be easy to try-out for netCDF4-python users, but it is not an exact match. 
Here is an incomplete list of functionality we don't include: @@ -164,9 +165,6 @@ h5py implements some features that do not (yet) result in valid netCDF files: - Data types: - Booleans - - Complex values - - Non-string variable length types - - Enum types - Reference types - Arbitrary filters: - Scale-offset filters @@ -181,11 +179,11 @@ when creating a file: .. code-block:: python # avoid the .nc extension for non-netcdf files - f = h5netcdf.File('mydata.h5', invalid_netcdf=True) + f = h5netcdf.File("mydata.h5", invalid_netcdf=True) ... # works with the legacy API, too, though compression options are not exposed - ds = h5netcdf.legacyapi.Dataset('mydata.h5', invalid_netcdf=True) + ds = h5netcdf.legacyapi.Dataset("mydata.h5", invalid_netcdf=True) ... In such cases the `_NCProperties` attribute will not be saved to the file or be removed @@ -223,7 +221,7 @@ phony dimensions according to `netCDF`_ behaviour. .. code-block:: python # mimic netCDF-behaviour for non-netcdf files - f = h5netcdf.File('mydata.h5', mode='r', phony_dims='sort') + f = h5netcdf.File("mydata.h5", mode="r", phony_dims="sort") ... Note, that this iterates once over the whole group-hierarchy. This has affects @@ -234,7 +232,7 @@ to group access time. The created phony dimension naming will differ from .. code-block:: python - f = h5netcdf.File('mydata.h5', mode='r', phony_dims='access') + f = h5netcdf.File("mydata.h5", mode="r", phony_dims="access") ... .. rubric:: Footnotes diff --git a/debian/changelog b/debian/changelog index 0eb7a50..96c90f4 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,62 @@ +python-h5netcdf (1.6.1-1) unstable; urgency=medium + + * Team upload. + * New upstream release + * Standards-Version: 4.7.2 + + -- Drew Parsons Fri, 04 Apr 2025 12:06:33 +0200 + +python-h5netcdf (1.5.0-1) unstable; urgency=medium + + * Team upload. + * New upstream release + + -- Drew Parsons Thu, 20 Feb 2025 20:12:37 +0100 + +python-h5netcdf (1.4.1-1) unstable; urgency=medium + + * Team upload. + * New upstream release + - applies debian patch fix_tests_PR244.patch + + -- Drew Parsons Sun, 24 Nov 2024 23:17:49 +0100 + +python-h5netcdf (1.4.0-3) unstable; urgency=medium + + * Team upload. + * replace debian patches 32bit_skip_complex_type_creation.patch and + test_h5py_TypeError.patch with fix_tests_PR244.patch, applying + upstream PR#244 to fix tests. Closes: #1087199. + + -- Drew Parsons Wed, 13 Nov 2024 00:08:15 +0100 + +python-h5netcdf (1.4.0-2) unstable; urgency=medium + + * Team upload. + * debian patch 32bit_skip_complex_type_creation.patch works around + 32-bit complex type test failure. See Bug#1087199. + + -- Drew Parsons Sat, 09 Nov 2024 15:30:00 +0100 + +python-h5netcdf (1.4.0-1) unstable; urgency=medium + + * Team upload. + * New upstream release. + - Build-Depends: python3-packaging + * use pybuild build_dir to build docs + * ignore pycache dir in doc build if it wasn't created + * add debian patches + - doc_no_sphinx_book_theme.patch disables use of sphinx-book-theme + in docs. sphinx-book-theme is essentially unusable, requiring a + specific nodejs version which is not available. + - test_h5py_TypeError.patch catches h5py exception as TypeError + not KeyError. See upstream Issue#236. + * remove generated test files after buildtime testing + * Standards-Version: 4.7.0 + * debian/tests: run tests with and without internet access (ros3) + + -- Drew Parsons Fri, 08 Nov 2024 14:18:45 +0100 + python-h5netcdf (1.3.0-1) unstable; urgency=medium * Team upload. 
diff --git a/debian/control b/debian/control index 3426070..ed108ee 100644 --- a/debian/control +++ b/debian/control @@ -7,14 +7,14 @@ Build-Depends: debhelper-compat (= 13), dh-python, pybuild-plugin-pyproject, python3-all, - python3-sphinx-book-theme , python3-h5py , python3-netcdf4 , + python3-packaging, python3-pytest , python3-setuptools, python3-setuptools-scm, python3-sphinx -Standards-Version: 4.6.2 +Standards-Version: 4.7.2 Vcs-Browser: https://salsa.debian.org/science-team/python-h5netcdf Vcs-Git: https://salsa.debian.org/science-team/python-h5netcdf.git Homepage: https://github.com/shoyer/h5netcdf diff --git a/debian/patches/doc_no_sphinx_book_theme.patch b/debian/patches/doc_no_sphinx_book_theme.patch new file mode 100644 index 0000000..c476f30 --- /dev/null +++ b/debian/patches/doc_no_sphinx_book_theme.patch @@ -0,0 +1,13 @@ +Index: python-h5netcdf/doc/conf.py +=================================================================== +--- python-h5netcdf.orig/doc/conf.py 2024-11-08 13:48:46.635215013 +0100 ++++ python-h5netcdf/doc/conf.py 2024-11-08 13:49:16.371504301 +0100 +@@ -72,7 +72,7 @@ + # The theme to use for HTML and HTML Help pages. See the documentation for + # a list of builtin themes. + # +-html_theme = "sphinx_book_theme" ++#html_theme = "sphinx_book_theme" + html_title = f"h5netcdf - {release}" + + html_baseurl = "https://h5netcdf.org" diff --git a/debian/patches/series b/debian/patches/series new file mode 100644 index 0000000..d389611 --- /dev/null +++ b/debian/patches/series @@ -0,0 +1 @@ +doc_no_sphinx_book_theme.patch diff --git a/debian/rules b/debian/rules index 69281ed..977a921 100755 --- a/debian/rules +++ b/debian/rules @@ -6,6 +6,8 @@ export PYBUILD_NAME=h5netcdf export PYBUILD_TEST_ARGS=-v -k "not test_ros3" +PY3DEF = $(shell py3versions -vd) + %: dh $@ --with python3,sphinxdoc --buildsystem=pybuild @@ -14,6 +16,9 @@ execute_after_dh_clean: execute_after_dh_auto_build: ifeq (,$(findstring nodoc, $(DEB_BUILD_OPTIONS))) - PYTHONPATH=. sphinx-build -b html -N doc/ $(CURDIR)/.pybuild/docs/html - rm -r $(CURDIR)/.pybuild/docs/html/_static/__pycache__ + PYTHONPATH=`pybuild -p $(PY3DEF) --print "{build_dir}"` sphinx-build -b html -N doc/ $(CURDIR)/.pybuild/docs/html + rm -rf $(CURDIR)/.pybuild/docs/html/_static/__pycache__ endif + +execute_after_dh_auto_test: + rm -rf .pybuild/*/build/test.nc diff --git a/debian/tests/control b/debian/tests/control index 27e51b4..4176e46 100644 --- a/debian/tests/control +++ b/debian/tests/control @@ -3,7 +3,16 @@ Test-Command: set -e ; for py in $(py3versions -r 2>/dev/null) ; do cd "$AUTOPKGTEST_TMP" ; echo "Testing with $py:" - ; $py -m pytest -v tests + ; $py -m pytest -v tests -k "not ros3" + ; done +Depends: @, python3-all, python3-netcdf4, python3-pytest + +Test-Command: set -e + ; cp -a h5netcdf/tests "$AUTOPKGTEST_TMP" + ; for py in $(py3versions -r 2>/dev/null) + ; do cd "$AUTOPKGTEST_TMP" + ; echo "Testing with $py:" + ; $py -m pytest -v tests -k ros3 ; done Depends: @, python3-all, python3-netcdf4, python3-pytest Restrictions: needs-internet diff --git a/doc/api.rst b/doc/api.rst index 4ee0983..a8bc75e 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -18,3 +18,6 @@ This page provides an auto-generated summary of h5netcdf's new API. 
Group Dimension Variable + CompoundType + EnumType + VLType diff --git a/doc/conf.py b/doc/conf.py index 2001c9d..0b3317a 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -130,7 +130,7 @@ # extract git revision gh_tree_name = version_tuple[-1].split(".")[0][1:] -rel = "`{0} <{1}/h5netcdf/tree/{2}>`__".format(release, url, gh_tree_name) +rel = f"`{release} <{url}/h5netcdf/tree/{gh_tree_name}>`__" rst_epilog = "" rst_epilog += f""" diff --git a/doc/devguide.rst b/doc/devguide.rst index 7b14736..e79b616 100644 --- a/doc/devguide.rst +++ b/doc/devguide.rst @@ -15,14 +15,18 @@ Contributors - `Brett Naul `_ - `Dion Häfner `_ - `Drew Parsons `_ +- `Ezequiel Cimadevilla Alvarez `_ - `Frédéric Laliberté `_ - `Ghislain Vaillant `_ +- `John Readey `_ - `Lion Krischer `_ - `Mark Harfouche `_ - `Martin Raspaud `_ - `Pierre Augier `_ +- `Rickard Holmberg `_ - `Ryan Grout `_ - `Scott Henderson `_ +- `Thomas Kluyver `_ - `Tom Augspurger `_ If you are interested to contribute, just let us know by creating an issue or pull request on github. diff --git a/doc/index.rst b/doc/index.rst index 0ed67e1..983aac1 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -50,7 +50,7 @@ by Stephan Hoyer. The first `official` ``h5netcdf`` announcement was made by Ste `xarray issue tracker`_ only one day later. The library evolved constantly over the years (fixing bugs and adding enhancements) -and gained contributions from 15 other :ref:`contributors` so far. The library is widely used, +and gained contributions from 19 other :ref:`contributors` so far. The library is widely used, especially as backend within `xarray`_. Early 2020 Kai Mühlbauer started to add contributions and after some time he volunteered diff --git a/doc/legacyapi.rst b/doc/legacyapi.rst index 4a17fcc..9254ee2 100644 --- a/doc/legacyapi.rst +++ b/doc/legacyapi.rst @@ -18,3 +18,6 @@ This page provides an auto-generated summary of h5netcdf's legacy API. Group Dimension Variable + CompoundType + EnumType + VLType diff --git a/h5netcdf.egg-info/PKG-INFO b/h5netcdf.egg-info/PKG-INFO index f93f473..bbccf9b 100644 --- a/h5netcdf.egg-info/PKG-INFO +++ b/h5netcdf.egg-info/PKG-INFO @@ -1,6 +1,6 @@ -Metadata-Version: 2.1 +Metadata-Version: 2.2 Name: h5netcdf -Version: 1.3.0 +Version: 1.6.1 Summary: netCDF4 via h5py Author-email: Stephan Hoyer , Kai Mühlbauer Maintainer-email: h5netcdf developers @@ -45,6 +45,7 @@ Classifier: Programming Language :: Python :: 3 Classifier: Programming Language :: Python :: 3.9 Classifier: Programming Language :: Python :: 3.10 Classifier: Programming Language :: Python :: 3.11 +Classifier: Programming Language :: Python :: 3.12 Classifier: Topic :: Scientific/Engineering Requires-Python: >=3.9 Description-Content-Type: text/x-rst @@ -138,32 +139,32 @@ design is an adaptation of h5py to the netCDF data model. 
For example: import h5netcdf import numpy as np - with h5netcdf.File('mydata.nc', 'w') as f: + with h5netcdf.File("mydata.nc", "w") as f: # set dimensions with a dictionary - f.dimensions = {'x': 5} + f.dimensions = {"x": 5} # and update them with a dict-like interface # f.dimensions['x'] = 5 # f.dimensions.update({'x': 5}) - v = f.create_variable('hello', ('x',), float) + v = f.create_variable("hello", ("x",), float) v[:] = np.ones(5) # you don't need to create groups first # you also don't need to create dimensions first if you supply data # with the new variable - v = f.create_variable('/grouped/data', ('y',), data=np.arange(10)) + v = f.create_variable("/grouped/data", ("y",), data=np.arange(10)) # access and modify attributes with a dict-like interface - v.attrs['foo'] = 'bar' + v.attrs["foo"] = "bar" # you can access variables and groups directly using a hierarchical # keys like h5py - print(f['/grouped/data']) + print(f["/grouped/data"]) # add an unlimited dimension - f.dimensions['z'] = None + f.dimensions["z"] = None # explicitly resize a dimension and all variables using it - f.resize_dimension('z', 3) + f.resize_dimension("z", 3) Notes: @@ -184,22 +185,23 @@ The legacy API is designed for compatibility with `netCDF4-python`_. To use it, .. code-block:: python import h5netcdf.legacyapi as netCDF4 + # everything here would also work with this instead: # import netCDF4 import numpy as np - with netCDF4.Dataset('mydata.nc', 'w') as ds: - ds.createDimension('x', 5) - v = ds.createVariable('hello', float, ('x',)) + with netCDF4.Dataset("mydata.nc", "w") as ds: + ds.createDimension("x", 5) + v = ds.createVariable("hello", float, ("x",)) v[:] = np.ones(5) - g = ds.createGroup('grouped') - g.createDimension('y', 10) - g.createVariable('data', 'i8', ('y',)) - v = g['data'] + g = ds.createGroup("grouped") + g.createDimension("y", 10) + g.createVariable("data", "i8", ("y",)) + v = g["data"] v[:] = np.arange(10) - v.foo = 'bar' - print(ds.groups['grouped'].variables['data']) + v.foo = "bar" + print(ds.groups["grouped"].variables["data"]) The legacy API is designed to be easy to try-out for netCDF4-python users, but it is not an exact match. Here is an incomplete list of functionality we don't include: @@ -222,9 +224,6 @@ h5py implements some features that do not (yet) result in valid netCDF files: - Data types: - Booleans - - Complex values - - Non-string variable length types - - Enum types - Reference types - Arbitrary filters: - Scale-offset filters @@ -239,11 +238,11 @@ when creating a file: .. code-block:: python # avoid the .nc extension for non-netcdf files - f = h5netcdf.File('mydata.h5', invalid_netcdf=True) + f = h5netcdf.File("mydata.h5", invalid_netcdf=True) ... # works with the legacy API, too, though compression options are not exposed - ds = h5netcdf.legacyapi.Dataset('mydata.h5', invalid_netcdf=True) + ds = h5netcdf.legacyapi.Dataset("mydata.h5", invalid_netcdf=True) ... In such cases the `_NCProperties` attribute will not be saved to the file or be removed @@ -281,7 +280,7 @@ phony dimensions according to `netCDF`_ behaviour. .. code-block:: python # mimic netCDF-behaviour for non-netcdf files - f = h5netcdf.File('mydata.h5', mode='r', phony_dims='sort') + f = h5netcdf.File("mydata.h5", mode="r", phony_dims="sort") ... Note, that this iterates once over the whole group-hierarchy. This has affects @@ -292,7 +291,7 @@ to group access time. The created phony dimension naming will differ from .. 
code-block:: python - f = h5netcdf.File('mydata.h5', mode='r', phony_dims='access') + f = h5netcdf.File("mydata.h5", mode="r", phony_dims="access") ... .. rubric:: Footnotes diff --git a/h5netcdf/__init__.py b/h5netcdf/__init__.py index 8f3c2bc..28bbee0 100644 --- a/h5netcdf/__init__.py +++ b/h5netcdf/__init__.py @@ -5,6 +5,7 @@ A Python library for the netCDF4 file-format that directly reads and writes HDF5 files via h5py, without using the Unidata netCDF library. """ + try: from ._version import version as __version__ except Exception: diff --git a/h5netcdf/_version.py b/h5netcdf/_version.py index adba37d..b1b0b71 100644 --- a/h5netcdf/_version.py +++ b/h5netcdf/_version.py @@ -1,8 +1,13 @@ -# file generated by setuptools_scm +# file generated by setuptools-scm # don't change, don't track in version control + +__all__ = ["__version__", "__version_tuple__", "version", "version_tuple"] + TYPE_CHECKING = False if TYPE_CHECKING: - from typing import Tuple, Union + from typing import Tuple + from typing import Union + VERSION_TUPLE = Tuple[Union[int, str], ...] else: VERSION_TUPLE = object @@ -12,5 +17,5 @@ __version_tuple__: VERSION_TUPLE version_tuple: VERSION_TUPLE -__version__ = version = '1.3.0' -__version_tuple__ = version_tuple = (1, 3, 0) +__version__ = version = '1.6.1' +__version_tuple__ = version_tuple = (1, 6, 1) diff --git a/h5netcdf/attrs.py b/h5netcdf/attrs.py index 4549ad7..326a0db 100644 --- a/h5netcdf/attrs.py +++ b/h5netcdf/attrs.py @@ -76,7 +76,7 @@ def __getitem__(self, key): def __setitem__(self, key, value): if key in _HIDDEN_ATTRS: - raise AttributeError("cannot write attribute with reserved name %r" % key) + raise AttributeError(f"cannot write attribute with reserved name {key!r}") if hasattr(value, "dtype"): dtype = value.dtype else: @@ -98,4 +98,4 @@ def __len__(self): return len(self._h5attrs) - hidden_count def __repr__(self): - return "\n".join(["%r" % type(self)] + [f"{k}: {v!r}" for k, v in self.items()]) + return "\n".join([f"{type(self)!r}"] + [f"{k}: {v!r}" for k, v in self.items()]) diff --git a/h5netcdf/core.py b/h5netcdf/core.py index 7f506fa..fa72369 100644 --- a/h5netcdf/core.py +++ b/h5netcdf/core.py @@ -1,5 +1,5 @@ # For details on how netCDF4 builds on HDF5: -# http://www.unidata.ucar.edu/software/netcdf/docs/file_format_specifications.html#netcdf_4_spec +# https://docs.unidata.ucar.edu/netcdf-c/current/file_format_specifications.html#netcdf_4_spec import os.path import warnings import weakref @@ -54,9 +54,11 @@ def _transform_1d_boolean_indexers(key): # return key, if not iterable try: key = [ - np.asanyarray(k).nonzero()[0] - if isinstance(k, (np.ndarray, list)) and type(k[0]) in (bool, np.bool_) - else k + ( + np.asanyarray(k).nonzero()[0] + if isinstance(k, (np.ndarray, list)) and type(k[0]) in (bool, np.bool_) + else k + ) for k in key ] except TypeError: @@ -106,13 +108,11 @@ def _expanded_indexer(key, ndim): return key[k1] + res_dims + key[k2] -class BaseVariable: - def __init__(self, parent, name, dimensions=None): +class BaseObject: + def __init__(self, parent, name): self._parent_ref = weakref.ref(parent) self._root_ref = weakref.ref(parent._root) self._h5path = _join_h5paths(parent.name, name) - self._dimensions = dimensions - self._initialized = True @property def _parent(self): @@ -128,11 +128,152 @@ def _h5ds(self): # subclasses: return self._root._h5file[self._h5path] + @property + def name(self): + """Return object name.""" + return self._h5ds.name + + @property + def dtype(self): + """Return NumPy dtype giving object’s dtype.""" + 
return self._h5ds.dtype + + +_h5type_mapping = { + "H5T_INTEGER": 0, + "H5T_FLOAT": 1, + "H5T_STRING": 3, + "H5T_COMPOUND": 6, + "H5T_ENUM": 8, + "H5T_VLEN": 9, +} + + +def _get_h5usertype_identifier(h5type): + """Return H5 Type Identifier from given H5 Datatype.""" + try: + # h5py first + h5typeid = h5type.id.get_class() + except AttributeError: + # h5pyd second + h5typeid = _h5type_mapping[h5type.id.type_json["class"]] + return h5typeid + + +def _get_h5dstype_identifier(h5type): + """Return H5 Type Identifier from given H5 Dataset.""" + try: + # h5py first + h5typeid = h5type.id.get_type().get_class() + except AttributeError: + # h5pyd second + h5typeid = _h5type_mapping[h5type.id.type_json["class"]] + return h5typeid + + +class UserType(BaseObject): + _cls_name = "h5netcdf.UserType" + + @property + def name(self): + """Return user type name.""" + # strip hdf5 path + return super().name.split("/")[-1] + + def __repr__(self): + if self._parent._root._closed: + return f"" + header = f" 1 + ) or enum_info: + value = fillvalue + else: + value = self.dtype.type(fillvalue) + + self.attrs["_FillValue"] = value + @property def dimensions(self): """Return variable dimension names.""" @@ -268,16 +434,52 @@ def shape(self): @property def ndim(self): - """Return number variable dimensions""" + """Return number of variable dimensions.""" return len(self.shape) def __len__(self): return self.shape[0] @property - def dtype(self): - """Return NumPy dtype object giving the variable’s type.""" - return self._h5ds.dtype + def _h5type_identifier(self): + """Returns type identifier. + + See https://api.h5py.org/h5t.html#datatype-class-codes and + https://docs.hdfgroup.org (enum H5T_class_t) + + """ + return _get_h5dstype_identifier(self._h5ds) + + @property + def _h5datatype(self): + """Returns comparable h5type. + + This property can be used to compare two variables/datatypes or + a variable and a datatype for equality of the underlying datatype. + + - DatatypeID for h5py + - (dtype, dtype.metadata) for h5pyd + """ + if self._root._h5py.__name__ == "h5py": + return self._h5ds.id.get_type() + else: + return self.dtype, self.dtype.metadata + + @property + def datatype(self): + """Return datatype. + + Returns numpy dtype (for primitive types) or VLType/CompoundType/EnumType + instance (for compound, vlen or enum data types). 
+ """ + # this is really painful as we have to iterate over all types + # and check equality + usertype = self._parent._get_usertype_dict(self._h5type_identifier) + if usertype is not None: + for tid in usertype.values(): + if self._h5datatype == tid._h5datatype: + return tid + return self.dtype def _get_padding(self, key): """Return padding if needed, defaults to False.""" @@ -334,28 +536,54 @@ def __getitem__(self, key): # get padding padding = self._get_padding(key) + # apply padding with fillvalue (both api) if padding: fv = self.dtype.type(self._h5ds.fillvalue) - return np.pad( + h5ds = np.pad( self._h5ds, pad_width=padding, mode="constant", constant_values=fv, - )[key] + ) + else: + h5ds = self._h5ds - return self._h5ds[key] + if ( + isinstance(self.datatype, CompoundType) + and (view := self.datatype.dtype_view) is not None + ): + return h5ds[key].view(view) + else: + return h5ds[key] def __setitem__(self, key, value): from .legacyapi import Dataset + # check if provided values match enumtype values + if enum_dict := self._root._h5py.check_enum_dtype(self.dtype): + mask = np.isin(value, list(enum_dict.values())) + wrong = set(np.asanyarray(value)[~mask]) + if not mask.all(): + raise ValueError( + f"Trying to assign illegal value(s) {wrong!r} to Enum variable {self.name!r}." + f" Valid values are {dict(enum_dict)!r}." + ) + if isinstance(self._parent._root, Dataset): # resize on write only for legacyapi key = _expanded_indexer(key, self.ndim) key = _transform_1d_boolean_indexers(key) # resize on write only for legacy API self._maybe_resize_dimensions(key, value) - self._h5ds[key] = value + + if ( + isinstance(self.datatype, CompoundType) + and (view := _string_to_char_array_dtype(self.datatype.dtype)) is not None + ): + self._h5ds[key] = value.view(view) + else: + self._h5ds[key] = value @property def attrs(self): @@ -368,14 +596,8 @@ def attrs(self): def __repr__(self): if self._parent._root._closed: - return "" % self._cls_name - header = "<{} {!r}: dimensions {}, shape {}, dtype {}>".format( - self._cls_name, - self.name, - self.dimensions, - self.shape, - self.dtype, - ) + return f"" + header = f"<{self._cls_name} {self.name!r}: dimensions {self.dimensions}, shape {self.shape}, dtype {self.dtype}>" return "\n".join( [header] + ["Attributes:"] @@ -386,6 +608,11 @@ def __repr__(self): class Variable(BaseVariable): @property def chunks(self): + if self.shape == (): + # In HSDS, the layout can be chunked even for scalar datasets, but with only a single chunk. + # Return None for scalar datasets since they shall be handled as non-chunked. + assert self._h5ds.chunks in (None, (), (1,)) + return None return self._h5ds.chunks @property @@ -467,9 +694,164 @@ def _unlabeled_dimension_mix(h5py_dataset): return status +def _check_dtype(group, dtype): + """Check and handle dtypes when adding variable to given group. + + Raises errors and issues warnings according to given dtype. + """ + + if dtype == np.bool_: + # never warn since h5netcdf has always errored here + _invalid_netcdf_feature( + "boolean dtypes", + group._root.invalid_netcdf, + ) + else: + group._root._check_valid_netcdf_dtype(dtype) + + # we only allow h5netcdf user types, not named h5py.Datatype + if isinstance(dtype, group._root._h5py.Datatype): + raise TypeError( + f"Argument dtype {dtype!r} is not allowed. " + f"Please provide h5netcdf user type or numpy compatible type." 
+ ) + + # is user type is given extract underlying h5py object + # we just use the h5py user type here + if isinstance(dtype, (EnumType, VLType, CompoundType)): + h5type = dtype._h5ds + if dtype._root._h5file.filename != group._root._h5file.filename: + raise TypeError( + f"Given dtype {dtype} is not committed into current file" + f" {group._root._h5file.filename}. Instead it's committed into" + f" file {dtype._root._h5file.filename}" + ) + # check if committed type can be accessed in current group hierarchy + user_type = group._get_usertype(h5type) + if user_type is None: + msg = ( + f"Given dtype {dtype.name!r} is not accessible in current group" + f" {group._h5group.name!r} or any parent group. Instead it's defined at" + f" {h5type.name!r}. Please create it in the current or any parent group." + ) + raise TypeError(msg) + # this checks for committed types which are overridden by re-definitions + elif (actual := user_type._h5ds.name) != h5type.name: + msg = ( + f"Given dtype {dtype.name!r} is defined at {h5type.name!r}." + f" Another dtype with same name is defined at {actual!r} and" + f" would override it." + ) + raise TypeError(msg) + elif np.dtype(dtype).kind == "c": + itemsize = np.dtype(dtype).itemsize + try: + width = {8: "FLOAT", 16: "DOUBLE"}[itemsize] + except KeyError as e: + raise TypeError( + "Currently only 'complex64' and 'complex128' dtypes are allowed." + ) from e + dname = f"_PFNC_{width}_COMPLEX_TYPE" + # todo check compound type for existing complex types + # which may be used here + # if dname is not available in current group-path + # create and commit type in current group + if dname not in group._all_cmptypes: + dtype = group.create_cmptype(dtype, dname).dtype + + return dtype + + +def _check_fillvalue(group, fillvalue, dtype): + """Handles fillvalues when adding variable to given group. + + Raises errors and issues warnings according to + given fillvalue and dtype. + """ + + # handling default fillvalues for legacyapi + # see https://github.com/h5netcdf/h5netcdf/issues/182 + from .legacyapi import Dataset, _get_default_fillvalue + + stacklevel = 5 if isinstance(group._root, Dataset) else 4 + + h5fillvalue = fillvalue + + # if no fillvalue is provided take netcdf4 default values for legacyapi + if fillvalue is None: + if isinstance(group._root, Dataset): + h5fillvalue = _get_default_fillvalue(dtype) + + # handling for EnumType + if dtype is not None and isinstance(dtype, EnumType): + if fillvalue is None: + # 1. we need to warn the user that writing enums with default values + # which are defined in the enum dict will mask those values + if (h5fillvalue or 0) in dtype.enum_dict.values(): + reverse = dict((v, k) for k, v in dtype.enum_dict.items()) + msg = ( + f"Creating variable with default fill_value {h5fillvalue or 0!r}" + f" which IS defined in enum type {dtype!r}." + f" This will mask entry {{{reverse[h5fillvalue or 0]!r}: {h5fillvalue or 0!r}}}." + ) + warnings.warn(msg, stacklevel=stacklevel) + else: + # 2. we need to raise if the default fillvalue is not within the enum dict + if ( + h5fillvalue is not None + and h5fillvalue not in dtype.enum_dict.values() + ): + msg = ( + f"Creating variable with default fill_value {h5fillvalue!r}" + f" which IS NOT defined in enum type {dtype!r}." + f" Please provide a fitting fill_value or enum type." + ) + raise ValueError(msg) + if h5fillvalue is None and 0 not in dtype.enum_dict.values(): + # 3. 
we should inform the user that a fillvalue of '0' + # will be interpreted as _UNDEFINED in netcdf-c + # if it is not defined in the enum dict + msg = ( + f"Creating variable with default fill_value {0!r}" + f" which IS NOT defined in enum type {dtype!r}." + f" Value {0!r} will be interpreted as '_UNDEFINED' by netcdf-c." + ) + warnings.warn(msg, stacklevel=stacklevel) + else: + if h5fillvalue not in dtype.enum_dict.values(): + # 4. we should inform the user that a fillvalue of '0' + # will be interpreted as _UNDEFINED in netcdf-c + # if it is not defined in the enum dict + if h5fillvalue == 0: + msg = ( + f"Creating variable with specified fill_value {h5fillvalue!r}" + f" which IS NOT defined in enum type {dtype!r}." + f" Value {0!r} will be interpreted as '_UNDEFINED' by netcdf-c." + ) + warnings.warn(msg, stacklevel=stacklevel) + # 5. we need to raise if the fillvalue is not within the enum_dict + else: + msg = ( + f"Creating variable with specified fill_value {h5fillvalue!r}" + f" which IS NOT defined in enum type {dtype!r}." + f" Please provide a matching fill_value or enum type." + ) + raise ValueError(msg) + + if fillvalue is not None: + # cast to wanted type + fillvalue = np.array(h5fillvalue).astype(dtype) + h5fillvalue = fillvalue + + return fillvalue, h5fillvalue + + class Group(Mapping): _variable_cls = Variable _dimension_cls = Dimension + _enumtype_cls = EnumType + _vltype_cls = VLType + _cmptype_cls = CompoundType @property def _group_cls(self): @@ -486,13 +868,23 @@ def __init__(self, parent, name): self._h5path = _join_h5paths(parent._h5path, name) self._dimensions = Dimensions(self) + self._enumtypes = _LazyObjectLookup(self, self._enumtype_cls) + self._vltypes = _LazyObjectLookup(self, self._vltype_cls) + self._cmptypes = _LazyObjectLookup(self, self._cmptype_cls) # this map keeps track of all dimensions if parent is self: self._all_dimensions = ChainMap(self._dimensions) + self._all_enumtypes = ChainMap(self._enumtypes) + self._all_vltypes = ChainMap(self._vltypes) + self._all_cmptypes = ChainMap(self._cmptypes) + else: self._all_dimensions = parent._all_dimensions.new_child(self._dimensions) self._all_h5groups = parent._all_h5groups.new_child(self._h5group) + self._all_enumtypes = parent._all_enumtypes.new_child(self._enumtypes) + self._all_vltypes = parent._all_vltypes.new_child(self._vltypes) + self._all_cmptypes = parent._all_cmptypes.new_child(self._cmptypes) self._variables = _LazyObjectLookup(self, self._variable_cls) self._groups = _LazyObjectLookup(self, self._group_cls) @@ -506,6 +898,9 @@ def __init__(self, parent, name): # add to the groups collection if this is a h5py(d) Group # instance self._groups.add(k) + elif isinstance(v, self._root._h5py.Datatype): + # add usertypes (enum, vlen, compound) + self._add_usertype(v) else: if v.attrs.get("CLASS") == b"DIMENSION_SCALE": # add dimension and retrieve size @@ -587,19 +982,18 @@ def dimensions(self, value): for k, v in self._all_dimensions.maps[0].items(): if k in value: if v != value[k]: - raise ValueError("cannot modify existing dimension %r" % k) + raise ValueError(f"cannot modify existing dimension {k:!r}") else: raise ValueError( - "new dimensions do not include existing dimension %r" % k + f"new dimensions do not include existing dimension {k:!r}" ) self._dimensions.update(value) def _create_child_group(self, name): if name in self: - raise ValueError("unable to create group %r (name already exists)" % name) + raise ValueError(f"unable to create group {name:!r} (name already exists)") kwargs = {} - if 
self._root._h5py.__name__ == "h5py": - kwargs.update(track_order=self._track_order) + kwargs.update(track_order=self._track_order) self._h5group.create_group(name, **kwargs) self._groups[name] = self._group_cls(self, name) @@ -641,7 +1035,7 @@ def _create_child_variable( ): if name in self: raise ValueError( - "unable to create variable %r " "(name already exists)" % name + f"unable to create variable {name:!r} (name already exists)" ) if data is not None: data = np.asarray(data) @@ -649,14 +1043,8 @@ def _create_child_variable( if dtype is None: dtype = data.dtype - if dtype == np.bool_: - # never warn since h5netcdf has always errored here - _invalid_netcdf_feature( - "boolean dtypes", - self._root.invalid_netcdf, - ) - else: - self._root._check_valid_netcdf_dtype(dtype) + # check and handle dtypes + dtype = _check_dtype(self, dtype) if "scaleoffset" in kwargs: _invalid_netcdf_feature( @@ -708,8 +1096,8 @@ def _create_child_variable( pass else: raise ValueError( - "got unrecognized value %s for chunking_heuristic argument " - '(has to be "h5py" or "h5netcdf")' % chunking_heuristic + f"got unrecognized value {chunking_heuristic} for chunking_heuristic argument " + '(has to be "h5py" or "h5netcdf")' ) # Clear dummy HDF5 datasets with this name that were created for a @@ -721,16 +1109,10 @@ def _create_child_variable( self._dimensions[name]._detach_scale() del self._h5group[name] - if self._root._h5py.__name__ == "h5py": - kwargs.update(dict(track_order=self._parent._track_order)) - - # handling default fillvalues for legacyapi - # see https://github.com/h5netcdf/h5netcdf/issues/182 - from .legacyapi import Dataset, _get_default_fillvalue + kwargs.update(dict(track_order=self._parent._track_order)) - fillval = fillvalue - if fillvalue is None and isinstance(self._parent._root, Dataset): - fillval = _get_default_fillvalue(dtype) + # fill value handling + fillvalue, h5fillvalue = _check_fillvalue(self, fillvalue, dtype) # create hdf5 variable self._h5group.create_dataset( @@ -739,7 +1121,7 @@ def _create_child_variable( dtype=dtype, data=data, chunks=chunks, - fillvalue=fillval, + fillvalue=h5fillvalue, **kwargs, ) @@ -768,22 +1150,10 @@ def _create_child_variable( # Todo: get this consistent with netcdf-c/netcdf4-python variable._ensure_dim_id() + # add fillvalue attribute to variable if fillvalue is not None: - # trying to create correct type of fillvalue - if variable.dtype is str: - value = fillvalue - else: - string_info = self._root._h5py.check_string_dtype(variable.dtype) - if ( - string_info - and string_info.length is not None - and string_info.length > 1 - ): - value = fillvalue - else: - value = variable.dtype.type(fillvalue) + variable._add_fillvalue(fillvalue) - variable.attrs._h5attrs["_FillValue"] = value return variable def create_variable( @@ -807,8 +1177,8 @@ def create_variable( dimensions : tuple Tuple containing dimension name strings. Defaults to empty tuple, effectively creating a scalar variable. - dtype : numpy.dtype, str, optional - Dataype of the new variable. Defaults to None. + dtype : numpy.dtype, str, UserType (Enum, VL, Compound), optional + Datatype of the new variable. Defaults to None. fillvalue : scalar, optional Specify fillvalue for uninitialized parts of the variable. Defaults to ``None``. chunks : tuple, optional @@ -818,9 +1188,9 @@ def create_variable( ``h5netcdf``. Discussion on ``h5netcdf`` chunking can be found in (:issue:`52`) and (:pull:`127`). 
compression : str, optional - Compression filter to apply, defaults to ``gzip`` + Compression filter to apply, defaults to ``gzip``. ``zlib`` is an alias for ``gzip``. compression_opts : int - Parameter for compression filter. For ``compression="gzip"`` Integer from 1 to 9 specifying + Parameter for compression filter. For ``compression="gzip"``/``compression="zlib"`` Integer from 1 to 9 specifying the compression level. Defaults to 4. fletcher32 : bool If ``True``, HDF5 Fletcher32 checksum algorithm is applied. Defaults to ``False``. @@ -841,6 +1211,7 @@ def create_variable( var : h5netcdf.Variable Variable class instance """ + # if root-variable if name.startswith("/"): # handling default fillvalues for legacyapi @@ -866,6 +1237,13 @@ def create_variable( group = self for k in keys[:-1]: group = group._require_child_group(k) + + # Allow zlib to be an alias for gzip + # but use getters and setters so as not to change the behavior + # of the default h5py functions + if kwargs.get("compression", None) == "zlib": + kwargs["compression"] = "gzip" + return group._create_child_variable( keys[-1], dimensions, @@ -918,6 +1296,48 @@ def groups(self): def variables(self): return Frozen(self._variables) + def _add_usertype(self, h5type): + """Add usertype to related usertype dict. + + The type is added by name to the dict attached to current group. + """ + name = h5type.name.split("/")[-1] + h5typeid = _get_h5usertype_identifier(h5type) + # add usertype to corresponding dict + self._get_usertype_dict(h5typeid).maps[0].add(name) + + def _get_usertype(self, h5type): + """Get usertype from related usertype dict.""" + h5typeid = _get_h5usertype_identifier(h5type) + return self._get_usertype_dict(h5typeid).get(h5type.name.split("/")[-1]) + + def _get_usertype_dict(self, h5typeid): + """Return usertype-dict related to given h5 type identifier. + + See https://api.h5py.org/h5t.html#datatype-class-codes and + https://docs.hdfgroup.org (enum H5T_class_t) + """ + return { + 6: self._all_cmptypes, + 8: self._all_enumtypes, + 9: self._all_vltypes, + }.get(h5typeid) + + @property + def enumtypes(self): + """Return group defined enum types.""" + return Frozen(self._enumtypes) + + @property + def vltypes(self): + """Return group defined vlen types.""" + return Frozen(self._vltypes) + + @property + def cmptypes(self): + """Return group defined compound types.""" + return Frozen(self._cmptypes) + @property def dims(self): return Frozen(self._dimensions) @@ -936,9 +1356,11 @@ def _repr_body(self): + [ " {}: {}".format( k, - f"Unlimited (current: {self._dimensions[k].size})" - if v is None - else v, + ( + f"Unlimited (current: {self._dimensions[k].size})" + if v is None + else v + ), ) for k, v in self.dimensions.items() ] @@ -967,6 +1389,59 @@ def resize_dimension(self, dim, size): """ self._dimensions[dim]._resize(size) + def create_enumtype(self, datatype, datatype_name, enum_dict): + """Create EnumType. + + datatype: np.dtype + A numpy integer dtype object describing the base type for the Enum. + datatype_name: string + A Python string containing a description of the Enum data type. + enum_dict: dict + A Python dictionary containing the Enum field/value pairs. + """ + et = self._root._h5py.enum_dtype(enum_dict, basetype=datatype) + self._h5group[datatype_name] = et + # create enumtype class instance + enumtype = self._enumtype_cls(self, datatype_name) + self._enumtypes[datatype_name] = enumtype + return enumtype + + def create_vltype(self, datatype, datatype_name): + """Create VLType. 
+ + datatype: np.dtype + A numpy dtype object describing the base type. + datatype_name: string + A Python string containing a description of the VL data type. + """ + # wrap in numpy dtype first + datatype = np.dtype(datatype) + et = self._root._h5py.vlen_dtype(datatype) + self._h5group[datatype_name] = et + # create vltype class instance + vltype = self._vltype_cls(self, datatype_name) + self._vltypes[datatype_name] = vltype + return vltype + + def create_cmptype(self, datatype, datatype_name): + """Create CompoundType. + + datatype: np.dtype + A numpy dtype object describing the structured type. + datatype_name: string + A Python string containing a description of the compound data type. + """ + # wrap in numpy dtype first + datatype = np.dtype(datatype) + if (new_dtype := _string_to_char_array_dtype(datatype)) is not None: + # "SN" -> ("S1", (N,)) + datatype = new_dtype + self._h5group[datatype_name] = datatype + # create compound class instance + cmptype = self._cmptype_cls(self, datatype_name) + self._cmptypes[datatype_name] = cmptype + return cmptype + class File(Group): def __init__(self, path, mode="r", invalid_netcdf=False, phony_dims=None, **kwargs): @@ -975,7 +1450,8 @@ def __init__(self, path, mode="r", invalid_netcdf=False, phony_dims=None, **kwar Parameters ---------- path: path-like - Location of the netCDF4 file to be accessed. + Location of the netCDF4 file to be accessed, or an h5py File object, + or a Python file-like object (which should read/write bytes). mode: "r", "r+", "a", "w" A valid file access mode. Defaults to "r". @@ -1010,6 +1486,10 @@ def __init__(self, path, mode="r", invalid_netcdf=False, phony_dims=None, **kwar Datasets created with h5netcdf version 0.12.0 that are opened with newer versions of h5netcdf will continue to disable order tracker. + + If an h5py File object is passed in, closing the h5netcdf wrapper will + not close the h5py File. In other cases, closing the h5netcdf File object + does close the underlying file. 
""" # 2022/01/09 # netCDF4 wants the track_order parameter to be true @@ -1027,9 +1507,10 @@ def __init__(self, path, mode="r", invalid_netcdf=False, phony_dims=None, **kwar track_order = kwargs.pop("track_order", track_order_default) self.decode_vlen_strings = kwargs.pop("decode_vlen_strings", None) + self._close_h5file = True try: if isinstance(path, str): - if ( + if kwargs.get("driver") == "h5pyd" or ( path.startswith(("http://", "https://", "hdf5://")) and "driver" not in kwargs ): @@ -1054,6 +1535,12 @@ def __init__(self, path, mode="r", invalid_netcdf=False, phony_dims=None, **kwar self._h5file = self._h5py.File( path, mode, track_order=track_order, **kwargs ) + elif isinstance(path, h5py.File): + self._preexisting_file = mode in {"r", "r+", "a"} + self._h5py = h5py + self._h5file = path + # h5py File passed in: let the caller decide when to close it + self._close_h5file = False else: # file-like object self._preexisting_file = mode in {"r", "r+", "a"} self._h5py = h5py @@ -1129,16 +1616,10 @@ def create_phony_dimensions(grp): def _check_valid_netcdf_dtype(self, dtype): dtype = np.dtype(dtype) - if dtype == bool: + if dtype == bool: # noqa description = "boolean" - elif dtype == complex: - description = "complex" - elif h5py.check_dtype(enum=dtype) is not None: - description = "enum" - elif h5py.check_dtype(ref=dtype) is not None: + elif self._h5py.check_dtype(ref=dtype) is not None: description = "reference" - elif h5py.check_dtype(vlen=dtype) not in {None, str, bytes}: - description = "non-string variable length" else: description = None @@ -1168,15 +1649,14 @@ def flush(self): if self._writable: # only write `_NCProperties` in newly created files if not self._preexisting_file and not self.invalid_netcdf: - _NC_PROPERTIES = "version=2,h5netcdf={},hdf5={},{}={}".format( - __version__, - self._h5py.version.hdf5_version, - self._h5py.__name__, - self._h5py.__version__, + _NC_PROPERTIES = ( + f"version=2,h5netcdf={__version__}," + f"hdf5={self._h5py.version.hdf5_version}," + f"{self._h5py.__name__}={self._h5py.__version__}" ) self.attrs._h5attrs["_NCProperties"] = np.array( _NC_PROPERTIES, - dtype=h5py.string_dtype( + dtype=self._h5py.string_dtype( encoding="ascii", length=len(_NC_PROPERTIES) ), ) @@ -1200,7 +1680,9 @@ def flush(self): def close(self): if not self._closed: self.flush() - self._h5file.close() + if self._close_h5file: + self._h5file.close() + self._h5file = None self._closed = True __del__ = close @@ -1215,11 +1697,9 @@ def __exit__(self, type, value, traceback): def __repr__(self): if self._closed: - return "" % self._cls_name - header = "<{} {!r} (mode {})>".format( - self._cls_name, - self.filename.split("/")[-1], - self.mode, + return f"" + header = ( + f"<{self._cls_name} {os.path.basename(self.filename)!r} (mode {self.mode})>" ) return "\n".join([header] + self._repr_body()) diff --git a/h5netcdf/dimensions.py b/h5netcdf/dimensions.py index 5936d28..1e3ba98 100644 --- a/h5netcdf/dimensions.py +++ b/h5netcdf/dimensions.py @@ -2,7 +2,6 @@ from collections import OrderedDict from collections.abc import MutableMapping -import h5py import numpy as np @@ -23,7 +22,7 @@ def __setitem__(self, name, size): if not self._group._root._writable: raise RuntimeError("H5NetCDF: Write to read only") if name in self._objects: - raise ValueError("dimension %r already exists" % name) + raise ValueError(f"dimension {name:!r} already exists") self._objects[name] = Dimension(self._group, name, size, create_h5ds=True) @@ -48,9 +47,8 @@ def __len__(self): def __repr__(self): if 
self._group._root._closed: return "" - return "" % ", ".join( - f"{k}={v!r}" for k, v in self._objects.items() - ) + dims = ", ".join(f"{k}={v!r}" for k, v in self._objects.items()) + return f"" def _join_h5paths(parent_path, child_path): @@ -138,7 +136,7 @@ def _h5ds(self): @property def _isscale(self): - return h5py.h5ds.is_scale(self._h5ds.id) + return self._root._h5py.h5ds.is_scale(self._h5ds.id) @property def _dimid(self): @@ -151,8 +149,7 @@ def _resize(self, size): if not self.isunlimited(): raise ValueError( - "Dimension '%s' is not unlimited and thus cannot be resized." - % self.name + f"Dimension '{self.name}' is not unlimited and thus cannot be resized." ) self._h5ds.resize((size,)) @@ -234,7 +231,7 @@ def __len__(self): def __repr__(self): if not self._phony and self._parent._root._closed: - return "" % self._cls_name + return f"" special = "" if self._phony: special += " (phony_dim)" diff --git a/h5netcdf/legacyapi.py b/h5netcdf/legacyapi.py index 868060e..6843bff 100644 --- a/h5netcdf/legacyapi.py +++ b/h5netcdf/legacyapi.py @@ -41,8 +41,7 @@ def _check_return_dtype_endianess(endian="native"): pass else: raise ValueError( - "'endian' keyword argument must be 'little','big' or 'native', got '%s'" - % endian + f"'endian' keyword argument must be 'little','big' or 'native', got '{endian}'" ) return endianess @@ -105,22 +104,44 @@ def filters(self): @property def dtype(self): - """Return netCDF4.Variable datatype.""" + """Return netCDF4.Variable numpy dtype.""" dt = self._h5ds.dtype if h5py.check_dtype(vlen=dt) is str: return str return dt +class EnumType(core.EnumType): + _cls_name = "h5netcdf.legacyapi.EnumType" + + +class VLType(core.VLType): + _cls_name = "h5netcdf.legacyapi.VLType" + + +class CompoundType(core.CompoundType): + _cls_name = "h5netcdf.legacyapi.CompoundType" + + +class UserType(core.UserType): + _cls_name = "h5netcdf.legacyapi.UserType" + + class Group(core.Group, HasAttributesMixin): _cls_name = "h5netcdf.legacyapi.Group" _variable_cls = Variable + _enumtype_cls = EnumType + _vltype_cls = VLType + _cmptype_cls = CompoundType @property def _group_cls(self): return Group createGroup = core.Group.create_group + createEnumType = core.Group.create_enumtype + createVLType = core.Group.create_vltype + createCompoundType = core.Group.create_cmptype def createDimension(self, name, size): """Creates a new dimension with given name and size. @@ -161,8 +182,8 @@ def createVariable( varname : str Name of the new variable. If given as a path, intermediate groups will be created, if not existent. - datatype : numpy.dtype, str - Dataype of the new variable + datatype : numpy.dtype, str, UserType (Enum, VL, Compound) + Datatype of the new variable. dimensions : tuple Tuple containing dimension name strings. Defaults to empty tuple, effectively creating a scalar variable. 
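The new legacy-API entry points added above (``createEnumType``, ``createVLType``, ``createCompoundType``) mirror netCDF4-python. A minimal usage sketch, not part of the patch, following the same pattern as ``write_legacy_netcdf`` in the test changes below; the file name and values are placeholders.

.. code-block:: python

    import numpy as np

    import h5netcdf.legacyapi as netCDF4

    with netCDF4.Dataset("enums.nc", "w") as ds:
        ds.createDimension("x", 4)
        enum_dict = {"one": 1, "two": 2, "three": 3, "missing": 255}
        enum_t = ds.createEnumType(np.uint8, "enum_t", enum_dict)
        v = ds.createVariable(
            "enum_var", enum_t, ("x",), fill_value=enum_dict["missing"]
        )
        v[0:3] = [1, 2, 3]  # unwritten entries keep the "missing" fill value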
diff --git a/h5netcdf/tests/conftest.py b/h5netcdf/tests/conftest.py index 068ce8a..a027b76 100644 --- a/h5netcdf/tests/conftest.py +++ b/h5netcdf/tests/conftest.py @@ -1,5 +1,4 @@ import os -import sys import tempfile from pathlib import Path from shutil import rmtree @@ -7,7 +6,7 @@ import pytest try: - from h5pyd._apps.hstouch import main as hstouch + from h5pyd import Folder from hsds.hsds_app import HsdsApp with_reqd_pkgs = True @@ -15,166 +14,51 @@ with_reqd_pkgs = False -def set_hsds_root(): - """Make required HSDS root directory.""" - hsds_root = Path(os.environ["ROOT_DIR"]) / os.environ["BUCKET_NAME"] / "home" - if hsds_root.exists(): - rmtree(hsds_root) - - old_sysargv = sys.argv - sys.argv = [""] - sys.argv.extend(["-e", os.environ["HS_ENDPOINT"]]) - sys.argv.extend(["-u", "admin"]) - sys.argv.extend(["-p", "admin"]) - sys.argv.extend(["--bucket", os.environ["BUCKET_NAME"]]) - sys.argv.append("/home/") - hstouch() - - sys.argv = [""] - sys.argv.extend(["-e", os.environ["HS_ENDPOINT"]]) - sys.argv.extend(["-u", "admin"]) - sys.argv.extend(["-p", "admin"]) - sys.argv.extend(["--bucket", os.environ["BUCKET_NAME"]]) - sys.argv.extend(["-o", os.environ["HS_USERNAME"]]) - sys.argv.append(f'/home/{os.environ["HS_USERNAME"]}/') - hstouch() - sys.argv = old_sysargv - - @pytest.fixture(scope="session") def hsds_up(): """Provide HDF Highly Scalabale Data Service (HSDS) for h5pyd testing.""" if with_reqd_pkgs: root_dir = Path(tempfile.mkdtemp(prefix="tmp-hsds-root-")) - os.environ["BUCKET_NAME"] = "data" - (root_dir / os.getenv("BUCKET_NAME")).mkdir(parents=True, exist_ok=True) - os.environ["ROOT_DIR"] = str(root_dir) - os.environ["HS_USERNAME"] = "h5netcdf-pytest" - os.environ["HS_PASSWORD"] = "TestEarlyTestEverything" + bucket_name = "pytest" + os.environ["BUCKET_NAME"] = bucket_name + os.mkdir( + f"{root_dir}/{bucket_name}" + ) # need to create a directory for our bucket - config = """allow_noauth: true -auth_expiration: -1 -default_public: False -aws_access_key_id: xxx -aws_secret_access_key: xxx -aws_iam_role: hsds_role -aws_region: us-east-1 -hsds_endpoint: http://hsds.hdf.test -aws_s3_gateway: null -aws_dynamodb_gateway: null -aws_dynamodb_users_table: null -azure_connection_string: null -azure_resource_id: null -azure_storage_account: null -azure_resource_group: null -root_dir: null -password_salt: null -bucket_name: hsdstest -head_port: 5100 -head_ram: 512m -dn_port: 6101 -dn_ram: 3g -sn_port: 5101 -sn_ram: 1g -rangeget_port: 6900 -rangeget_ram: 2g -target_sn_count: 0 -target_dn_count: 0 -log_level: INFO -log_timestamps: false -log_prefix: null -max_tcp_connections: 100 -head_sleep_time: 10 -node_sleep_time: 10 -async_sleep_time: 10 -s3_sync_interval: 1 -s3_sync_task_timeout: 10 -store_read_timeout: 1 -store_read_sleep_interval: 0.1 -max_pending_write_requests: 20 -flush_sleep_interval: 1 -max_chunks_per_request: 1000 -min_chunk_size: 1m -max_chunk_size: 4m -max_request_size: 100m -max_chunks_per_folder: 0 -max_task_count: 100 -max_tasks_per_node_per_request: 16 -aio_max_pool_connections: 64 -metadata_mem_cache_size: 128m -metadata_mem_cache_expire: 3600 -chunk_mem_cache_size: 128m -chunk_mem_cache_expire: 3600 -data_cache_size: 128m -data_cache_max_req_size: 128k -data_cache_expire_time: 3600 -data_cache_page_size: 4m -data_cache_max_concurrent_read: 16 -timeout: 30 -password_file: /config/passwd.txt -groups_file: /config/groups.txt -server_name: Highly Scalable Data Service (HSDS) -greeting: Welcome to HSDS! 
-about: HSDS is a webservice for HDF data -top_level_domains: [] -cors_domain: "*" -admin_user: admin -admin_group: null -openid_provider: azure -openid_url: null -openid_audience: null -openid_claims: unique_name,appid,roles -chaos_die: 0 -standalone_app: false -blosc_nthreads: 2 -http_compression: false -http_max_url_length: 512 -k8s_app_label: hsds -k8s_namespace: null -restart_policy: on-failure -domain_req_max_objects_limit: 500 -""" - tmp_dir = Path(tempfile.mkdtemp(prefix="tmp-hsds-")) - config_file = tmp_dir / "config.yml" - config_file.write_text(config) - passwd_file = tmp_dir / "passwd.txt" - passwd_file.write_text( - f'admin:admin\n{os.environ["HS_USERNAME"]}:{os.environ["HS_PASSWORD"]}\n' - ) - log_file = str(tmp_dir / "hsds.log") - tmp_dir = str(tmp_dir) - if sys.platform == "darwin": - # macOS temp directory paths can be very long and break low-level - # socket comms code... - socket_dir = "/tmp/hsds" - else: - socket_dir = tmp_dir + hs_username = "h5netcdf-pytest" + hs_password = "TestEarlyTestEverything" + + kwargs = {} + kwargs["username"] = hs_username + kwargs["password"] = hs_password + kwargs["root_dir"] = str(root_dir) + kwargs["logfile"] = f"{root_dir}/hsds.log" + kwargs["log_level"] = "DEBUG" + kwargs["host"] = "localhost" + kwargs["sn_port"] = 5101 try: - hsds = HsdsApp( - username=os.environ["HS_USERNAME"], - password=os.environ["HS_PASSWORD"], - password_file=str(passwd_file), - log_level=os.getenv("LOG_LEVEL", "DEBUG"), - logfile=log_file, - socket_dir=socket_dir, - config_dir=tmp_dir, - dn_count=2, - ) + hsds = HsdsApp(**kwargs) + hsds.run() is_up = hsds.ready if is_up: os.environ["HS_ENDPOINT"] = hsds.endpoint - set_hsds_root() + os.environ["HS_USERNAME"] = hs_username + os.environ["HS_PASSWORD"] = hs_password + # make folders expected by pytest + # pytest/home/h5netcdf-pytest + # Folder("/pytest/", mode='w') + Folder("/home/", mode="w") + Folder("/home/h5netcdf-pytest/", mode="w") except Exception: is_up = False yield is_up - + hsds.check_processes() # this will capture hsds log output hsds.stop() - rmtree(tmp_dir, ignore_errors=True) - rmtree(socket_dir, ignore_errors=True) + rmtree(root_dir, ignore_errors=True) else: diff --git a/h5netcdf/tests/test_h5netcdf.py b/h5netcdf/tests/test_h5netcdf.py index e24ec5f..066bcd8 100644 --- a/h5netcdf/tests/test_h5netcdf.py +++ b/h5netcdf/tests/test_h5netcdf.py @@ -15,7 +15,11 @@ import h5netcdf from h5netcdf import legacyapi -from h5netcdf.core import NOT_A_VARIABLE, CompatibilityError +from h5netcdf.core import ( + NOT_A_VARIABLE, + CompatibilityError, + VLType, +) try: import h5pyd @@ -103,18 +107,23 @@ def array_equal(a, b): def is_h5py_char_working(tmp_netcdf, name): - h5 = get_hdf5_module(tmp_netcdf) - # https://github.com/Unidata/netcdf-c/issues/298 - with h5.File(tmp_netcdf, "r") as ds: - v = ds[name] - try: - assert array_equal(v, _char_array) - return True - except Exception as e: - if re.match("^Can't read data", e.args[0]): - return False - else: - raise + if not isinstance(tmp_netcdf, h5py.File) and ( + without_h5pyd or not isinstance(tmp_netcdf, h5pyd.File) + ): + h5 = get_hdf5_module(tmp_netcdf) + # https://github.com/Unidata/netcdf-c/issues/298 + with h5.File(tmp_netcdf, "r") as ds: + return is_h5py_char_working(ds, name) + + v = tmp_netcdf[name] + try: + assert array_equal(v, _char_array) + return True + except Exception as e: + if re.match("^Can't read data", e.args[0]): + return False + else: + raise def write_legacy_netcdf(tmp_netcdf, write_module): @@ -164,17 +173,27 @@ def 
write_legacy_netcdf(tmp_netcdf, write_module): v = ds.createVariable("var_len_str", str, ("x")) v[0] = "foo" + enum_dict = dict(one=1, two=2, three=3, missing=255) + enum_type = ds.createEnumType(np.uint8, "enum_t", enum_dict) + v = ds.createVariable( + "enum_var", + enum_type, + ("x",), + fill_value=enum_dict["missing"], + ) + v[0:3] = [1, 2, 3] + ds.close() -def write_h5netcdf(tmp_netcdf): +def write_h5netcdf(tmp_netcdf, compression="gzip"): ds = h5netcdf.File(tmp_netcdf, "w") ds.attrs["global"] = 42 ds.attrs["other_attr"] = "yes" ds.dimensions = {"x": 4, "y": 5, "z": 6, "empty": 0, "unlimited": None} v = ds.create_variable( - "foo", ("x", "y"), float, chunks=(4, 5), compression="gzip", shuffle=True + "foo", ("x", "y"), float, chunks=(4, 5), compression=compression, shuffle=True ) v[...] = 1 v.attrs["units"] = "meters" @@ -214,6 +233,13 @@ def write_h5netcdf(tmp_netcdf): v = ds.create_variable("var_len_str", ("x",), dtype=dt) v[0] = _vlen_string + enum_dict = dict(one=1, two=2, three=3, missing=255) + enum_type = ds.create_enumtype(np.uint8, "enum_t", enum_dict) + v = ds.create_variable( + "enum_var", ("x",), dtype=enum_type, fillvalue=enum_dict["missing"] + ) + v[0:3] = [1, 2, 3] + ds.close() @@ -231,6 +257,7 @@ def read_legacy_netcdf(tmp_netcdf, read_module, write_module): ) assert set(ds.variables) == set( [ + "enum_var", "foo", "y", "z", @@ -242,6 +269,8 @@ def read_legacy_netcdf(tmp_netcdf, read_module, write_module): ] ) + assert set(ds.enumtypes) == set(["enum_t"]) + assert set(ds.groups) == set(["subgroup"]) assert ds.parent is None v = ds.variables["foo"] @@ -325,6 +354,12 @@ def read_legacy_netcdf(tmp_netcdf, read_module, write_module): assert v.shape == (10,) assert "y" in ds.groups["subgroup"].dimensions + enum_dict = dict(one=1, two=2, three=3, missing=255) + enum_type = ds.enumtypes["enum_t"] + assert enum_type.enum_dict == enum_dict + v = ds.variables["enum_var"] + assert array_equal(v, np.ma.masked_equal([1, 2, 3, 255], 255)) + ds.close() @@ -342,6 +377,7 @@ def read_h5netcdf(tmp_netcdf, write_module, decode_vlen_strings): ) variables = set( [ + "enum_var", "foo", "z", "intscalar", @@ -418,7 +454,7 @@ def read_h5netcdf(tmp_netcdf, write_module, decode_vlen_strings): assert list(v.attrs) == [] v = ds["var_len_str"] - assert h5py.check_dtype(vlen=v.dtype) == str + assert h5py.check_dtype(vlen=v.dtype) is str if getattr(ds, "decode_vlen_strings", True): assert v[0] == _vlen_string else: @@ -440,6 +476,12 @@ def read_h5netcdf(tmp_netcdf, write_module, decode_vlen_strings): assert ds["/subgroup/y_var"].shape == (10,) assert ds["/subgroup"].dimensions["y"].size == 10 + enum_dict = dict(one=1, two=2, three=3, missing=255) + enum_type = ds.enumtypes["enum_t"] + assert enum_type.enum_dict == enum_dict + v = ds.variables["enum_var"] + assert array_equal(v, np.ma.masked_equal([1, 2, 3, 255], 255)) + ds.close() @@ -475,6 +517,11 @@ def test_roundtrip_h5netcdf(tmp_local_or_remote_netcdf, decode_vlen_strings): read_h5netcdf(tmp_local_or_remote_netcdf, h5netcdf, decode_vlen_strings) +def test_write_compression_as_zlib(tmp_local_netcdf): + write_h5netcdf(tmp_local_netcdf, compression="zlib") + read_legacy_netcdf(tmp_local_netcdf, netCDF4, h5netcdf) + + def test_write_netCDF4_read_h5netcdf(tmp_local_netcdf, decode_vlen_strings): write_legacy_netcdf(tmp_local_netcdf, netCDF4) read_h5netcdf(tmp_local_netcdf, netCDF4, decode_vlen_strings) @@ -494,6 +541,16 @@ def test_fileobj(decode_vlen_strings): read_h5netcdf(fileobj, h5netcdf, decode_vlen_strings) +def test_h5py_file_obj(tmp_local_netcdf, 
decode_vlen_strings): + with h5py.File(tmp_local_netcdf, "w") as h5py_f: + write_h5netcdf(h5py_f) + read_h5netcdf(h5py_f, h5netcdf, decode_vlen_strings) + + # The h5py File object should still be open & usable, although the + # h5netcdf file object has been closed. + assert isinstance(h5py_f["foo"], h5py.Dataset) + + def test_repr(tmp_local_or_remote_netcdf): write_h5netcdf(tmp_local_or_remote_netcdf) f = h5netcdf.File(tmp_local_or_remote_netcdf, "a") @@ -645,8 +702,6 @@ def check_invalid_netcdf4(var, i): def test_invalid_netcdf4(tmp_local_or_remote_netcdf): - if tmp_local_or_remote_netcdf.startswith(remote_h5): - pytest.skip("netCDF4 package does not work with remote HDF5 files") h5 = get_hdf5_module(tmp_local_or_remote_netcdf) with h5.File(tmp_local_or_remote_netcdf, "w") as f: var, var2 = create_invalid_netcdf_data() @@ -714,8 +769,6 @@ def check_invalid_netcdf4_mixed(var, i): def test_invalid_netcdf4_mixed(tmp_local_or_remote_netcdf): - if tmp_local_or_remote_netcdf.startswith(remote_h5): - pytest.skip("netCDF4 package does not work with remote HDF5 files") h5 = get_hdf5_module(tmp_local_or_remote_netcdf) with h5.File(tmp_local_or_remote_netcdf, "w") as f: var, var2 = create_invalid_netcdf_data() @@ -790,15 +843,16 @@ def test_hierarchical_access_auto_create(tmp_local_or_remote_netcdf): ds.close() -def test_Netcdf4Dimid(tmp_local_netcdf): +def test_Netcdf4Dimid(tmp_local_or_remote_netcdf): # regression test for https://github.com/h5netcdf/h5netcdf/issues/53 - with h5netcdf.File(tmp_local_netcdf, "w") as f: + with h5netcdf.File(tmp_local_or_remote_netcdf, "w") as f: f.dimensions["x"] = 1 g = f.create_group("foo") g.dimensions["x"] = 2 g.dimensions["y"] = 3 - with h5py.File(tmp_local_netcdf, "r") as f: + h5 = get_hdf5_module(tmp_local_or_remote_netcdf) + with h5.File(tmp_local_or_remote_netcdf, "r") as f: # all dimension IDs should be present exactly once dim_ids = {f[name].attrs["_Netcdf4Dimid"] for name in ["x", "foo/x", "foo/y"]} assert dim_ids == {0, 1, 2} @@ -859,9 +913,6 @@ def test_failed_read_open_and_clean_delete(tmpdir): def test_create_variable_matching_saved_dimension(tmp_local_or_remote_netcdf): h5 = get_hdf5_module(tmp_local_or_remote_netcdf) - # if h5 is not h5py: - # pytest.xfail("https://github.com/h5netcdf/h5netcdf/issues/48") - with h5netcdf.File(tmp_local_or_remote_netcdf, "w") as f: f.dimensions["x"] = 2 f.create_variable("y", data=[1, 2], dimensions=("x",)) @@ -885,11 +936,6 @@ def test_invalid_netcdf_error(tmp_local_or_remote_netcdf): f.create_variable( "lzf_compressed", data=[1], dimensions=("x"), compression="lzf" ) - # invalid - with pytest.raises(h5netcdf.CompatibilityError): - f.create_variable("complex", data=1j) - with pytest.raises(h5netcdf.CompatibilityError): - f.attrs["complex_attr"] = 1j with pytest.raises(h5netcdf.CompatibilityError): f.create_variable("scaleoffset", data=[1], dimensions=("x",), scaleoffset=0) @@ -1192,19 +1238,19 @@ def test_reading_special_datatype_created_with_c_api(tmp_local_netcdf): pass -def test_nc4_non_coord(tmp_local_netcdf): +def test_nc4_non_coord(tmp_local_or_remote_netcdf): # Here we generate a few variables and coordinates # The default should be to track the order of creation # Thus, on reopening the file, the order in which # the variables are listed should be maintained # y -- refers to the coordinate y # _nc4_non_coord_y -- refers to the data y - with h5netcdf.File(tmp_local_netcdf, "w") as f: + with h5netcdf.File(tmp_local_or_remote_netcdf, "w") as f: f.dimensions = {"x": None, "y": 2} f.create_variable("test", 
dimensions=("x",), dtype=np.int64) f.create_variable("y", dimensions=("x",), dtype=np.int64) - with h5netcdf.File(tmp_local_netcdf, "r") as f: + with h5netcdf.File(tmp_local_or_remote_netcdf, "r") as f: assert list(f.dimensions) == ["x", "y"] assert f.dimensions["x"].size == 0 assert f.dimensions["x"].isunlimited() @@ -1213,12 +1259,12 @@ def test_nc4_non_coord(tmp_local_netcdf): assert list(f.variables) == ["test", "y"] assert list(f._h5group.keys()) == ["x", "y", "test", "_nc4_non_coord_y"] - with h5netcdf.File(tmp_local_netcdf, "w") as f: + with h5netcdf.File(tmp_local_or_remote_netcdf, "w") as f: f.dimensions = {"x": None, "y": 2} f.create_variable("y", dimensions=("x",), dtype=np.int64) f.create_variable("test", dimensions=("x",), dtype=np.int64) - with h5netcdf.File(tmp_local_netcdf, "r") as f: + with h5netcdf.File(tmp_local_or_remote_netcdf, "r") as f: assert list(f.dimensions) == ["x", "y"] assert f.dimensions["x"].size == 0 assert f.dimensions["x"].isunlimited() @@ -1306,42 +1352,42 @@ def test_create_attach_scales_legacyapi(tmp_local_netcdf): create_attach_scales(tmp_local_netcdf, legacyapi) -def test_detach_scale(tmp_local_netcdf): - with h5netcdf.File(tmp_local_netcdf, "w") as ds: +def test_detach_scale(tmp_local_or_remote_netcdf): + with h5netcdf.File(tmp_local_or_remote_netcdf, "w") as ds: ds.dimensions["x"] = 2 ds.dimensions["y"] = 2 - with h5netcdf.File(tmp_local_netcdf, "a") as ds: + with h5netcdf.File(tmp_local_or_remote_netcdf, "a") as ds: ds.create_variable("test", dimensions=("x",), dtype=np.int64) - with h5netcdf.File(tmp_local_netcdf, "r") as ds: + with h5netcdf.File(tmp_local_or_remote_netcdf, "r") as ds: refs = ds._h5group["x"].attrs.get("REFERENCE_LIST", False) assert len(refs) == 1 for (ref, dim), name in zip(refs, ["/test"]): assert dim == 0 assert ds._root._h5file[ref].name == name - with h5netcdf.File(tmp_local_netcdf, "a") as ds: + with h5netcdf.File(tmp_local_or_remote_netcdf, "a") as ds: ds.dimensions["x"]._detach_scale() - with h5netcdf.File(tmp_local_netcdf, "r") as ds: + with h5netcdf.File(tmp_local_or_remote_netcdf, "r") as ds: refs = ds._h5group["x"].attrs.get("REFERENCE_LIST", False) assert not refs -def test_is_scale(tmp_local_netcdf): - with legacyapi.Dataset(tmp_local_netcdf, "w") as ds: +def test_is_scale(tmp_local_or_remote_netcdf): + with legacyapi.Dataset(tmp_local_or_remote_netcdf, "w") as ds: ds.createDimension("x", 10) - with legacyapi.Dataset(tmp_local_netcdf, "r") as ds: + with legacyapi.Dataset(tmp_local_or_remote_netcdf, "r") as ds: assert ds.dimensions["x"]._isscale -def test_get_dim_scale_refs(tmp_local_netcdf): - with legacyapi.Dataset(tmp_local_netcdf, "w") as ds: +def test_get_dim_scale_refs(tmp_local_or_remote_netcdf): + with legacyapi.Dataset(tmp_local_or_remote_netcdf, "w") as ds: ds.createDimension("x", 10) ds.createVariable("test0", "i8", ("x",)) ds.createVariable("test1", "i8", ("x",)) - with legacyapi.Dataset(tmp_local_netcdf, "r") as ds: + with legacyapi.Dataset(tmp_local_or_remote_netcdf, "r") as ds: refs = ds.dimensions["x"]._scale_refs assert ds._h5file[refs[0][0]] == ds["test0"]._h5ds assert ds._h5file[refs[1][0]] == ds["test1"]._h5ds @@ -1498,14 +1544,14 @@ def test_dimensions(tmp_local_netcdf, read_write_matrix): ) -def test_no_circular_references(tmp_local_netcdf): +def test_no_circular_references(tmp_local_or_remote_netcdf): # https://github.com/h5py/h5py/issues/2019 - with h5netcdf.File(tmp_local_netcdf, "w") as ds: + with h5netcdf.File(tmp_local_or_remote_netcdf, "w") as ds: ds.dimensions["x"] = 2 
ds.dimensions["y"] = 2 gc.collect() - with h5netcdf.File(tmp_local_netcdf, "r") as ds: + with h5netcdf.File(tmp_local_or_remote_netcdf, "r") as ds: refs = gc.get_referrers(ds) for ref in refs: print(ref) @@ -1713,13 +1759,13 @@ def test_group_names(tmp_local_netcdf): assert ds[name].name == name -def test_legacyapi_endianess(tmp_local_netcdf): +def test_legacyapi_endianess(tmp_local_or_remote_netcdf): # https://github.com/h5netcdf/h5netcdf/issues/15 big = legacyapi._check_return_dtype_endianess("big") little = legacyapi._check_return_dtype_endianess("little") native = legacyapi._check_return_dtype_endianess("native") - with legacyapi.Dataset(tmp_local_netcdf, "w") as ds: + with legacyapi.Dataset(tmp_local_or_remote_netcdf, "w") as ds: ds.createDimension("x", 4) # test creating variable using endian keyword argument v = ds.createVariable("big", int, ("x"), endian="big") @@ -1729,25 +1775,27 @@ def test_legacyapi_endianess(tmp_local_netcdf): v = ds.createVariable("native", int, ("x"), endian="native") v[...] = 65535 - with h5py.File(tmp_local_netcdf, "r") as ds: + h5 = get_hdf5_module(tmp_local_or_remote_netcdf) + with h5.File(tmp_local_or_remote_netcdf, "r") as ds: assert ds["big"].dtype.byteorder == big assert ds["little"].dtype.byteorder == little assert ds["native"].dtype.byteorder == native - with h5netcdf.File(tmp_local_netcdf, "r") as ds: + with h5netcdf.File(tmp_local_or_remote_netcdf, "r") as ds: assert ds["big"].dtype.byteorder == big assert ds["little"].dtype.byteorder == little assert ds["native"].dtype.byteorder == native - with legacyapi.Dataset(tmp_local_netcdf, "r") as ds: + with legacyapi.Dataset(tmp_local_or_remote_netcdf, "r") as ds: assert ds["big"].dtype.byteorder == big assert ds["little"].dtype.byteorder == little assert ds["native"].dtype.byteorder == native - with netCDF4.Dataset(tmp_local_netcdf, "r") as ds: - assert ds["big"].dtype.byteorder == big - assert ds["little"].dtype.byteorder == little - assert ds["native"].dtype.byteorder == native + if not tmp_local_or_remote_netcdf.startswith(remote_h5): + with netCDF4.Dataset(tmp_local_or_remote_netcdf, "r") as ds: + assert ds["big"].dtype.byteorder == big + assert ds["little"].dtype.byteorder == little + assert ds["native"].dtype.byteorder == native def test_bool_slicing_length_one_dim(tmp_local_netcdf): @@ -1781,9 +1829,9 @@ def test_bool_slicing_length_one_dim(tmp_local_netcdf): ds["hello"][bool_slice, :] -def test_fancy_indexing(tmp_local_netcdf): +def test_fancy_indexing(tmp_local_or_remote_netcdf): # regression test for https://github.com/pydata/xarray/issues/7154 - with h5netcdf.legacyapi.Dataset(tmp_local_netcdf, "w") as ds: + with h5netcdf.legacyapi.Dataset(tmp_local_or_remote_netcdf, "w") as ds: ds.createDimension("x", None) ds.createDimension("y", None) ds.createVariable("hello", int, ("x", "y"), fill_value=0) @@ -1791,7 +1839,7 @@ def test_fancy_indexing(tmp_local_netcdf): ds.createVariable("hello2", int, ("x", "y")) ds["hello2"][:10, :20] = np.arange(10 * 20, dtype="int").reshape((10, 20)) - with legacyapi.Dataset(tmp_local_netcdf, "a") as ds: + with legacyapi.Dataset(tmp_local_or_remote_netcdf, "a") as ds: np.testing.assert_array_equal(ds["hello"][1, [7, 8, 9]], [17, 18, 19]) np.testing.assert_array_equal(ds["hello"][1, [9, 10, 11]], [19, 0, 0]) np.testing.assert_array_equal(ds["hello"][1, slice(9, 12)], [19, 0, 0]) @@ -1877,9 +1925,9 @@ def test_h5netcdf_chunking(tmp_local_netcdf): assert chunks_h5netcdf == (5, 5, 5, 10) -def test_create_invalid_netcdf_catch_error(tmp_local_netcdf): +def 
test_create_invalid_netcdf_catch_error(tmp_local_or_remote_netcdf): # see https://github.com/h5netcdf/h5netcdf/issues/138 - with h5netcdf.File(tmp_local_netcdf, "w") as f: + with h5netcdf.File(tmp_local_or_remote_netcdf, "w") as f: try: f.create_variable("test", ("x", "y"), data=np.ones((10, 10), dtype="bool")) except CompatibilityError: @@ -2186,3 +2234,542 @@ def test_ros3(): f = h5netcdf.File(fname, "r", driver="ros3") assert "Temperature" in list(f) f.close() + + +def test_user_type_errors_new_api(tmp_local_or_remote_netcdf): + enum_dict1 = dict(one=1, two=2, three=3, missing=254) + enum_dict2 = dict(one=0, two=2, three=3, missing=255) + with h5netcdf.File("test.nc", "w") as ds0: + enum_type_ext = ds0.create_enumtype(np.uint8, "enum_t", enum_dict1) + with h5netcdf.File(tmp_local_or_remote_netcdf, "w") as ds: + ds.dimensions = {"enum_dim": 4} + g = ds.create_group("subgroup") + enum_type = ds.create_enumtype(np.uint8, "enum_t", enum_dict1) + + if tmp_local_or_remote_netcdf.startswith(remote_h5): + testcontext = pytest.raises(RuntimeError, match="Conflict") + else: + testcontext = pytest.raises( + (KeyError, TypeError), match="name already exists" + ) + with testcontext: + ds.create_enumtype(np.uint8, "enum_t", enum_dict2) + + enum_type2 = g.create_enumtype(np.uint8, "enum_t2", enum_dict2) + g.create_enumtype(np.uint8, "enum_t", enum_dict2) + with pytest.raises(TypeError, match="Please provide h5netcdf user type"): + ds.create_variable( + "enum_var1", + ("enum_dim",), + dtype=enum_type._h5ds, + fillvalue=enum_dict1["missing"], + ) + with pytest.raises(TypeError, match="is not committed into current file"): + ds.create_variable( + "enum_var2", + ("enum_dim",), + dtype=enum_type_ext, + fillvalue=enum_dict1["missing"], + ) + with pytest.raises(TypeError, match="is not accessible in current group"): + ds.create_variable( + "enum_var3", + ("enum_dim",), + dtype=enum_type2, + fillvalue=enum_dict2["missing"], + ) + with pytest.raises(TypeError, match="Another dtype with same name"): + g.create_variable( + "enum_var4", + ("enum_dim",), + dtype=enum_type, + fillvalue=enum_dict2["missing"], + ) + + +def test_user_type_errors_legacyapi(tmp_local_or_remote_netcdf): + enum_dict1 = dict(one=1, two=2, three=3, missing=254) + enum_dict2 = dict(one=0, two=2, three=3, missing=255) + with legacyapi.Dataset("test.nc", "w") as ds0: + enum_type_ext = ds0.createEnumType(np.uint8, "enum_t", enum_dict1) + with legacyapi.Dataset(tmp_local_or_remote_netcdf, "w") as ds: + ds.createDimension("enum_dim", 4) + g = ds.createGroup("subgroup") + enum_type = ds.createEnumType(np.uint8, "enum_t", enum_dict1) + if tmp_local_or_remote_netcdf.startswith(remote_h5): + testcontext = pytest.raises(RuntimeError, match="Conflict") + else: + testcontext = pytest.raises( + (KeyError, TypeError), match="name already exists" + ) + with testcontext: + ds.createEnumType(np.uint8, "enum_t", enum_dict1) + + enum_type2 = g.createEnumType(np.uint8, "enum_t2", enum_dict2) + g.create_enumtype(np.uint8, "enum_t", enum_dict2) + with pytest.raises(TypeError, match="Please provide h5netcdf user type"): + ds.createVariable( + "enum_var1", + enum_type._h5ds, + ("enum_dim",), + fill_value=enum_dict1["missing"], + ) + with pytest.raises(TypeError, match="is not committed into current file"): + ds.createVariable( + "enum_var2", + enum_type_ext, + ("enum_dim",), + fill_value=enum_dict1["missing"], + ) + with pytest.raises(TypeError, match="is not accessible in current group"): + ds.createVariable( + "enum_var3", + enum_type2, + ("enum_dim",), + 
fill_value=enum_dict2["missing"], + ) + with pytest.raises(TypeError, match="Another dtype with same name"): + g.createVariable( + "enum_var4", + enum_type, + ("enum_dim",), + fill_value=enum_dict2["missing"], + ) + + +def test_enum_type_errors_new_api(tmp_local_or_remote_netcdf): + enum_dict1 = dict(one=1, two=2, three=3, missing=254) + enum_dict2 = dict(one=0, two=2, three=3, missing=255) + with h5netcdf.File(tmp_local_or_remote_netcdf, "w") as ds: + ds.dimensions = {"enum_dim": 4} + enum_type = ds.create_enumtype(np.uint8, "enum_t", enum_dict1) + enum_type2 = ds.create_enumtype(np.uint8, "enum_t2", enum_dict2) + + # 1. + with pytest.warns(UserWarning, match="default fill_value 0 which IS defined"): + ds.create_variable( + "enum_var1", + ("enum_dim",), + dtype=enum_type2, + ) + # 2. is for legacyapi only + # 3. + with pytest.warns( + UserWarning, match="default fill_value 0 which IS NOT defined" + ): + ds.create_variable( + "enum_var2", + ("enum_dim",), + dtype=enum_type, + ) + # 4. + with pytest.warns( + UserWarning, match="with specified fill_value 0 which IS NOT" + ): + ds.create_variable( + "enum_var3", + ("enum_dim",), + dtype=enum_type, + fillvalue=0, + ) + # 5. + with pytest.raises( + ValueError, match="with specified fill_value 100 which IS NOT" + ): + ds.create_variable( + "enum_var4", + ("enum_dim",), + dtype=enum_type, + fillvalue=100, + ) + + +def test_enum_type_errors_legacyapi(tmp_local_or_remote_netcdf): + enum_dict1 = dict(one=1, two=2, three=3, missing=254) + enum_dict2 = dict(one=0, two=2, three=3, missing=255) + with legacyapi.Dataset(tmp_local_or_remote_netcdf, "w") as ds: + ds.createDimension("enum_dim", 4) + enum_type = ds.createEnumType(np.uint8, "enum_t", enum_dict1) + enum_type2 = ds.createEnumType(np.uint8, "enum_t2", enum_dict2) + + # 1. + with pytest.warns(UserWarning, match="default fill_value 255 which IS defined"): + ds.createVariable( + "enum_var1", + enum_type2, + ("enum_dim",), + ) + # 2. + with pytest.raises(ValueError, match="default fill_value 255 which IS NOT"): + ds.createVariable( + "enum_var2", + enum_type, + ("enum_dim",), + ) + # 3. is only for new api + # 4. + with pytest.warns( + UserWarning, match="interpreted as '_UNDEFINED' by netcdf-c." + ): + ds.createVariable( + "enum_var3", + enum_type, + ("enum_dim",), + fill_value=0, + ) + # 5. 
+ with pytest.raises( + ValueError, match="with specified fill_value 100 which IS NOT" + ): + ds.createVariable("enum_var4", enum_type, ("enum_dim",), fill_value=100) + + +def test_enum_type(tmp_local_or_remote_netcdf): + # test EnumType + enum_dict = dict(one=1, two=2, three=3, missing=255) + enum_dict2 = dict(one=1, two=2, three=3, missing=254) + + # first with new API + with h5netcdf.File(tmp_local_or_remote_netcdf, "w") as ds: + ds.dimensions = {"enum_dim": 4} + ds.create_enumtype(np.uint8, "enum_t2", enum_dict2) + enum_type = ds.create_enumtype(np.uint8, "enum_t", enum_dict) + v = ds.create_variable( + "enum_var", ("enum_dim",), dtype=enum_type, fillvalue=enum_dict["missing"] + ) + v[0:3] = [1, 2, 3] + with pytest.raises(ValueError) as e: + v[3] = 5 + assert "assign illegal value(s)" in e.value.args[0] + + # check, if new API can read them + with h5netcdf.File(tmp_local_or_remote_netcdf, "r") as ds: + enum_type = ds.enumtypes["enum_t"] + enum_var = ds["enum_var"] + assert enum_type.enum_dict == enum_dict + assert array_equal(enum_var, np.ma.masked_equal([1, 2, 3, 255], 255)) + assert enum_var.attrs["_FillValue"] == 255 + assert enum_var.datatype == enum_type + assert enum_var.datatype.name == "enum_t" + + # check if legacyapi can read them + with legacyapi.Dataset(tmp_local_or_remote_netcdf, "r") as ds: + enum_type = ds.enumtypes["enum_t"] + enum_var = ds["enum_var"] + assert enum_type.enum_dict == enum_dict + assert array_equal(enum_var, np.ma.masked_equal([1, 2, 3, 255], 255)) + assert enum_var.attrs["_FillValue"] == 255 + assert enum_var.datatype == enum_type + assert enum_var.datatype.name == "enum_t" + + if not tmp_local_or_remote_netcdf.startswith(remote_h5): + # check if netCDF4-python can read them + with netCDF4.Dataset(tmp_local_or_remote_netcdf, "r") as ds: + enum_type = ds.enumtypes["enum_t"] + enum_var = ds["enum_var"] + assert enum_type.enum_dict == enum_dict + assert array_equal(enum_var, np.ma.masked_equal([1, 2, 3, 255], 255)) + assert enum_var._FillValue == 255 + assert repr(enum_var.datatype) == repr(enum_type) + assert enum_var.datatype.name == "enum_t" + + # second with legacyapi + with legacyapi.Dataset(tmp_local_or_remote_netcdf, "w") as ds: + ds.createDimension("enum_dim", 4) + enum_type = ds.createEnumType(np.uint8, "enum_t", enum_dict) + v = ds.createVariable( + "enum_var", enum_type, ("enum_dim",), fill_value=enum_dict["missing"] + ) + v[0:3] = [1, 2, 3] + with pytest.raises(ValueError) as e: + v[3] = 5 + assert "assign illegal value(s)" in e.value.args[0] + + # check, if new API can read them + with h5netcdf.File(tmp_local_or_remote_netcdf, "r") as ds: + enum_type = ds.enumtypes["enum_t"] + enum_var = ds["enum_var"] + assert enum_type.enum_dict == enum_dict + assert array_equal(enum_var, np.ma.masked_equal([1, 2, 3, 255], 255)) + assert enum_var.attrs["_FillValue"] == 255 + assert enum_var.datatype == enum_type + assert enum_var.datatype.name == "enum_t" + + # check if legacyapi can read them + with legacyapi.Dataset(tmp_local_or_remote_netcdf, "r") as ds: + enum_type = ds.enumtypes["enum_t"] + enum_var = ds["enum_var"] + assert enum_type.enum_dict == enum_dict + assert array_equal(enum_var, np.ma.masked_equal([1, 2, 3, 255], 255)) + assert enum_var.attrs["_FillValue"] == 255 + assert enum_var.datatype == enum_type + assert enum_var.datatype.name == "enum_t" + + if not tmp_local_or_remote_netcdf.startswith(remote_h5): + # check if netCDF4-python can read them + with netCDF4.Dataset(tmp_local_or_remote_netcdf, "r") as ds: + enum_type = ds.enumtypes["enum_t"] 
+ enum_var = ds["enum_var"] + assert enum_type.enum_dict == enum_dict + assert array_equal(enum_var, np.ma.masked_equal([1, 2, 3, 255], 255)) + assert enum_var._FillValue == 255 + assert repr(enum_var.datatype) == repr(enum_type) + assert enum_var.datatype.name == "enum_t" + + if not tmp_local_or_remote_netcdf.startswith(remote_h5): + # third with netCDF4 api + with netCDF4.Dataset(tmp_local_or_remote_netcdf, "w") as ds: + ds.createDimension("enum_dim", 4) + enum_type = ds.createEnumType(np.uint8, "enum_t", enum_dict) + v = ds.createVariable( + "enum_var", enum_type, ("enum_dim",), fill_value=enum_dict["missing"] + ) + v[0:3] = [1, 2, 3] + with pytest.raises( + ValueError, match="assign illegal value to Enum variable" + ): + v[3] = 5 + + # check, if new API can read them + with h5netcdf.File(tmp_local_or_remote_netcdf, "r") as ds: + enum_type = ds.enumtypes["enum_t"] + enum_var = ds["enum_var"] + assert enum_type.enum_dict == enum_dict + assert array_equal(enum_var, np.ma.masked_equal([1, 2, 3, 255], 255)) + assert enum_var.attrs["_FillValue"] == 255 + assert enum_var.datatype == enum_type + assert enum_var.datatype.name == "enum_t" + + # check if legacyapi can read them + with legacyapi.Dataset(tmp_local_or_remote_netcdf, "r") as ds: + enum_type = ds.enumtypes["enum_t"] + enum_var = ds["enum_var"] + assert enum_type.enum_dict == enum_dict + assert array_equal(enum_var, np.ma.masked_equal([1, 2, 3, 255], 255)) + assert enum_var.attrs["_FillValue"] == 255 + assert enum_var.datatype == enum_type + assert enum_var.datatype.name == "enum_t" + + # check if netCDF4-python can read them + with netCDF4.Dataset(tmp_local_or_remote_netcdf, "r") as ds: + enum_type = ds.enumtypes["enum_t"] + enum_var = ds["enum_var"] + assert enum_type.enum_dict == enum_dict + assert array_equal(enum_var, np.ma.masked_equal([1, 2, 3, 255], 255)) + assert enum_var._FillValue == 255 + assert repr(enum_var.datatype) == repr(enum_type) + assert enum_var.datatype.name == "enum_t" + + +@pytest.mark.parametrize("dtype", ["int", "int8", "uint16", "float32", "int64"]) +def test_vltype_creation(tmp_local_or_remote_netcdf, netcdf_write_module, dtype): + # skip for netCDF4 writer for remote hsds files + if netcdf_write_module == netCDF4 and tmp_local_or_remote_netcdf.startswith( + remote_h5 + ): + pytest.skip() + + with netcdf_write_module.Dataset(tmp_local_or_remote_netcdf, "w") as ds: + ds.createVLType(dtype, "vlen_t") + + with h5netcdf.File(tmp_local_or_remote_netcdf, "r") as ds: + vlen_type = ds.vltypes["vlen_t"] + assert isinstance(vlen_type, VLType) + assert h5py.check_vlen_dtype(vlen_type.dtype) == np.dtype(dtype) + assert vlen_type.name == "vlen_t" + + with legacyapi.Dataset(tmp_local_or_remote_netcdf, "r") as ds: + vlen_type = ds.vltypes["vlen_t"] + assert isinstance(vlen_type, legacyapi.VLType) + assert h5py.check_vlen_dtype(vlen_type.dtype) == np.dtype(dtype) + assert vlen_type.name == "vlen_t" + + if not tmp_local_or_remote_netcdf.startswith(remote_h5): + with netCDF4.Dataset(tmp_local_or_remote_netcdf, "r") as ds: + vlen_type = ds.vltypes["vlen_t"] + assert isinstance(vlen_type, netCDF4.VLType) + assert vlen_type.dtype == np.dtype(dtype) + assert vlen_type.name == "vlen_t" + + +def test_compoundtype_creation(tmp_local_or_remote_netcdf, netcdf_write_module): + # compound type is created with array of chars + compound = np.dtype( + [ + ("time", np.int32), + ("station_name", "S1", 10), + ("temperature", np.float32), + ("pressure", np.float32), + ] + ) + + # data is filled with fixed strings + compound2 = np.dtype( + [ + 
("time", np.int32), + ("station_name", "S10"), + ("temperature", np.float32), + ("pressure", np.float32), + ] + ) + cmp_array = np.array( + [ + (0, *["Boulder"], 0.0, 0.0), + (1, *["New York"], 2.0, 3.0), + (2, *["Denver"], 4.0, 6.0), + (3, *["Washington"], 5.0, 7.0), + (4, *["Wachtberg"], 6.0, 8.0), + ], + dtype=compound2, + ) + if ( + netcdf_write_module.__name__ == "netCDF4" + and tmp_local_or_remote_netcdf.startswith(remote_h5) + ): + pytest.skip("does not work for netCDF4") + with netcdf_write_module.Dataset(tmp_local_or_remote_netcdf, "w") as ds: + ds.createDimension("x", 5) + ds.createGroup("test") + compound_t = ds.createCompoundType(compound, "cmp_t") + var = ds.createVariable("data", compound_t, ("x",)) + var[:] = cmp_array + + if not tmp_local_or_remote_netcdf.startswith(remote_h5): + with netCDF4.Dataset(tmp_local_or_remote_netcdf, "r") as ds: + cmptype = ds.cmptypes["cmp_t"] + assert isinstance(cmptype, netCDF4.CompoundType) + assert cmptype.name == "cmp_t" + assert array_equal(ds["data"][:], cmp_array) + assert ds["data"].datatype == cmptype.dtype + + with legacyapi.Dataset(tmp_local_or_remote_netcdf, "r") as ds: + cmptype = ds.cmptypes["cmp_t"] + assert isinstance(cmptype, h5netcdf.legacyapi.CompoundType) + assert cmptype.name == "cmp_t" + assert array_equal(ds["data"][:], cmp_array) + assert ds["data"].datatype == cmptype + assert ds["data"].dtype == cmptype.dtype + + +@pytest.mark.skipif( + version.parse(netCDF4.__version__) < version.parse("1.7.0"), + reason="does not work before netCDF4 v1.7.0", +) +def test_nc_complex_compatibility(tmp_local_or_remote_netcdf, netcdf_write_module): + if tmp_local_or_remote_netcdf.startswith(remote_h5): + pytest.skip("not yet implemented in h5pyd/hsds") + # native complex + complex_array = np.array([0 + 0j, 1 + 0j, 0 + 1j, 1 + 1j, 0.25 + 0.75j]) + # compound complex + complex128 = np.dtype( + { + "names": ["r", "i"], + "formats": ["f8", "f8"], + "offsets": [0, 8], + "itemsize": 16, + "aligned": True, + } + ) + cdata = np.array( + [(0.0, 0.0), (1.0, 0.0), (0.0, 1.0), (1.0, 1.0), (0.25, 0.75)], dtype=complex128 + ) + kwargs = {} + if ( + netcdf_write_module.__name__ == "netCDF4" + and tmp_local_or_remote_netcdf.startswith(remote_h5) + ): + pytest.skip("does not work for netCDF4") + + if netcdf_write_module.__name__ == "netCDF4": + kwargs.update(auto_complex=True) + with netcdf_write_module.Dataset(tmp_local_or_remote_netcdf, "w", **kwargs) as ds: + ds.createDimension("x", size=len(complex_array)) + var = ds.createVariable("data", "c16", ("x",)) + var[:] = complex_array + + with legacyapi.Dataset(tmp_local_or_remote_netcdf, "r") as ds: + dtype = ds.cmptypes["_PFNC_DOUBLE_COMPLEX_TYPE"] + assert isinstance(dtype, h5netcdf.legacyapi.CompoundType) + assert dtype.name == "_PFNC_DOUBLE_COMPLEX_TYPE" + assert array_equal(ds["data"][:], complex_array) + + if not tmp_local_or_remote_netcdf.startswith(remote_h5): + with netCDF4.Dataset(tmp_local_or_remote_netcdf, "r", auto_complex=True) as ds: + dtype = ds.cmptypes["_PFNC_DOUBLE_COMPLEX_TYPE"] + assert isinstance(dtype, netCDF4._netCDF4.CompoundType) + assert array_equal(ds["data"][:], complex_array) + + with netCDF4.Dataset(tmp_local_or_remote_netcdf, "r", auto_complex=False) as ds: + dtype = ds.cmptypes["_PFNC_DOUBLE_COMPLEX_TYPE"] + assert isinstance(dtype, netCDF4._netCDF4.CompoundType) + assert array_equal(ds["data"][:], cdata) + + +@pytest.mark.skipif( + version.parse(netCDF4.__version__) < version.parse("1.7.0"), + reason="does not work before netCDF4 v1.7.0", +) +def 
test_complex_type_creation_errors(tmp_local_netcdf): + complex_array = np.array([0 + 0j, 1 + 0j, 0 + 1j, 1 + 1j, 0.25 + 0.75j]) + + with legacyapi.Dataset(tmp_local_netcdf, "w") as ds: + ds.createDimension("x", size=len(complex_array)) + with pytest.raises(TypeError, match="data type 'c4' not understood"): + ds.createVariable("data", "c4", ("x",)) + + if "complex256" not in np.sctypeDict: + pytest.skip("numpy 'complex256' dtype not available") + with legacyapi.Dataset(tmp_local_netcdf, "w") as ds: + ds.createDimension("x", size=len(complex_array)) + with pytest.raises( + TypeError, + match="Currently only 'complex64' and 'complex128' dtypes are allowed.", + ): + ds.createVariable("data", "c32", ("x",)) + + +def test_hsds(hsds_up): + # test hsds setup/write + if without_h5pyd: + pytest.skip("h5pyd package not available") + elif not hsds_up: + pytest.skip("HSDS service not running") + rnd = "".join(random.choice(string.ascii_uppercase) for _ in range(5)) + fname = f"hdf5://testfile{rnd}.nc" + with h5netcdf.File(fname, "w") as ds: + g = ds.create_group("test") + g.dimensions["x"] = None + g.create_variable("var1", ("x",), dtype="i8") + + with h5netcdf.File(fname, "r") as ds: + print(ds["test"]["var1"]) + + +def test_h5pyd_driver(hsds_up): + # test that specifying driver='h5pyd' forces use of h5pyd + if without_h5pyd: + pytest.skip("h5pyd package not available") + elif not hsds_up: + pytest.skip("HSDS service not running") + rnd = "".join(random.choice(string.ascii_uppercase) for _ in range(5)) + for prefix in ("/", "hdf5://"): + fname = f"{prefix}testfile{rnd}.nc" + with h5netcdf.File(fname, "w", driver="h5pyd") as ds: + assert ds._h5py == h5pyd + assert isinstance(ds._h5file, h5pyd.File) + + +def test_h5pyd_nonchunked_scalars(hsds_up): + if without_h5pyd: + pytest.skip("h5pyd package not available") + elif not hsds_up: + pytest.skip("HSDS service not running") + rnd = "".join(random.choice(string.ascii_uppercase) for _ in range(5)) + fname = f"hdf5://testfile{rnd}.nc" + with h5pyd.File(fname, "w") as ds: + ds.create_dataset("foo", data=b"1234") + with h5netcdf.File(fname, "r", driver="h5pyd") as ds: + # HSDS stores this as a chunked dataset, but only with a single chunk + assert ds["foo"]._h5ds.chunks == (1,) + # However, since it is a scalar dataset, we should not expose the chunking + assert ds["foo"].chunks is None diff --git a/pyproject.toml b/pyproject.toml index 24df457..feeb384 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,7 @@ classifiers=[ "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Topic :: Scientific/Engineering", ] dependencies = ["h5py", "packaging"] @@ -69,12 +70,12 @@ exclude = [ # E402: module level import not at top of file # E501: line too long - let black worry about that # E731: do not assign a lambda expression, use a def -ignore = [ +lint.ignore = [ "E402", "E501", "E731", ] -select = [ +lint.select = [ # Pyflakes "F", # Pycodestyle @@ -86,5 +87,5 @@ select = [ "UP", ] -[tool.ruff.isort] +[tool.ruff.lint.isort] known-first-party = ["h5netcdf"]
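
For the new-API features exercised by the added tests (the "zlib" alias for gzip compression and enum user types), a minimal sketch under the same assumptions; file name and values are illustrative, not part of the diff:

    import numpy as np
    import h5netcdf

    with h5netcdf.File("features.nc", "w") as ds:
        ds.dimensions = {"x": 4}
        # "zlib" is accepted as an alias for "gzip"
        v = ds.create_variable("foo", ("x",), float, compression="zlib")
        v[...] = 1.0
        # enum user type: base dtype, type name, name -> value mapping
        enum_t = ds.create_enumtype(
            np.uint8, "enum_t", dict(one=1, two=2, three=3, missing=255)
        )
        ev = ds.create_variable("enum_var", ("x",), dtype=enum_t, fillvalue=255)
        ev[0:3] = [1, 2, 3]

    with h5netcdf.File("features.nc", "r") as ds:
        # the unwritten fourth element reads back as the fill value (255)
        print(ds["enum_var"][...])

As the h5pyd tests above show, the same API can also target an HSDS server by passing driver="h5pyd" (e.g. h5netcdf.File("hdf5://somefile.nc", "w", driver="h5pyd"), with an illustrative path), which forces the h5pyd backend instead of h5py.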