Skip to content

Commit b396efe

Browse files
authored
Merge pull request #927 from davidhassell/h5netcdf-write-from-parallel-read
New 'h5netcdf-h5py' netCDF backend in `cf.write`, and 'h5py_options' parameter
2 parents 36613f4 + 8b2dbc7 commit b396efe

File tree

6 files changed

+148
-21
lines changed

6 files changed

+148
-21
lines changed

Changelog.rst

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,10 @@ Version NEXTVERSION
33

44
**2026-??-??**
55

6+
* New default backend for netCDF-4 in `cf.write`: ``h5netcdf-h5py``,
7+
that allows control of the internal file metadata via the new
8+
``h5py_options`` parameter
9+
(https://github.com/NCAS-CMS/cf-python/issues/924)
610
* New default backend for netCDF-4 in `cf.read` that allows parallel
711
reading: ``h5netcdf-pyfive``
812
(https://github.com/NCAS-CMS/cf-python/issues/912)
@@ -13,8 +17,6 @@ Version NEXTVERSION
1317
* Fix for subspacing with cyclic `cf.wi` and `cf.wo` arguments
1418
(https://github.com/NCAS-CMS/cf-python/issues/887)
1519

16-
----
17-
1820
Version 3.19.0
1921
--------------
2022

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ of its array manipulation and can:
8888
choice of netCDF backends, and in local, http, and s3 locations,
8989
* create new field constructs in memory,
9090
* write and append field and domain constructs to netCDF and Zarr v3
91-
datasets on disk,
91+
datasets on disk, with control over HDF5 internal file metadata,
9292
* read, create, and manipulate UGRID mesh topologies,
9393
* read, write, and create coordinates defined by geometry cells,
9494
* read netCDF and CDL datasets containing hierarchical groups,

cf/test/test_quantization.py

Lines changed: 40 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import datetime
33
import faulthandler
44
import os
5+
import shutil
56
import tempfile
67
import unittest
78

@@ -20,6 +21,13 @@
2021
]
2122
[tmpfile1, tmpfile2] = tmpfiles
2223

24+
# Set up temporary directories
25+
tmpdirs = [
26+
tempfile.mkdtemp("_test_quantization.zarr", dir=os.getcwd())
27+
for i in range(1)
28+
]
29+
[tmpdir] = tmpdirs
30+
2331

2432
def _remove_tmpfiles():
2533
"""Remove temporary files created during tests."""
@@ -29,6 +37,13 @@ def _remove_tmpfiles():
2937
except OSError:
3038
pass
3139

40+
for d in tmpdirs:
41+
try:
42+
shutil.rmtree(d)
43+
os.rmdir(d)
44+
except OSError:
45+
pass
46+
3247

3348
atexit.register(_remove_tmpfiles)
3449

@@ -63,7 +78,7 @@ def test_quantization_read_write(self):
6378
f.set_quantize_on_write(q0)
6479

6580
# Write the field and read it back in
66-
cf.write(f, tmpfile1)
81+
cf.write(f, tmpfile1, netcdf_backend="netCDF4")
6782
g = cf.read(tmpfile1)[0]
6883

6984
# Check that f and g have different data (i.e. that
@@ -174,7 +189,7 @@ def test_quantization_write_exceptions(self):
174189
# digit_round
175190
f.set_quantize_on_write(algorithm="digitround", quantization_nsd=2)
176191
with self.assertRaises(ValueError):
177-
cf.write(f, tmpfile1)
192+
cf.write(f, tmpfile1, netcdf_backend="netCDF4")
178193

179194
# NetCDF3 formats
180195
for fmt in self.netcdf3_fmts:
@@ -184,29 +199,29 @@ def test_quantization_write_exceptions(self):
184199
# Integer data type
185200
f.data.dtype = int
186201
with self.assertRaises(ValueError):
187-
cf.write(f, tmpfile1)
202+
cf.write(f, tmpfile1, netcdf_backend="netCDF4")
188203

189204
# Out-of-range quantization_nsd
190205
f.data.dtype = "float32"
191206
f.set_quantize_on_write(algorithm="bitgroom", quantization_nsd=8)
192207
with self.assertRaises(ValueError):
193-
cf.write(f, tmpfile1)
208+
cf.write(f, tmpfile1, netcdf_backend="netCDF4")
194209

195210
f.data.dtype = "float64"
196211
f.set_quantize_on_write(algorithm="bitgroom", quantization_nsd=16)
197212
with self.assertRaises(ValueError):
198-
cf.write(f, tmpfile1)
213+
cf.write(f, tmpfile1, netcdf_backend="netCDF4")
199214

200215
# Out-of-range quantization_nsb
201216
f.data.dtype = "float32"
202217
f.set_quantize_on_write(algorithm="bitround", quantization_nsb=24)
203218
with self.assertRaises(ValueError):
204-
cf.write(f, tmpfile1)
219+
cf.write(f, tmpfile1, netcdf_backend="netCDF4")
205220

206221
f.data.dtype = "float64"
207222
f.set_quantize_on_write(algorithm="bitround", quantization_nsb=53)
208223
with self.assertRaises(ValueError):
209-
cf.write(f, tmpfile1)
224+
cf.write(f, tmpfile1, netcdf_backend="netCDF4")
210225

211226
def test_quantization_copy(self):
212227
"""Test that quantization information gets copied."""
@@ -222,6 +237,24 @@ def test_quantization_copy(self):
222237
g = f.copy()
223238
self.assertTrue(g.get_quantization().equals(q))
224239

240+
def test_quantization_backends(self):
241+
"""Test quantization-on-write with different backends."""
242+
f = self.f1.copy()
243+
f.set_quantize_on_write(
244+
algorithm="granular_bitround", quantization_nsd=8
245+
)
246+
247+
# Backends that allow quantisation-on-write
248+
for backend in ("netCDF4",):
249+
cf.write(f, tmpfile1, netcdf_backend=backend)
250+
251+
# Backends that do not allow quantisation-on-write
252+
with self.assertRaises(NotImplementedError):
253+
cf.write(f, tmpdir, fmt="ZARR3", netcdf_backend="zarr")
254+
255+
with self.assertRaises(NotImplementedError):
256+
cf.write(f, tmpfile1, netcdf_backend="h5netcdf-h5py")
257+
225258

226259
if __name__ == "__main__":
227260
print("Run date:", datetime.datetime.now())

cf/test/test_read_write.py

Lines changed: 101 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -352,7 +352,13 @@ def test_write_netcdf_mode(self):
352352
if ex_field_n in (8, 9, 10):
353353
continue
354354

355-
cf.write(ex_field, tmpfile, fmt=fmt, mode="a")
355+
cf.write(
356+
ex_field,
357+
tmpfile,
358+
fmt=fmt,
359+
mode="a",
360+
netcdf_backend="netCDF4",
361+
)
356362
f = cf.read(tmpfile)
357363

358364
if ex_field_n == 5: # another special case
@@ -433,15 +439,25 @@ def test_write_netcdf_mode(self):
433439
# field n=5 which aggregates to one with n=2] => + 1 - 1 = + 0:
434440
overall_length = len(append_ex_fields)
435441
cf.write(
436-
append_ex_fields, tmpfile, fmt=fmt, mode="a"
442+
append_ex_fields,
443+
tmpfile,
444+
fmt=fmt,
445+
mode="a",
446+
netcdf_backend="netCDF4",
437447
) # 2. now append
438448
f = cf.read(tmpfile)
439449
self.assertEqual(len(f), overall_length)
440450

441451
# Also test the mode="r+" alias for mode="a".
442-
cf.write(g, tmpfile, fmt=fmt, mode="w") # 1. overwrite to wipe
443452
cf.write(
444-
append_ex_fields, tmpfile, fmt=fmt, mode="r+"
453+
g, tmpfile, fmt=fmt, mode="w", netcdf_backend="netCDF4"
454+
) # 1. overwrite to wipe
455+
cf.write(
456+
append_ex_fields,
457+
tmpfile,
458+
fmt=fmt,
459+
mode="r+",
460+
netcdf_backend="netCDF4",
445461
) # 2. now append
446462
f = cf.read(tmpfile)
447463
self.assertEqual(len(f), overall_length)
@@ -543,8 +559,12 @@ def test_write_netcdf_mode(self):
543559
# )
544560

545561
# Check behaviour when appending identical fields, as an edge case:
546-
cf.write(g, tmpfile, fmt=fmt, mode="w") # 1. overwrite to wipe
547-
cf.write(g_copy, tmpfile, fmt=fmt, mode="a") # 2. now append
562+
cf.write(
563+
g, tmpfile, fmt=fmt, mode="w", netcdf_backend="netCDF4"
564+
) # 1. overwrite to wipe
565+
cf.write(
566+
g_copy, tmpfile, fmt=fmt, mode="a", netcdf_backend="netCDF4"
567+
) # 2. now append
548568
f = cf.read(tmpfile)
549569
self.assertEqual(len(f), 2)
550570
self.assertTrue(
@@ -849,7 +869,7 @@ def test_write_omit_data(self):
849869
f = self.f1
850870
cf.write(f, tmpfile)
851871

852-
cf.write(f, tmpfile, omit_data="all")
872+
cf.write(f, tmpfile, omit_data="all", netcdf_backend="netCDF4")
853873
g = cf.read(tmpfile)
854874
self.assertEqual(len(g), 1)
855875
g = g[0]
@@ -861,7 +881,12 @@ def test_write_omit_data(self):
861881
# Check that a dump works
862882
g.dump(display=False)
863883

864-
cf.write(f, tmpfile, omit_data=("field", "dimension_coordinate"))
884+
cf.write(
885+
f,
886+
tmpfile,
887+
omit_data=("field", "dimension_coordinate"),
888+
netcdf_backend="netCDF4",
889+
)
865890
g = cf.read(tmpfile)[0]
866891

867892
# Check that only the field and dimension coordinate data are
@@ -870,7 +895,7 @@ def test_write_omit_data(self):
870895
self.assertFalse(np.ma.count(g.construct("grid_latitude").array))
871896
self.assertTrue(np.ma.count(g.construct("latitude").array))
872897

873-
cf.write(f, tmpfile, omit_data="field")
898+
cf.write(f, tmpfile, omit_data="field", netcdf_backend="netCDF4")
874899
g = cf.read(tmpfile)[0]
875900

876901
# Check that only the field data are missing
@@ -956,6 +981,73 @@ def test_read_zarr(self):
956981
z = cf.read(zarr_dataset, dataset_type="Zarr")
957982
self.assertEqual(len(z), 1)
958983

984+
def test_write_netcdf_backend(self):
985+
"""Test cf.write with different netCDF backends."""
986+
f = self.f0
987+
988+
cf.write(f, tmpfile0, netcdf_backend="h5netcdf-h5py")
989+
cf.write(f, tmpfile1, netcdf_backend="netCDF4")
990+
f0 = cf.read(tmpfile0)[0]
991+
f1 = cf.read(tmpfile1)[0]
992+
self.assertTrue(f1.equals(f0))
993+
994+
f = cf.read(filename)
995+
cf.write(f, tmpfile0, netcdf_backend="h5netcdf-h5py")
996+
cf.write(f, tmpfile1, netcdf_backend="netCDF4")
997+
f0 = cf.read(tmpfile0)[0]
998+
f1 = cf.read(tmpfile1)[0]
999+
self.assertTrue(f1.equals(f0))
1000+
1001+
# Bad fmt/backend combinations
1002+
for backend in ("netCDF4", "h5netcdf-h5py"):
1003+
with self.assertRaises(ValueError):
1004+
cf.write(f, tmpfile, fmt="ZARR3", netcdf_backend=backend)
1005+
1006+
for backend in ("zarr", "h5netcdf-h5py"):
1007+
with self.assertRaises(ValueError):
1008+
cf.write(
1009+
f, tmpfile, fmt="NETCDF3_CLASSIC", netcdf_backend=backend
1010+
)
1011+
1012+
for backend in ("zarr",):
1013+
with self.assertRaises(ValueError):
1014+
cf.write(f, tmpfile, fmt="NETCDF4", netcdf_backend=backend)
1015+
1016+
def test_write_h5py_options(self):
1017+
"""Test cf.write with h5py_options."""
1018+
f = self.f0
1019+
h5py_options = dict(
1020+
fs_strategy="page", fs_page_size=2**20, meta_block_size=500000
1021+
)
1022+
1023+
cf.write(f, tmpfile0, h5py_options=None)
1024+
size = os.path.getsize(tmpfile0)
1025+
1026+
cf.write(
1027+
f,
1028+
tmpfile1,
1029+
netcdf_backend="h5netcdf-h5py",
1030+
h5py_options=h5py_options,
1031+
)
1032+
self.assertTrue(os.path.getsize(tmpfile1) > size)
1033+
1034+
f0 = cf.read(tmpfile0)[0]
1035+
f1 = cf.read(tmpfile1)[0]
1036+
self.assertTrue(f1.equals(f0))
1037+
1038+
with self.assertRaises(ValueError):
1039+
cf.write(
1040+
f,
1041+
tmpfile0,
1042+
netcdf_backend="netCDF4",
1043+
h5py_options=h5py_options,
1044+
)
1045+
1046+
with self.assertRaises(ValueError):
1047+
cf.write(
1048+
f, tmpfile0, fmt="NETCDF3_CLASSIC", h5py_options=h5py_options
1049+
)
1050+
9591051
def test_read_netcdf_file(self):
9601052
"""Test cf.read with the netcdf_file backend."""
9611053
f = self.f0

docs/source/introduction.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ The `cf` package can:
7777
* create new field constructs in memory,
7878

7979
* write and append field and domain constructs to netCDF and Zarr
80-
datasets on disk,
80+
datasets on disk, with control over HDF5 internal file metadata,
8181

8282
* read, write, and manipulate UGRID mesh topologies,
8383

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ def compile():
184184
185185
* create new field constructs in memory,
186186
187-
* write and append field and domain constructs to netCDF and Zarr v3 datasets on disk,
187+
* write and append field and domain constructs to netCDF and Zarr v3 datasets on disk, with control over HDF5 internal file metadata,
188188
189189
* read, write, and create coordinates defined by geometry cells,
190190

0 commit comments

Comments
 (0)