Skip to content

Commit a4d5656

Browse files
authored
Merge pull request #296 from onalant/shape
Add support for shapes in dtype definitions
2 parents 2bc8e69 + 5c95aaa commit a4d5656

File tree

7 files changed

+215
-25
lines changed

7 files changed

+215
-25
lines changed

docs/release.rst

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,23 @@ Release notes
66
2.3.0 (Work in Progress)
77
------------------------
88

9+
Enhancements
10+
~~~~~~~~~~~~
11+
12+
* Support has been added for structured arrays with sub-array shape and/or nested fields. By
13+
:user:`Tarik Onalan <onalant>`, :issue:`111`, :issue:`296`.
14+
915
Maintenance
1016
~~~~~~~~~~~
1117

1218
* CI and test environments have been upgraded to include Python 3.7, drop Python 3.4, and
13-
upgrade all package requirements. :issue:`308`.
19+
upgrade all pinned package requirements. :issue:`308`.
1420

21+
* Failing tests related to pickling/unpickling have been fixed. By :user:`Ryan Williams <ryan-williams>`,
22+
:issue:`273`, :issue:`308`.
23+
24+
Acknowledgments
25+
~~~~~~~~~~~~~~~
1526

1627
.. _release_2.2.0:
1728

docs/spec/v2.rst

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -140,13 +140,31 @@ the `NumPy documentation on Datetimes and Timedeltas
140140
<https://docs.scipy.org/doc/numpy/reference/arrays.datetime.html#datetime-units>`_.
141141
For example, ``"<M8[ns]"`` specifies a datetime64 data type with nanosecond time units.
142142

143-
Structured data types (i.e., with multiple named fields) are encoded as a list
144-
of two-element lists, following `NumPy array protocol type descriptions (descr)
145-
<http://docs.scipy.org/doc/numpy/reference/arrays.interface.html#>`_. For
146-
example, the JSON list ``[["r", "|u1"], ["g", "|u1"], ["b", "|u1"]]`` defines a
147-
data type composed of three single-byte unsigned integers labelled "r", "g" and
148-
"b".
149-
143+
Structured data types (i.e., with multiple named fields) are encoded
144+
as a list of lists, following `NumPy array protocol type descriptions
145+
(descr)
146+
<http://docs.scipy.org/doc/numpy/reference/arrays.interface.html#>`_. Each
147+
sub-list has the form ``[fieldname, datatype, shape]`` where ``shape``
148+
is optional. ``fieldname`` is a string, ``datatype`` is a string
149+
specifying a simple data type (see above) or, for a nested structure, a list of such sub-lists (see below), and ``shape`` is a list of
150+
integers specifying subarray shape. For example, the JSON list below
151+
defines a data type composed of three single-byte unsigned integer
152+
fields named "r", "g" and "b"::
153+
154+
[["r", "|u1"], ["g", "|u1"], ["b", "|u1"]]
155+
156+
As a further example, the JSON list below defines a data type composed of three
157+
fields named "x", "y" and "z", where "x" and "y" each contain 32-bit
158+
floats, and each item in "z" is a 2 by 2 array of floats::
159+
160+
[["x", "<f4"], ["y", "<f4"], ["z", "<f4", [2, 2]]]
161+
162+
Structured data types may also be nested, e.g., the following JSON
163+
list defines a data type with two fields "foo" and "bar", where "bar"
164+
has two sub-fields "baz" and "qux"::
165+
166+
[["foo", "<f4"], ["bar", [["baz", "<f4"], ["qux", "<i4"]]]]
167+
150168
.. _spec_v2_array_fill_value:
151169

152170
Fill value encoding
@@ -512,6 +530,10 @@ initially published to clarify ambiguities and add some missing information.
512530
either arrays or groups, and if absent then custom attributes should be treated as
513531
empty.
514532

533+
* The specification now describes how structured datatypes with
534+
subarray shapes and/or with nested structured data types are encoded
535+
in array metadata (:issue:`111`, :issue:`296`).
536+
515537

516538
Changes from version 1 to version 2
517539
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

zarr/core.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1568,7 +1568,11 @@ def _chunk_getitem(self, chunk_coords, chunk_selection, out, out_selection,
15681568
except KeyError:
15691569
# chunk not initialized
15701570
if self._fill_value is not None:
1571-
out[out_selection] = self._fill_value
1571+
if fields:
1572+
fill_value = self._fill_value[fields]
1573+
else:
1574+
fill_value = self._fill_value
1575+
out[out_selection] = fill_value
15721576

15731577
else:
15741578

zarr/meta.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -54,9 +54,12 @@ def decode_array_metadata(s):
5454

5555
def encode_array_metadata(meta):
5656
dtype = meta['dtype']
57+
sdshape = ()
58+
if dtype.subdtype is not None:
59+
dtype, sdshape = dtype.subdtype
5760
meta = dict(
5861
zarr_format=ZARR_FORMAT,
59-
shape=meta['shape'],
62+
shape=meta['shape'] + sdshape,
6063
chunks=meta['chunks'],
6164
dtype=encode_dtype(dtype),
6265
compressor=meta['compressor'],
@@ -83,10 +86,9 @@ def _decode_dtype_descr(d):
8386
# recurse to handle nested structures
8487
if PY2: # pragma: py3 no cover
8588
# under PY2 numpy rejects unicode field names
86-
d = [(f.encode('ascii'), _decode_dtype_descr(v))
87-
for f, v in d]
89+
d = [(k[0].encode("ascii"), _decode_dtype_descr(k[1])) + tuple(k[2:]) for k in d]
8890
else: # pragma: py2 no cover
89-
d = [(f, _decode_dtype_descr(v)) for f, v in d]
91+
d = [(k[0], _decode_dtype_descr(k[1])) + tuple(k[2:]) for k in d]
9092
return d
9193

9294

zarr/storage.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -329,8 +329,9 @@ def _init_array_metadata(store, shape, chunks=None, dtype=None, compressor='defa
329329
err_contains_group(path)
330330

331331
# normalize metadata
332-
shape = normalize_shape(shape)
333332
dtype, object_codec = normalize_dtype(dtype, object_codec)
333+
shape = normalize_shape(shape) + dtype.shape
334+
dtype = dtype.base
334335
chunks = normalize_chunks(chunks, shape, dtype.itemsize)
335336
order = normalize_order(order)
336337
fill_value = normalize_fill_value(fill_value, dtype)

zarr/tests/test_core.py

Lines changed: 79 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -846,34 +846,90 @@ def test_nchunks_initialized(self):
846846
z[:] = 42
847847
assert 10 == z.nchunks_initialized
848848

849-
def test_structured_array(self):
849+
def test_array_dtype_shape(self):
850850

851+
dt = "(2, 2)f4"
851852
# setup some data
852-
d = np.array([(b'aaa', 1, 4.2),
853-
(b'bbb', 2, 8.4),
854-
(b'ccc', 3, 12.6)],
855-
dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')])
853+
d = np.array([((0, 1),
854+
(1, 2)),
855+
((1, 2),
856+
(2, 3)),
857+
((2, 3),
858+
(3, 4))],
859+
dtype=dt)
860+
856861
for a in (d, d[:0]):
857-
for fill_value in None, b'', (b'zzz', 42, 16.8):
862+
for fill_value in None, 0:
863+
z = self.create_array(shape=a.shape[:-2], chunks=2, dtype=dt, fill_value=fill_value)
864+
assert len(a) == len(z)
865+
if fill_value is not None:
866+
assert fill_value == z.fill_value
867+
z[...] = a
868+
assert_array_equal(a, z[...])
869+
870+
def check_structured_array(self, d, fill_values):
871+
for a in (d, d[:0]):
872+
for fill_value in fill_values:
858873
z = self.create_array(shape=a.shape, chunks=2, dtype=a.dtype, fill_value=fill_value)
859874
assert len(a) == len(z)
875+
assert a.shape == z.shape
876+
assert a.dtype == z.dtype
877+
878+
# check use of fill value before array is initialised with data
860879
if fill_value is not None:
861880
if fill_value == b'':
862881
# numpy 1.14 compatibility
863882
np_fill_value = np.array(fill_value, dtype=a.dtype.str).view(a.dtype)[()]
864883
else:
865884
np_fill_value = np.array(fill_value, dtype=a.dtype)[()]
866885
assert np_fill_value == z.fill_value
867-
if len(z):
886+
if len(a):
868887
assert np_fill_value == z[0]
869888
assert np_fill_value == z[-1]
889+
empty = np.empty_like(a)
890+
empty[:] = np_fill_value
891+
assert empty[0] == z[0]
892+
assert_array_equal(empty[0:2], z[0:2])
893+
assert_array_equal(empty, z[...])
894+
for f in a.dtype.names:
895+
assert_array_equal(empty[f], z[f])
896+
897+
# store data in array
870898
z[...] = a
899+
900+
# check stored data
871901
if len(a):
872902
assert a[0] == z[0]
873-
assert_array_equal(a, z[...])
874-
assert_array_equal(a['foo'], z['foo'])
875-
assert_array_equal(a['bar'], z['bar'])
876-
assert_array_equal(a['baz'], z['baz'])
903+
assert a[-1] == z[-1]
904+
assert_array_equal(a[0:2], z[0:2])
905+
assert_array_equal(a, z[...])
906+
for f in a.dtype.names:
907+
assert_array_equal(a[f], z[f])
908+
909+
def test_structured_array(self):
910+
d = np.array([(b'aaa', 1, 4.2),
911+
(b'bbb', 2, 8.4),
912+
(b'ccc', 3, 12.6)],
913+
dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')])
914+
fill_values = None, b'', (b'zzz', 42, 16.8)
915+
self.check_structured_array(d, fill_values)
916+
917+
def test_structured_array_subshapes(self):
918+
d = np.array([(0, ((0, 1, 2), (1, 2, 3)), b'aaa'),
919+
(1, ((1, 2, 3), (2, 3, 4)), b'bbb'),
920+
(2, ((2, 3, 4), (3, 4, 5)), b'ccc')],
921+
dtype=[('foo', 'i8'), ('bar', '(2, 3)f4'), ('baz', 'S3')])
922+
fill_values = None, b'', (0, ((0, 0, 0), (1, 1, 1)), b'zzz')
923+
self.check_structured_array(d, fill_values)
924+
925+
def test_structured_array_nested(self):
926+
d = np.array([(0, (0, ((0, 1), (1, 2), (2, 3)), 0), b'aaa'),
927+
(1, (1, ((1, 2), (2, 3), (3, 4)), 1), b'bbb'),
928+
(2, (2, ((2, 3), (3, 4), (4, 5)), 2), b'ccc')],
929+
dtype=[('foo', 'i8'), ('bar', [('foo', 'i4'), ('bar', '(3, 2)f4'),
930+
('baz', 'u1')]), ('baz', 'S3')])
931+
fill_values = None, b'', (0, (0, ((0, 0), (1, 1), (2, 2)), 0), b'zzz')
932+
self.check_structured_array(d, fill_values)
877933

878934
def test_dtypes(self):
879935

@@ -1559,10 +1615,22 @@ def test_astype(self):
15591615
expected = data.astype(astype)
15601616
assert_array_equal(expected, z2)
15611617

1618+
def test_array_dtype_shape(self):
1619+
# skip this one, cannot do delta on unstructured array
1620+
pass
1621+
15621622
def test_structured_array(self):
15631623
# skip this one, cannot do delta on structured array
15641624
pass
15651625

1626+
def test_structured_array_subshapes(self):
1627+
# skip this one, cannot do delta on structured array
1628+
pass
1629+
1630+
def test_structured_array_nested(self):
1631+
# skip this one, cannot do delta on structured array
1632+
pass
1633+
15661634
def test_dtypes(self):
15671635
# skip this one, delta messes up floats
15681636
pass

zarr/tests/test_meta.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,88 @@ def test_encode_decode_array_2():
116116
assert [df.get_config()] == meta_dec['filters']
117117

118118

119+
def test_encode_decode_array_dtype_shape():
120+
121+
meta = dict(
122+
shape=(100,),
123+
chunks=(10,),
124+
dtype=np.dtype('(10, 10)f8'),
125+
compressor=Zlib(1).get_config(),
126+
fill_value=None,
127+
filters=None,
128+
order='C'
129+
)
130+
131+
meta_json = '''{
132+
"chunks": [10],
133+
"compressor": {"id": "zlib", "level": 1},
134+
"dtype": "<f8",
135+
"fill_value": null,
136+
"filters": null,
137+
"order": "C",
138+
"shape": [100, 10, 10],
139+
"zarr_format": %s
140+
}''' % ZARR_FORMAT
141+
142+
# test encoding
143+
meta_enc = encode_array_metadata(meta)
144+
assert_json_equal(meta_json, meta_enc)
145+
146+
# test decoding
147+
meta_dec = decode_array_metadata(meta_enc)
148+
assert ZARR_FORMAT == meta_dec['zarr_format']
149+
# to maintain consistency with numpy unstructured arrays, unpack dimensions into shape
150+
assert meta['shape'] + meta['dtype'].shape == meta_dec['shape']
151+
assert meta['chunks'] == meta_dec['chunks']
152+
# to maintain consistency with numpy unstructured arrays, unpack dtypes
153+
assert meta['dtype'].base == meta_dec['dtype']
154+
assert meta['compressor'] == meta_dec['compressor']
155+
assert meta['order'] == meta_dec['order']
156+
assert meta_dec['fill_value'] is None
157+
assert meta_dec['filters'] is None
158+
159+
160+
def test_encode_decode_array_structured():
161+
162+
meta = dict(
163+
shape=(100,),
164+
chunks=(10,),
165+
dtype=np.dtype('i8, (10, 10)f8, (5, 10, 15)u1'),
166+
compressor=Zlib(1).get_config(),
167+
fill_value=None,
168+
filters=None,
169+
order='C'
170+
)
171+
172+
meta_json = '''{
173+
"chunks": [10],
174+
"compressor": {"id": "zlib", "level": 1},
175+
"dtype": [["f0", "<i8"], ["f1", "<f8", [10, 10]], ["f2", "|u1", [5, 10, 15]]],
176+
"fill_value": null,
177+
"filters": null,
178+
"order": "C",
179+
"shape": [100],
180+
"zarr_format": %s
181+
}''' % ZARR_FORMAT
182+
183+
# test encoding
184+
meta_enc = encode_array_metadata(meta)
185+
assert_json_equal(meta_json, meta_enc)
186+
187+
# test decoding
188+
meta_dec = decode_array_metadata(meta_enc)
189+
assert ZARR_FORMAT == meta_dec['zarr_format']
190+
# to maintain consistency with numpy unstructured arrays, unpack dimensions into shape
191+
assert meta['shape'] + meta['dtype'].shape == meta_dec['shape']
192+
assert meta['chunks'] == meta_dec['chunks']
193+
# to maintain consistency with numpy unstructured arrays, unpack dtypes
194+
assert meta['dtype'].base == meta_dec['dtype']
195+
assert meta['compressor'] == meta_dec['compressor']
196+
assert meta['order'] == meta_dec['order']
197+
assert meta_dec['fill_value'] is None
198+
assert meta_dec['filters'] is None
199+
200+
119201
def test_encode_decode_fill_values_nan():
120202

121203
fills = (

0 commit comments

Comments
 (0)