Commit f49ae89

Merge branch 'json_metadata'
2 parents: 683e829 + 4dddb80

File tree

7 files changed: +288 -79 lines changed


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -67,3 +67,4 @@ zarr/version.py
 
 # test data
 *.zarr
+*~

PERSISTENCE.rst

Lines changed: 124 additions & 0 deletions
@@ -0,0 +1,124 @@
+zarr - Persistence
+==================
+
+This document describes the file organisation and formats used to save zarr
+arrays on disk.
+
+All data and metadata associated with a zarr array is stored within a
+directory on the file system. Within this directory there are a number
+of files and sub-directories storing different components of the data
+and metadata. Here I'll refer to a directory containing a zarr array
+as a root directory.
+
+Configuration metadata
+----------------------
+
+Within a root directory, a file called "__zmeta__" contains essential
+configuration metadata about the array. This comprises the shape of the
+array, the chunk shape, the data type (dtype), the compression library,
+the compression level, the shuffle filter and the default fill value for
+uninitialised portions of the array. The format of this file is JSON.
+
+Mandatory fields and allowed values are as follows:
+
+* ``shape`` - list of integers - the size of each dimension of the array
+* ``chunks`` - list of integers - the size of each dimension of a chunk, i.e., the chunk shape
+* ``dtype`` - string or list of lists - a description of the data type, following Numpy convention
+* ``fill_value`` - scalar value - value to use for uninitialised portions of the array
+* ``cname`` - string - name of the compression library used
+* ``clevel`` - integer - compression level
+* ``shuffle`` - integer - shuffle filter (0 = no shuffle, 1 = byte shuffle, 2 = bit shuffle)
+
+For example::
+
+    >>> import zarr
+    >>> z = zarr.open('example.zarr', mode='w', shape=(1000000, 1000),
+    ...               chunks=(10000, 100), dtype='i4', fill_value=42,
+    ...               cname='lz4', clevel=3, shuffle=1)
+    >>> print(open('example.zarr/__zmeta__').read())
+    {
+        "chunks": [
+            10000,
+            100
+        ],
+        "clevel": 3,
+        "cname": "lz4",
+        "dtype": "<i4",
+        "fill_value": 42,
+        "shape": [
+            1000000,
+            1000
+        ],
+        "shuffle": 1
+    }
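
Since "__zmeta__" is plain JSON, it can be read back and checked with the
standard library alone; a minimal sketch (the ``required`` set below simply
mirrors the field list above, it is not part of zarr)::

    >>> import json
    >>> with open('example.zarr/__zmeta__') as f:
    ...     meta = json.load(f)
    >>> meta['shape']
    [1000000, 1000]
    >>> required = {'shape', 'chunks', 'dtype', 'fill_value',
    ...             'cname', 'clevel', 'shuffle'}
    >>> required - set(meta)
    set()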
+
+User metadata (attributes)
+--------------------------
+
+Within a root directory, a file called "__zattr__" contains user
+metadata associated with the array, i.e., user attributes. The format
+of this file is JSON.
+
+For example::
+
+    >>> import zarr
+    >>> z = zarr.open('example.zarr', mode='w', shape=(1000000, 1000),
+    ...               chunks=(10000, 100), dtype='i4', fill_value=42,
+    ...               cname='lz4', clevel=3, shuffle=1)
+    >>> z.attrs['foo'] = 42
+    >>> z.attrs['bar'] = 4.2
+    >>> z.attrs['baz'] = 'quux'
+    >>> print(open('example.zarr/__zattr__').read())
+
+TODO add results above
+
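The attributes file reads back as ordinary JSON; a minimal sketch of the
expected round trip, assuming the attributes set above were stored as a flat
JSON object::

    >>> import json
    >>> with open('example.zarr/__zattr__') as f:
    ...     attrs = json.load(f)
    >>> attrs == {'foo': 42, 'bar': 4.2, 'baz': 'quux'}
    True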
+Array data
+----------
+
+Within a root directory, a sub-directory called "__zdata__" contains
+the array data. The array data is divided into chunks, each of which
+is compressed using the `blosc meta-compression library <TODO>`_. Each
+chunk is stored in a separate file.
+
+The chunk files are named according to the chunk indices. E.g., for a
+2-dimensional array with shape (100, 100) and chunk shape (10, 10)
+there will be 100 chunks in total. The file "0.0.blosc" stores data
+for the chunk with indices (0, 0) within chunk rows and columns
+respectively, i.e., the first chunk, containing data for the segment
+of the array that would be obtained by the slice ``z[0:10, 0:10]``;
+the file "4.2.blosc" stores the chunk in the fifth row, third column,
+containing data for the slice ``z[40:50, 20:30]``; etc.
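
The naming scheme is simple enough to reproduce outside zarr; a sketch with
two hypothetical helpers (``chunk_filename`` and ``chunk_slices`` are
illustrative, not part of zarr), where the last chunk along a dimension is
clipped to the array shape::

    def chunk_filename(cidx):
        # e.g. (4, 2) -> '4.2.blosc'
        return '.'.join(map(str, cidx)) + '.blosc'

    def chunk_slices(cidx, chunks, shape):
        # region of the array covered by the chunk with indices cidx
        return tuple(slice(i * c, min((i + 1) * c, s))
                     for i, c, s in zip(cidx, chunks, shape))

    assert chunk_filename((4, 2)) == '4.2.blosc'
    assert chunk_slices((4, 2), (10, 10), (100, 100)) == \
        (slice(40, 50), slice(20, 30))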
+
+Each chunk file is a binary file following the blosc version 1 format,
+comprising a 16 byte header followed by the compressed data. The
+header is organised as follows::
+
+    |-0-|-1-|-2-|-3-|-4-|-5-|-6-|-7-|-8-|-9-|-A-|-B-|-C-|-D-|-E-|-F-|
+      ^   ^   ^   ^ |     nbytes    |   blocksize   |    cbytes     |
+      |   |   |   |
+      |   |   |   +--typesize
+      |   |   +------flags
+      |   +----------blosclz version
+      +--------------blosc version
+
+For more details on the header, see the `C-Blosc header description
+<https://github.com/Blosc/c-blosc/blob/master/README_HEADER.rst>`_.
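
The header fields can be unpacked with the standard library ``struct``
module; a minimal sketch, assuming the little-endian layout described in the
C-Blosc documentation::

    import struct

    def read_blosc_header(path):
        # four single-byte fields, then three unsigned 32-bit
        # little-endian integers
        with open(path, 'rb') as f:
            (version, versionlz, flags, typesize,
             nbytes, blocksize, cbytes) = struct.unpack('<BBBBIII',
                                                        f.read(16))
        return {'version': version, 'versionlz': versionlz, 'flags': flags,
                'typesize': typesize, 'nbytes': nbytes,
                'blocksize': blocksize, 'cbytes': cbytes}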
+
+If a file does not exist on the file system for any given chunk in an
+array, that indicates the chunk has not been initialised, and the
+chunk should be interpreted as completely filled with whatever value
+has been configured as the fill value for the array. I.e., chunk files
+are not required to exist.
+
+For example::
+
+    >>> import zarr
+    >>> z = zarr.open('example.zarr', mode='w', shape=(1000000, 1000),
+    ...               chunks=(10000, 100), dtype='i4', fill_value=42,
+    ...               cname='lz4', clevel=3, shuffle=1)
+    >>> import os
+    >>> os.listdir('example.zarr/__zdata__')
+    []
+    >>> z[:] = 0
+    >>> sorted(os.listdir('example.zarr/__zdata__'))[:5]
+    ['0.0.blosc', '0.1.blosc', '0.2.blosc', '0.3.blosc', '0.4.blosc']
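
A chunk file can also be decoded without zarr; a sketch using the ``blosc``
Python package (an assumption here, it is not required by zarr itself) and
the example array above::

    import blosc
    import numpy as np

    with open('example.zarr/__zdata__/0.0.blosc', 'rb') as f:
        raw = f.read()
    # decompress the blosc frame, then view it using the configured
    # dtype and chunk shape
    chunk = np.frombuffer(blosc.decompress(raw), dtype='<i4')
    chunk = chunk.reshape(10000, 100)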

README.rst

Lines changed: 7 additions & 7 deletions
@@ -13,8 +13,8 @@ Python.
 Installation
 ------------
 
-Installation requires Numpy and Cython pre-installed. Currently only
-compatible with Python >= 3.4.
+Installation requires Numpy and Cython pre-installed. Can only be installed on
+Linux currently.
 
 Install from PyPI::
 
@@ -117,10 +117,6 @@ append data to any axis
 
     >>> a = np.arange(10000000, dtype='i4').reshape(10000, 1000)
     >>> z = zarr.array(a, chunks=(1000, 100))
-    >>> z
-    zarr.ext.SynchronizedArray((10000, 1000), int32, chunks=(1000, 100))
-    cname: blosclz; clevel: 5; shuffle: 1 (BYTESHUFFLE)
-    nbytes: 38.1M; cbytes: 2.0M; ratio: 19.3; initialized: 100/100
     >>> z.append(a+a)
    >>> z
    zarr.ext.SynchronizedArray((20000, 1000), int32, chunks=(1000, 100))
@@ -132,6 +128,9 @@ append data to any axis
     cname: blosclz; clevel: 5; shuffle: 1 (BYTESHUFFLE)
     nbytes: 152.6M; cbytes: 7.6M; ratio: 20.2; initialized: 400/400
 
+Persistence
+-----------
+
 Create a persistent array (data stored on disk)
 
 .. code-block:: python
@@ -160,7 +159,8 @@ If you're working with really big arrays, try the 'lazy' option
     nbytes: 3.6P; cbytes: 0; initialized: 0/1000000000
     mode: a; path: big.zarr
 
-Yes, that is 3.6 petabytes.
+See the `persistence documentation <PERSISTENCE.rst>`_ for more details of the
+file format.
 
 Tuning
 ------

zarr/ext.pyx

Lines changed: 40 additions & 64 deletions
@@ -12,7 +12,6 @@ import sys
 import os
 import struct
 import ctypes
-import pickle
 import shutil
 import tempfile
 from collections import namedtuple
@@ -21,8 +20,7 @@ import multiprocessing
 import fasteners
 
 
-from zarr import util as _util
-from zarr import defaults
+from zarr import util as _util, meta as _meta, defaults as _defaults
 
 
 ###############################################################################
@@ -163,21 +161,21 @@ def _normalize_cparams(cname=None, clevel=None, shuffle=None):
     """
 
     # determine compressor
-    cname = cname if cname is not None else defaults.cname
+    cname = cname if cname is not None else _defaults.cname
     if type(cname) != bytes:
-        cname = cname.encode()
+        cname = cname.encode('ascii')
     # check compressor is available
     if blosc_compname_to_compcode(cname) < 0:
         raise ValueError('compressor not available: %s' % cname)
 
     # determine compression level
-    clevel = clevel if clevel is not None else defaults.clevel
+    clevel = clevel if clevel is not None else _defaults.clevel
     clevel = int(clevel)
     if clevel < 0 or clevel > 9:
         raise ValueError('invalid compression level: %s' % clevel)
 
     # determine shuffle filter
-    shuffle = shuffle if shuffle is not None else defaults.shuffle
+    shuffle = shuffle if shuffle is not None else _defaults.shuffle
     shuffle = int(shuffle)
     if shuffle not in [0, 1, 2]:
         raise ValueError('invalid shuffle: %s' % shuffle)
@@ -747,28 +745,6 @@ def _normalize_chunks(chunks, tuple shape):
     return chunks
 
 
-def _read_array_metadata(path):
-
-    # check path exists
-    if not os.path.exists(path):
-        raise ValueError('path not found: %s' % path)
-
-    # check metadata file
-    meta_path = os.path.join(path, defaults.metapath)
-    if not os.path.exists(meta_path):
-        raise ValueError('array metadata not found: %s' % path)
-
-    with open(meta_path, 'rb') as f:
-        meta = pickle.load(f)
-    return meta
-
-
-def _write_array_metadata(path, meta):
-    meta_path = os.path.join(path, defaults.metapath)
-    with open(meta_path, 'wb') as f:
-        pickle.dump(meta, f, protocol=0)
-
-
 def _array_resize(BaseArray array, *args):
 
     # normalize new shape argument
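
The removed pickle-based helpers are replaced by calls into the new
``zarr.meta`` module, which is among the 7 changed files but not shown in
this excerpt. A hypothetical sketch of what ``read_array_metadata`` and
``write_array_metadata`` might look like, based on the JSON format documented
in PERSISTENCE.rst (simple dtypes only; structured dtypes would need
``np.dtype.descr`` handling)::

    import json
    import os

    import numpy as np

    from zarr import defaults  # metadata filename, '__zmeta__' per the docs


    def write_array_metadata(path, shape=None, chunks=None, dtype=None,
                             cname=None, clevel=None, shuffle=None,
                             fill_value=None):
        # store everything as plain JSON types
        meta = {'shape': list(shape),
                'chunks': list(chunks),
                'dtype': np.dtype(dtype).str,
                'cname': (cname.decode('ascii')
                          if isinstance(cname, bytes) else cname),
                'clevel': clevel,
                'shuffle': shuffle,
                'fill_value': fill_value}
        with open(os.path.join(path, defaults.metapath), 'w') as f:
            json.dump(meta, f, indent=4, sort_keys=True)


    def read_array_metadata(path):
        meta_path = os.path.join(path, defaults.metapath)
        if not os.path.exists(meta_path):
            raise ValueError('array metadata not found: %s' % path)
        with open(meta_path) as f:
            meta = json.load(f)
        # restore the types the rest of the code expects
        meta['shape'] = tuple(meta['shape'])
        meta['chunks'] = tuple(meta['chunks'])
        meta['dtype'] = np.dtype(meta['dtype'])
        return meta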
@@ -1216,7 +1192,7 @@ cdef class PersistentArray(BaseArray):
         # a : read/write if exists, create otherwise (default)
 
         # use metadata file as indicator of array existence
-        meta_path = os.path.join(path, defaults.metapath)
+        meta_path = os.path.join(path, _defaults.metapath)
 
         if mode in ['r', 'r+']:
             self._open(path, **kwargs)
@@ -1264,7 +1240,7 @@
                  cname=None, clevel=None, shuffle=None, fill_value=None):
 
         # create directories
-        data_path = os.path.join(path, defaults.datapath)
+        data_path = os.path.join(path, _defaults.datapath)
         if not os.path.exists(data_path):
             os.makedirs(data_path)
 
@@ -1277,20 +1253,20 @@
         self._fill_value = fill_value
 
         # write metadata
-        metadata = {'shape': self._shape,
-                    'chunks': self._chunks,
-                    'dtype': self._dtype,
-                    'cname': self._cname,
-                    'clevel': self._clevel,
-                    'shuffle': self._shuffle,
-                    'fill_value': self._fill_value}
-        _write_array_metadata(path, metadata)
+        _meta.write_array_metadata(path,
+                                   shape=self._shape,
+                                   chunks=self._chunks,
+                                   dtype=self._dtype,
+                                   cname=self._cname,
+                                   clevel=self._clevel,
+                                   shuffle=self._shuffle,
+                                   fill_value=self._fill_value)
 
     def _open(self, path, shape=None, chunks=None, dtype=None, cname=None,
               clevel=None, shuffle=None, fill_value=None):
 
         # read metadata
-        metadata = _read_array_metadata(path)
+        metadata = _meta.read_array_metadata(path)
 
         # set attributes
         self._shape = metadata['shape']
@@ -1327,8 +1303,8 @@
             return self._cdata[cidx]
 
     cdef object get_chunk_path(self, tuple cidx):
-        chunk_filename = '.'.join(map(str, cidx)) + defaults.datasuffix
-        chunk_path = os.path.join(self._path, defaults.datapath,
+        chunk_filename = '.'.join(map(str, cidx)) + _defaults.datasuffix
+        chunk_path = os.path.join(self._path, _defaults.datapath,
                                   chunk_filename)
         return chunk_path
 
@@ -1347,14 +1323,14 @@
         _array_resize(self, *args)
 
         # write metadata
-        metadata = {'shape': self._shape,
-                    'chunks': self._chunks,
-                    'dtype': self._dtype,
-                    'cname': self._cname,
-                    'clevel': self._clevel,
-                    'shuffle': self._shuffle,
-                    'fill_value': self._fill_value}
-        _write_array_metadata(self._path, metadata)
+        _meta.write_array_metadata(self._path,
+                                   shape=self._shape,
+                                   chunks=self._chunks,
+                                   dtype=self._dtype,
+                                   cname=self._cname,
+                                   clevel=self._clevel,
+                                   shuffle=self._shuffle,
+                                   fill_value=self._fill_value)
 
     def __setitem__(self, key, value):
         if self._mode == 'r':
@@ -1534,18 +1510,18 @@ cdef class LazyPersistentArray(PersistentArray):
         def __get__(self):
             # N.B., chunk objects are instantiated lazily, so there may be
             # data on disk but no corresponding chunk object yet
-            data_dir = os.path.join(self._path, defaults.datapath)
+            data_dir = os.path.join(self._path, _defaults.datapath)
             return sum(os.path.getsize(os.path.join(data_dir, fn))
                        for fn in os.listdir(data_dir))
 
     property is_initialized:
         def __get__(self):
             # N.B., chunk objects are instantiated lazily, so there may be
             # data on disk but no corresponding chunk object yet
-            data_dir = os.path.join(self._path, defaults.datapath)
+            data_dir = os.path.join(self._path, _defaults.datapath)
             a = np.zeros(self._cdata_shape, dtype='b1')
-            for fn in glob(os.path.join(data_dir, '*' + defaults.datasuffix)):
-                bn = os.path.basename(fn)[:-len(defaults.datasuffix)]
+            for fn in glob(os.path.join(data_dir, '*' + _defaults.datasuffix)):
+                bn = os.path.basename(fn)[:-len(_defaults.datasuffix)]
                 cidx = tuple(map(int, bn.split('.')))
                 a[cidx] = True
             return a
@@ -1587,14 +1563,14 @@
         _lazy_resize(self, *args)
 
         # write metadata
-        metadata = {'shape': self._shape,
-                    'chunks': self._chunks,
-                    'dtype': self._dtype,
-                    'cname': self._cname,
-                    'clevel': self._clevel,
-                    'shuffle': self._shuffle,
-                    'fill_value': self._fill_value}
-        _write_array_metadata(self._path, metadata)
+        _meta.write_array_metadata(self._path,
+                                   shape=self._shape,
+                                   chunks=self._chunks,
+                                   dtype=self._dtype,
+                                   cname=self._cname,
+                                   clevel=self._clevel,
+                                   shuffle=self._shuffle,
+                                   fill_value=self._fill_value)
 
 
 # noinspection PyAbstractClass
@@ -1609,8 +1585,8 @@ cdef class SynchronizedLazyPersistentArray(LazyPersistentArray):
         return _lazy_get_chunk(self, cidx)
 
     cdef BaseChunk create_chunk(self, tuple cidx):
-        chunk_filename = '.'.join(map(str, cidx)) + defaults.datasuffix
-        chunk_path = os.path.join(self._path, defaults.datapath,
+        chunk_filename = '.'.join(map(str, cidx)) + _defaults.datasuffix
+        chunk_path = os.path.join(self._path, _defaults.datapath,
                                   chunk_filename)
         return SynchronizedPersistentChunk(
             path=chunk_path, shape=self._chunks, dtype=self._dtype,
