Commit f49ae89

Merge branch 'json_metadata'
2 parents: 683e829 + 4dddb80

File tree

7 files changed: +288 -79 lines changed


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -67,3 +67,4 @@ zarr/version.py
 
 # test data
 *.zarr
+*~

PERSISTENCE.rst

Lines changed: 124 additions & 0 deletions
@@ -0,0 +1,124 @@
+zarr - Persistence
+==================
+
+This document describes the file organisation and formats used to save zarr
+arrays on disk.
+
+All data and metadata associated with a zarr array is stored within a
+directory on the file system. Within this directory there are a number
+of files and sub-directories storing different components of the data
+and metadata. Here I'll refer to a directory containing a zarr array
+as a root directory.
+
+Configuration metadata
+----------------------
+
+Within a root directory, a file called "__zmeta__" contains essential
+configuration metadata about the array. This comprises the shape of the
+array, the chunk shape, the data type (dtype), the compression library,
+the compression level, the shuffle filter and the default fill value for
+uninitialised portions of the array. The format of this file is JSON.
+
+Mandatory fields and allowed values are as follows:
+
+* ``shape`` - list of integers - the size of each dimension of the array
+* ``chunks`` - list of integers - the size of each dimension of a chunk, i.e., the chunk shape
+* ``dtype`` - string or list of lists - a description of the data type, following Numpy convention
+* ``fill_value`` - scalar value - value to use for uninitialised portions of the array
+* ``cname`` - string - name of the compression library used
+* ``clevel`` - integer - compression level
+* ``shuffle`` - integer - shuffle filter (0 = no shuffle, 1 = byte shuffle, 2 = bit shuffle)
+
+For example::
+
+    >>> import zarr
+    >>> z = zarr.open('example.zarr', mode='w', shape=(1000000, 1000),
+    ...               chunks=(10000, 100), dtype='i4', fill_value=42,
+    ...               cname='lz4', clevel=3, shuffle=1)
+    >>> print(open('example.zarr/__zmeta__').read())
+    {
+        "chunks": [
+            10000,
+            100
+        ],
+        "clevel": 3,
+        "cname": "lz4",
+        "dtype": "<i4",
+        "fill_value": 42,
+        "shape": [
+            1000000,
+            1000
+        ],
+        "shuffle": 1
+    }
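
Since "__zmeta__" is plain JSON, it can be read back and checked with the
standard library alone; a minimal sketch (the ``required`` set below simply
mirrors the field list above, it is not part of zarr)::

    >>> import json
    >>> with open('example.zarr/__zmeta__') as f:
    ...     meta = json.load(f)
    >>> meta['shape']
    [1000000, 1000]
    >>> required = {'shape', 'chunks', 'dtype', 'fill_value',
    ...             'cname', 'clevel', 'shuffle'}
    >>> required - set(meta)
    set()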
+
+User metadata (attributes)
+--------------------------
+
+Within a root directory, a file called "__zattr__" contains user
+metadata associated with the array, i.e., user attributes. The format
+of this file is JSON.
+
+For example::
+
+    >>> import zarr
+    >>> z = zarr.open('example.zarr', mode='w', shape=(1000000, 1000),
+    ...               chunks=(10000, 100), dtype='i4', fill_value=42,
+    ...               cname='lz4', clevel=3, shuffle=1)
+    >>> z.attrs['foo'] = 42
+    >>> z.attrs['bar'] = 4.2
+    >>> z.attrs['baz'] = 'quux'
+    >>> print(open('example.zarr/__zattr__').read())
+
+TODO add results above
+
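The attributes file reads back as ordinary JSON; a minimal sketch of the
expected round trip, assuming the attributes set above were stored as a flat
JSON object::

    >>> import json
    >>> with open('example.zarr/__zattr__') as f:
    ...     attrs = json.load(f)
    >>> attrs == {'foo': 42, 'bar': 4.2, 'baz': 'quux'}
    True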
+Array data
+----------
+
+Within a root directory, a sub-directory called "__zdata__" contains
+the array data. The array data is divided into chunks, each of which
+is compressed using the `blosc meta-compression library <TODO>`_. Each
+chunk is stored in a separate file.
+
+The chunk files are named according to the chunk indices. E.g., for a
+2-dimensional array with shape (100, 100) and chunk shape (10, 10)
+there will be 100 chunks in total. The file "0.0.blosc" stores data
+for the chunk with indices (0, 0) within chunk rows and columns
+respectively, i.e., the first chunk, containing data for the segment
+of the array that would be obtained by the slice ``z[0:10, 0:10]``;
+the file "4.2.blosc" stores the chunk in the fifth row, third column,
+containing data for the slice ``z[40:50, 20:30]``; etc.
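
The naming scheme is simple enough to reproduce outside zarr; a sketch with
two hypothetical helpers (``chunk_filename`` and ``chunk_slices`` are
illustrative, not part of zarr), where the last chunk along a dimension is
clipped to the array shape::

    def chunk_filename(cidx):
        # e.g. (4, 2) -> '4.2.blosc'
        return '.'.join(map(str, cidx)) + '.blosc'

    def chunk_slices(cidx, chunks, shape):
        # region of the array covered by the chunk with indices cidx
        return tuple(slice(i * c, min((i + 1) * c, s))
                     for i, c, s in zip(cidx, chunks, shape))

    assert chunk_filename((4, 2)) == '4.2.blosc'
    assert chunk_slices((4, 2), (10, 10), (100, 100)) == \
        (slice(40, 50), slice(20, 30))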
+
+Each chunk file is a binary file following the blosc version 1 format,
+comprising a 16 byte header followed by the compressed data. The
+header is organised as follows::
+
+    |-0-|-1-|-2-|-3-|-4-|-5-|-6-|-7-|-8-|-9-|-A-|-B-|-C-|-D-|-E-|-F-|
+      ^   ^   ^   ^ |     nbytes    |   blocksize   |    cbytes     |
+      |   |   |   |
+      |   |   |   +--typesize
+      |   |   +------flags
+      |   +----------blosclz version
+      +--------------blosc version
+
+For more details on the header, see the `C-Blosc header description
+<https://github.com/Blosc/c-blosc/blob/master/README_HEADER.rst>`_.
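
The header fields can be unpacked with the standard library ``struct``
module; a minimal sketch, assuming the little-endian layout described in the
C-Blosc documentation::

    import struct

    def read_blosc_header(path):
        # four single-byte fields, then three unsigned 32-bit
        # little-endian integers
        with open(path, 'rb') as f:
            (version, versionlz, flags, typesize,
             nbytes, blocksize, cbytes) = struct.unpack('<BBBBIII',
                                                        f.read(16))
        return {'version': version, 'versionlz': versionlz, 'flags': flags,
                'typesize': typesize, 'nbytes': nbytes,
                'blocksize': blocksize, 'cbytes': cbytes}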
+
+If a file does not exist on the file system for any given chunk in an
+array, that indicates the chunk has not been initialised, and the
+chunk should be interpreted as completely filled with whatever value
+has been configured as the fill value for the array. I.e., chunk files
+are not required to exist.
+
+For example::
+
+    >>> import zarr
+    >>> z = zarr.open('example.zarr', mode='w', shape=(1000000, 1000),
+    ...               chunks=(10000, 100), dtype='i4', fill_value=42,
+    ...               cname='lz4', clevel=3, shuffle=1)
+    >>> import os
+    >>> os.listdir('example.zarr/__zdata__')
+    []
+    >>> z[:] = 0
+    >>> sorted(os.listdir('example.zarr/__zdata__'))[:5]
+    ['0.0.blosc', '0.1.blosc', '0.2.blosc', '0.3.blosc', '0.4.blosc']
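
A chunk file can also be decoded without zarr; a sketch using the ``blosc``
Python package (an assumption here, it is not required by zarr itself) and
the example array above::

    import blosc
    import numpy as np

    with open('example.zarr/__zdata__/0.0.blosc', 'rb') as f:
        raw = f.read()
    # decompress the blosc frame, then view it using the configured
    # dtype and chunk shape
    chunk = np.frombuffer(blosc.decompress(raw), dtype='<i4')
    chunk = chunk.reshape(10000, 100)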

README.rst

Lines changed: 7 additions & 7 deletions
@@ -13,8 +13,8 @@ Python.
 Installation
 ------------
 
-Installation requires Numpy and Cython pre-installed. Currently only
-compatible with Python >= 3.4.
+Installation requires Numpy and Cython pre-installed. Can only be installed on
+Linux currently.
 
 Install from PyPI::
 
@@ -117,10 +117,6 @@ append data to any axis
 
     >>> a = np.arange(10000000, dtype='i4').reshape(10000, 1000)
     >>> z = zarr.array(a, chunks=(1000, 100))
-    >>> z
-    zarr.ext.SynchronizedArray((10000, 1000), int32, chunks=(1000, 100))
-    cname: blosclz; clevel: 5; shuffle: 1 (BYTESHUFFLE)
-    nbytes: 38.1M; cbytes: 2.0M; ratio: 19.3; initialized: 100/100
     >>> z.append(a+a)
    >>> z
    zarr.ext.SynchronizedArray((20000, 1000), int32, chunks=(1000, 100))
@@ -132,6 +128,9 @@ append data to any axis
     cname: blosclz; clevel: 5; shuffle: 1 (BYTESHUFFLE)
     nbytes: 152.6M; cbytes: 7.6M; ratio: 20.2; initialized: 400/400
 
+Persistence
+-----------
+
 Create a persistent array (data stored on disk)
 
 .. code-block:: python
@@ -160,7 +159,8 @@ If you're working with really big arrays, try the 'lazy' option
     nbytes: 3.6P; cbytes: 0; initialized: 0/1000000000
     mode: a; path: big.zarr
 
-Yes, that is 3.6 petabytes.
+See the `persistence documentation <PERSISTENCE.rst>`_ for more details of the
+file format.
 
 Tuning
 ------

zarr/ext.pyx

Lines changed: 40 additions & 64 deletions
@@ -12,7 +12,6 @@ import sys
 import os
 import struct
 import ctypes
-import pickle
 import shutil
 import tempfile
 from collections import namedtuple
@@ -21,8 +20,7 @@ import multiprocessing
 import fasteners
 
 
-from zarr import util as _util
-from zarr import defaults
+from zarr import util as _util, meta as _meta, defaults as _defaults
 
 
 ###############################################################################
@@ -163,21 +161,21 @@ def _normalize_cparams(cname=None, clevel=None, shuffle=None):
     """
 
     # determine compressor
-    cname = cname if cname is not None else defaults.cname
+    cname = cname if cname is not None else _defaults.cname
     if type(cname) != bytes:
-        cname = cname.encode()
+        cname = cname.encode('ascii')
     # check compressor is available
     if blosc_compname_to_compcode(cname) < 0:
         raise ValueError('compressor not available: %s' % cname)
 
     # determine compression level
-    clevel = clevel if clevel is not None else defaults.clevel
+    clevel = clevel if clevel is not None else _defaults.clevel
     clevel = int(clevel)
     if clevel < 0 or clevel > 9:
         raise ValueError('invalid compression level: %s' % clevel)
 
     # determine shuffle filter
-    shuffle = shuffle if shuffle is not None else defaults.shuffle
+    shuffle = shuffle if shuffle is not None else _defaults.shuffle
     shuffle = int(shuffle)
     if shuffle not in [0, 1, 2]:
         raise ValueError('invalid shuffle: %s' % shuffle)
@@ -747,28 +745,6 @@ def _normalize_chunks(chunks, tuple shape):
     return chunks
 
 
-def _read_array_metadata(path):
-
-    # check path exists
-    if not os.path.exists(path):
-        raise ValueError('path not found: %s' % path)
-
-    # check metadata file
-    meta_path = os.path.join(path, defaults.metapath)
-    if not os.path.exists(meta_path):
-        raise ValueError('array metadata not found: %s' % path)
-
-    with open(meta_path, 'rb') as f:
-        meta = pickle.load(f)
-    return meta
-
-
-def _write_array_metadata(path, meta):
-    meta_path = os.path.join(path, defaults.metapath)
-    with open(meta_path, 'wb') as f:
-        pickle.dump(meta, f, protocol=0)
-
-
 def _array_resize(BaseArray array, *args):
 
     # normalize new shape argument
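
The removed pickle-based helpers are replaced by calls into the new
``zarr.meta`` module, which is among the 7 changed files but not shown in
this excerpt. A hypothetical sketch of what ``read_array_metadata`` and
``write_array_metadata`` might look like, based on the JSON format documented
in PERSISTENCE.rst (simple dtypes only; structured dtypes would need
``np.dtype.descr`` handling)::

    import json
    import os

    import numpy as np

    from zarr import defaults  # metadata filename, '__zmeta__' per the docs


    def write_array_metadata(path, shape=None, chunks=None, dtype=None,
                             cname=None, clevel=None, shuffle=None,
                             fill_value=None):
        # store everything as plain JSON types
        meta = {'shape': list(shape),
                'chunks': list(chunks),
                'dtype': np.dtype(dtype).str,
                'cname': (cname.decode('ascii')
                          if isinstance(cname, bytes) else cname),
                'clevel': clevel,
                'shuffle': shuffle,
                'fill_value': fill_value}
        with open(os.path.join(path, defaults.metapath), 'w') as f:
            json.dump(meta, f, indent=4, sort_keys=True)


    def read_array_metadata(path):
        meta_path = os.path.join(path, defaults.metapath)
        if not os.path.exists(meta_path):
            raise ValueError('array metadata not found: %s' % path)
        with open(meta_path) as f:
            meta = json.load(f)
        # restore the types the rest of the code expects
        meta['shape'] = tuple(meta['shape'])
        meta['chunks'] = tuple(meta['chunks'])
        meta['dtype'] = np.dtype(meta['dtype'])
        return meta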
@@ -1216,7 +1192,7 @@ cdef class PersistentArray(BaseArray):
         # a : read/write if exists, create otherwise (default)
 
         # use metadata file as indicator of array existence
-        meta_path = os.path.join(path, defaults.metapath)
+        meta_path = os.path.join(path, _defaults.metapath)
 
         if mode in ['r', 'r+']:
             self._open(path, **kwargs)
@@ -1264,7 +1240,7 @@
                  cname=None, clevel=None, shuffle=None, fill_value=None):
 
         # create directories
-        data_path = os.path.join(path, defaults.datapath)
+        data_path = os.path.join(path, _defaults.datapath)
         if not os.path.exists(data_path):
             os.makedirs(data_path)
 
@@ -1277,20 +1253,20 @@
         self._fill_value = fill_value
 
         # write metadata
-        metadata = {'shape': self._shape,
-                    'chunks': self._chunks,
-                    'dtype': self._dtype,
-                    'cname': self._cname,
-                    'clevel': self._clevel,
-                    'shuffle': self._shuffle,
-                    'fill_value': self._fill_value}
-        _write_array_metadata(path, metadata)
+        _meta.write_array_metadata(path,
+                                   shape=self._shape,
+                                   chunks=self._chunks,
+                                   dtype=self._dtype,
+                                   cname=self._cname,
+                                   clevel=self._clevel,
+                                   shuffle=self._shuffle,
+                                   fill_value=self._fill_value)
 
     def _open(self, path, shape=None, chunks=None, dtype=None, cname=None,
               clevel=None, shuffle=None, fill_value=None):
 
         # read metadata
-        metadata = _read_array_metadata(path)
+        metadata = _meta.read_array_metadata(path)
 
         # set attributes
         self._shape = metadata['shape']
@@ -1327,8 +1303,8 @@
             return self._cdata[cidx]
 
     cdef object get_chunk_path(self, tuple cidx):
-        chunk_filename = '.'.join(map(str, cidx)) + defaults.datasuffix
-        chunk_path = os.path.join(self._path, defaults.datapath,
+        chunk_filename = '.'.join(map(str, cidx)) + _defaults.datasuffix
+        chunk_path = os.path.join(self._path, _defaults.datapath,
                                   chunk_filename)
         return chunk_path
 
@@ -1347,14 +1323,14 @@
         _array_resize(self, *args)
 
         # write metadata
-        metadata = {'shape': self._shape,
-                    'chunks': self._chunks,
-                    'dtype': self._dtype,
-                    'cname': self._cname,
-                    'clevel': self._clevel,
-                    'shuffle': self._shuffle,
-                    'fill_value': self._fill_value}
-        _write_array_metadata(self._path, metadata)
+        _meta.write_array_metadata(self._path,
+                                   shape=self._shape,
+                                   chunks=self._chunks,
+                                   dtype=self._dtype,
+                                   cname=self._cname,
+                                   clevel=self._clevel,
+                                   shuffle=self._shuffle,
+                                   fill_value=self._fill_value)
 
     def __setitem__(self, key, value):
         if self._mode == 'r':
@@ -1534,18 +1510,18 @@ cdef class LazyPersistentArray(PersistentArray):
         def __get__(self):
             # N.B., chunk objects are instantiated lazily, so there may be
             # data on disk but no corresponding chunk object yet
-            data_dir = os.path.join(self._path, defaults.datapath)
+            data_dir = os.path.join(self._path, _defaults.datapath)
             return sum(os.path.getsize(os.path.join(data_dir, fn))
                        for fn in os.listdir(data_dir))
 
     property is_initialized:
         def __get__(self):
             # N.B., chunk objects are instantiated lazily, so there may be
             # data on disk but no corresponding chunk object yet
-            data_dir = os.path.join(self._path, defaults.datapath)
+            data_dir = os.path.join(self._path, _defaults.datapath)
             a = np.zeros(self._cdata_shape, dtype='b1')
-            for fn in glob(os.path.join(data_dir, '*' + defaults.datasuffix)):
-                bn = os.path.basename(fn)[:-len(defaults.datasuffix)]
+            for fn in glob(os.path.join(data_dir, '*' + _defaults.datasuffix)):
+                bn = os.path.basename(fn)[:-len(_defaults.datasuffix)]
                 cidx = tuple(map(int, bn.split('.')))
                 a[cidx] = True
             return a
@@ -1587,14 +1563,14 @@
         _lazy_resize(self, *args)
 
         # write metadata
-        metadata = {'shape': self._shape,
-                    'chunks': self._chunks,
-                    'dtype': self._dtype,
-                    'cname': self._cname,
-                    'clevel': self._clevel,
-                    'shuffle': self._shuffle,
-                    'fill_value': self._fill_value}
-        _write_array_metadata(self._path, metadata)
+        _meta.write_array_metadata(self._path,
+                                   shape=self._shape,
+                                   chunks=self._chunks,
+                                   dtype=self._dtype,
+                                   cname=self._cname,
+                                   clevel=self._clevel,
+                                   shuffle=self._shuffle,
+                                   fill_value=self._fill_value)
 
 
 # noinspection PyAbstractClass
@@ -1609,8 +1585,8 @@ cdef class SynchronizedLazyPersistentArray(LazyPersistentArray):
         return _lazy_get_chunk(self, cidx)
 
     cdef BaseChunk create_chunk(self, tuple cidx):
-        chunk_filename = '.'.join(map(str, cidx)) + defaults.datasuffix
-        chunk_path = os.path.join(self._path, defaults.datapath,
+        chunk_filename = '.'.join(map(str, cidx)) + _defaults.datasuffix
+        chunk_path = os.path.join(self._path, _defaults.datapath,
                                   chunk_filename)
         return SynchronizedPersistentChunk(
             path=chunk_path, shape=self._chunks, dtype=self._dtype,
