doco

alimanfoo · alimanfoo · commit 08048d33a5bf · 2015-12-17T18:17:42.000Z
diff --git a/README.md b/README.md
@@ -82,7 +82,7 @@ over data. Try it with [dask.array](http://dask.pydata.org/en/latest/array.html)
 of the same size or larger than chunks. It is not and will never be 
 optimised for single item access. 
 
-Chunks sizes > 1M are generally good. Optimal chunk shape will depend on 
+Chunk sizes >= 1M are generally good. Optimal chunk shape will depend on 
 the correlation structure in your data.
 
 ## Acknowledgments
diff --git a/setup.py b/setup.py
@@ -34,11 +34,16 @@
               ),
 ])
 
+description = 'A minimal implementation of chunked, compressed, ' \
+              'N-dimensional arrays for Python.'
+
+with open('README.md') as f:
+    long_description = f.read()
 
 setup(
     name='zarr',
-    description='A minimal implementation of chunked, compressed, N-dimensional arrays',
-    long_description='TODO',
+    description=description,
+    long_description=long_description,
     use_scm_version={
         'version_scheme': 'guess-next-dev',
         'local_scheme': 'dirty-tag',
diff --git a/zarr/core.py b/zarr/core.py
@@ -10,7 +10,33 @@
 
 def empty(shape, chunks, dtype=None, cname=None, clevel=None, shuffle=None,
           synchronized=True):
-    """TODO"""
+    """Create an empty array.
+
+    Parameters
+    ----------
+    shape : int or tuple of ints
+        Array shape.
+    chunks : int or tuple of ints
+        Chunk shape.
+    dtype : string or dtype, optional
+        NumPy dtype.
+    cname : string, optional
+        Name of compression library to use, e.g., 'blosclz', 'lz4', 'zlib',
+        'snappy'.
+    clevel : int, optional
+        Compression level, 0 means no compression.
+    shuffle : int, optional
+        Shuffle filter, 0 means no shuffle, 1 means byte shuffle, 2 means
+        bit shuffle.
+    synchronized : bool, optional
+        If True, each chunk will be protected with a lock to prevent data
+        collision during write operations.
+
+    Returns
+    -------
+    z : zarr.ext.Array
+
+    """
 
     return _ext.Array(shape, chunks=chunks, dtype=dtype, cname=cname,
                       clevel=clevel, shuffle=shuffle,
@@ -19,16 +45,69 @@ def empty(shape, chunks, dtype=None, cname=None, clevel=None, shuffle=None,
 
 def zeros(shape, chunks, dtype=None, cname=None, clevel=None, shuffle=None,
           synchronized=True):
-    """TODO"""
+    """Create an array filled with zeros.
+
+    Parameters
+    ----------
+    shape : int or tuple of ints
+        Array shape.
+    chunks : int or tuple of ints
+        Chunk shape.
+    dtype : string or dtype, optional
+        NumPy dtype.
+    cname : string, optional
+        Name of compression library to use, e.g., 'blosclz', 'lz4', 'zlib',
+        'snappy'.
+    clevel : int, optional
+        Compression level, 0 means no compression.
+    shuffle : int, optional
+        Shuffle filter, 0 means no shuffle, 1 means byte shuffle, 2 means
+        bit shuffle.
+    synchronized : bool, optional
+        If True, each chunk will be protected with a lock to prevent data
+        collision during write operations.
+
+    Returns
+    -------
+    z : zarr.ext.Array
+
+    """
 
     return _ext.Array(shape, chunks=chunks, dtype=dtype, cname=cname,
                       clevel=clevel, shuffle=shuffle, fill_value=0,
                       synchronized=synchronized)
 
 
 def ones(shape, chunks, dtype=None, cname=None, clevel=None, shuffle=None,
-          synchronized=True):
-    """TODO"""
+         synchronized=True):
+    """Create an array filled with ones.
+
+    Parameters
+    ----------
+    shape : int or tuple of ints
+        Array shape.
+    chunks : int or tuple of ints
+        Chunk shape.
+    dtype : string or dtype, optional
+        NumPy dtype.
+    cname : string, optional
+        Name of compression library to use, e.g., 'blosclz', 'lz4', 'zlib',
+        'snappy'.
+    clevel : int, optional
+        Compression level, 0 means no compression.
+    shuffle : int, optional
+        Shuffle filter, 0 means no shuffle, 1 means byte shuffle, 2 means
+        bit shuffle.
+    synchronized : bool, optional
+        If True, each chunk will be protected with a lock to prevent data
+        collision during write operations.
+
+    Returns
+    -------
+    z : zarr.ext.Array
+
+    """
+
 
     return _ext.Array(shape, chunks=chunks, dtype=dtype, cname=cname,
                       clevel=clevel, shuffle=shuffle, fill_value=1,
@@ -37,7 +116,35 @@ def ones(shape, chunks, dtype=None, cname=None, clevel=None, shuffle=None,
 
 def full(shape, chunks, fill_value, dtype=None, cname=None, clevel=None,
          shuffle=None, synchronized=True):
-    """TODO"""
+    """Create an array filled with `fill_value`.
+
+    Parameters
+    ----------
+    shape : int or tuple of ints
+        Array shape.
+    chunks : int or tuple of ints
+        Chunk shape.
+    fill_value : object
+        Default value to use for uninitialised portions of the array.
+    dtype : string or dtype, optional
+        NumPy dtype.
+    cname : string, optional
+        Name of compression library to use, e.g., 'blosclz', 'lz4', 'zlib',
+        'snappy'.
+    clevel : int, optional
+        Compression level, 0 means no compression.
+    shuffle : int, optional
+        Shuffle filter, 0 means no shuffle, 1 means byte shuffle, 2 means
+        bit shuffle.
+    synchronized : bool, optional
+        If True, each chunk will be protected with a lock to prevent data
+        collision during write operations.
+
+    Returns
+    -------
+    z : zarr.ext.Array
+
+    """
 
     return _ext.Array(shape, chunks=chunks, dtype=dtype, cname=cname,
                       clevel=clevel, shuffle=shuffle, fill_value=fill_value,
@@ -46,7 +153,33 @@ def full(shape, chunks, fill_value, dtype=None, cname=None, clevel=None,
 
 def array(data, chunks=None, dtype=None, cname=None, clevel=None,
           shuffle=None, synchronized=True, fill_value=None):
-    """TODO"""
+    """Create an array filled with `data`.
+
+    Parameters
+    ----------
+    data : array_like
+        Data to store.
+    chunks : int or tuple of ints, optional
+        Chunk shape.
+    dtype : string or dtype, optional
+        NumPy dtype.
+    cname : string, optional
+        Name of compression library to use, e.g., 'blosclz', 'lz4', 'zlib',
+        'snappy'.
+    clevel : int, optional
+        Compression level, 0 means no compression.
+    shuffle : int, optional
+        Shuffle filter, 0 means no shuffle, 1 means byte shuffle, 2 means
+        bit shuffle.
+    synchronized : bool, optional
+        If True, each chunk will be protected with a lock to prevent data
+        collision during write operations.
+
+    Returns
+    -------
+    z : zarr.ext.Array
+
+    """
 
     # ensure data is array-like
     if not hasattr(data, 'shape') or not hasattr(data, 'dtype'):
diff --git a/zarr/ext.pyx b/zarr/ext.pyx
@@ -1,4 +1,6 @@
 # -*- coding: utf-8 -*-
+# cython: embedsignature=True
+# cython: profile=True
 from __future__ import absolute_import, print_function, division
 from threading import RLock
 import itertools
@@ -48,6 +50,8 @@ from zarr import defaults
 
 
 def blosc_version():
+    """Return the version of c-blosc that zarr was compiled with."""
+
     # all the 'decode' contorsions are for Python 3 returning actual strings
     ver_str = <char *> BLOSC_VERSION_STRING
     if hasattr(ver_str, "decode"):
@@ -59,6 +63,23 @@ def blosc_version():
 
 
 def get_cparams(cname=None, clevel=None, shuffle=None):
+    """Convenience function to normalise compression parameters.
+
+    If any values are None, they will be substituted with values from the
+    `zarr.defaults` module.
+
+    Parameters
+    ----------
+    cname : string, optional
+        Name of compression library to use, e.g., 'blosclz', 'lz4', 'zlib',
+        'snappy'.
+    clevel : int, optional
+        Compression level, 0 means no compression.
+    shuffle : int, optional
+        Shuffle filter, 0 means no shuffle, 1 means byte shuffle, 2 means
+        bit shuffle.
+
+    """
 
     # determine compressor
     cname = cname if cname is not None else defaults.cname
@@ -84,6 +105,10 @@ def get_cparams(cname=None, clevel=None, shuffle=None):
 
 
 def is_total_slice(item, shape):
+    """Determine whether `item` specifies a complete slice of an array with
+    the given `shape`. Used to optimise __setitem__ operations on the Chunk
+    class."""
+
     if item == Ellipsis:
         return True
     if item == slice(None):
@@ -104,11 +129,7 @@ cdef class Chunk:
                   shuffle=None, fill_value=None):
 
         # set shape and dtype
-        if isinstance(shape, int):
-            shape = (shape,)
-        else:
-            shape = tuple(shape)
-        self.shape = shape
+        self.shape = normalise_shape(shape)
         self.dtype = np.dtype(dtype)
 
         # set compression options
@@ -139,7 +160,6 @@ cdef class Chunk:
 
             else:
                 # ensure array is C contiguous
-                # TODO adapt to either C or F layout
                 array = np.ascontiguousarray(value, dtype=self.dtype)
                 if array.shape != self.shape:
                     raise ValueError('bad value shape')
@@ -161,7 +181,7 @@ cdef class Chunk:
             size_t nbytes, nbytes_check, cbytes, blocksize, itemsize
             char *dest
 
-        # ensure any existing data is cleared
+        # ensure any existing data is cleared and memory freed
         self.clear()
 
         # compute the total number of bytes in the array
@@ -204,7 +224,7 @@ cdef class Chunk:
         array = np.empty(self.shape, dtype=self.dtype)
 
         if self.data == NULL:
-            # data not initialised
+            # data not initialised, use fill_value
             if self.fill_value is not None:
                 array.fill(self.fill_value)
 
@@ -263,6 +283,8 @@ class Synchronized(object):
 
 
 def normalise_array_selection(item, shape):
+    """Convenience function to normalise a selection within an array with
+    the given `shape`."""
 
     # normalise item
     if isinstance(item, int):
@@ -290,6 +312,9 @@ def normalise_array_selection(item, shape):
 
 
 def normalise_axis_selection(item, l):
+    """Convenience function to normalise a selection within a single axis
+    of size `l`."""
+
     if isinstance(item, int):
         if item < 0:
             # handle wraparound
@@ -300,7 +325,7 @@ def normalise_axis_selection(item, l):
 
     elif isinstance(item, slice):
         if item.step is not None and item.step != 1:
-            raise NotImplementedError('TODO')
+            raise NotImplementedError('slice with step not supported')
         start = 0 if item.start is None else item.start
         stop = l if item.stop is None else item.stop
         if start < 0:
@@ -318,12 +343,15 @@ def normalise_axis_selection(item, l):
 
 
 def get_chunk_range(selection, chunks):
+    """Convenience function to get, for each dimension, the range of chunk
+    indices spanned by `selection`, for iterating over chunks."""
     chunk_range = [range(start//l, int(np.ceil(stop/l)))
                    for (start, stop), l in zip(selection, chunks)]
     return chunk_range
 
 
 def normalise_shape(shape):
+    """Convenience function to normalise the `shape` argument."""
     if isinstance(shape, int):
         shape = (shape,)
     else:
@@ -332,11 +360,14 @@ def normalise_shape(shape):
 
 
 def normalise_chunks(chunks, shape):
+    """Convenience function to normalise the `chunks` argument for an array
+    with the given `shape`."""
     if isinstance(chunks, int):
         chunks = (chunks,)
     else:
         chunks = tuple(chunks)
     if len(chunks) < len(shape):
+        # assume a single chunk spans each remaining dimension
         chunks += shape[len(chunks):]
     if len(chunks) != len(shape):
         raise ValueError('chunks and shape not compatible: %r, %r' %