document copy functions

alimanfoo · alimanfoo · commit f6614b050860 · 2017-12-23T01:20:48.000Z
diff --git a/docs/release.rst b/docs/release.rst
@@ -127,6 +127,13 @@ Enhancements
 * **Added support for ``datetime64`` and ``timedelta64`` data types**;
   :issue:`85`, :issue:`215`.
 
+* **New copy functions**. The new functions :func:`zarr.convenience.copy` and
+  :func:`zarr.convenience.copy_all` provide a way to copy groups and/or arrays
+  between HDF5 and Zarr, or between two Zarr groups. The
+  :func:`zarr.convenience.copy_store` function provides a more efficient way
+  to copy data directly between two Zarr stores. :issue:`87`, :issue:`113`,
+  :issue:`137`, :issue:`217`.
+
 Bug fixes
 ~~~~~~~~~
 
diff --git a/docs/tutorial.rst b/docs/tutorial.rst
@@ -768,6 +768,110 @@ Here is an example using S3Map to read an array created previously::
     b'Hello from the cloud!'
 
 
+.. _tutorial_copy:
+
+Copying/migrating data
+----------------------
+
+If you have some data in an HDF5 file and would like to copy some or all of it
+into a Zarr group, or vice-versa, the :func:`zarr.convenience.copy` and
+:func:`zarr.convenience.copy_all` functions can be used. Here's an example
+copying a group named 'foo' from an HDF5 file to a Zarr group::
+
+    >>> import h5py
+    >>> import zarr
+    >>> import numpy as np
+    >>> source = h5py.File('data/example.h5', mode='w')
+    >>> foo = source.create_group('foo')
+    >>> baz = foo.create_dataset('bar/baz', data=np.arange(100), chunks=(50,))
+    >>> spam = source.create_dataset('spam', data=np.arange(100, 200), chunks=(30,))
+    >>> zarr.tree(source)
+    /
+     ├── foo
+     │   └── bar
+     │       └── baz (100,) int64
+     └── spam (100,) int64
+    >>> dest = zarr.open_group('data/example.zarr', mode='w')
+    >>> from sys import stdout
+    >>> zarr.copy(source['foo'], dest, log=stdout)
+    copy /foo
+    copy /foo/bar
+    copy /foo/bar/baz (100,) int64
+    all done: 3 copied, 0 skipped, 800 bytes copied
+    (3, 0, 800)
+    >>> dest.tree()  # N.B., no spam
+    /
+     └── foo
+         └── bar
+             └── baz (100,) int64
+    >>> source.close()
+
+If rather than copying a single group or dataset you would like to copy all
+groups and datasets, use :func:`zarr.convenience.copy_all`, e.g.::
+
+    >>> source = h5py.File('data/example.h5', mode='r')
+    >>> dest = zarr.open_group('data/example2.zarr', mode='w')
+    >>> zarr.copy_all(source, dest, log=stdout)
+    copy /foo
+    copy /foo/bar
+    copy /foo/bar/baz (100,) int64
+    copy /spam (100,) int64
+    all done: 4 copied, 0 skipped, 1,600 bytes copied
+    (4, 0, 1600)
+    >>> dest.tree()
+    /
+     ├── foo
+     │   └── bar
+     │       └── baz (100,) int64
+     └── spam (100,) int64
+
+If you need to copy data between two Zarr groups, the
+:func:`zarr.convenience.copy` and :func:`zarr.convenience.copy_all` functions can
+be used and provide the most flexibility. However, if you want to copy data
+in the most efficient way possible, without changing any configuration options,
+the :func:`zarr.convenience.copy_store` function can be used. This function
+copies data directly between the underlying stores, without any decompression or
+re-compression, and so should be faster. E.g.::
+
+    >>> import zarr
+    >>> import numpy as np
+    >>> store1 = zarr.DirectoryStore('data/example.zarr')
+    >>> root = zarr.group(store1, overwrite=True)
+    >>> baz = root.create_dataset('foo/bar/baz', data=np.arange(100), chunks=(50,))
+    >>> spam = root.create_dataset('spam', data=np.arange(100, 200), chunks=(30,))
+    >>> root.tree()
+    /
+     ├── foo
+     │   └── bar
+     │       └── baz (100,) int64
+     └── spam (100,) int64
+    >>> from sys import stdout
+    >>> store2 = zarr.ZipStore('data/example.zip', mode='w')
+    >>> zarr.copy_store(store1, store2, log=stdout)
+    copy .zgroup
+    copy foo/.zgroup
+    copy foo/bar/.zgroup
+    copy foo/bar/baz/.zarray
+    copy foo/bar/baz/0
+    copy foo/bar/baz/1
+    copy spam/.zarray
+    copy spam/0
+    copy spam/1
+    copy spam/2
+    copy spam/3
+    all done: 11 copied, 0 skipped, 1,138 bytes copied
+    (11, 0, 1138)
+    >>> new_root = zarr.group(store2)
+    >>> new_root.tree()
+    /
+     ├── foo
+     │   └── bar
+     │       └── baz (100,) int64
+     └── spam (100,) int64
+    >>> new_root['foo/bar/baz'][:]
+    array([ 0,  1,  2,  ..., 97, 98, 99])
+    >>> store2.close()  # zip stores need to be closed
+
 .. _tutorial_strings:
 
 String arrays
diff --git a/zarr/__init__.py b/zarr/__init__.py
@@ -13,4 +13,5 @@
 from zarr.codecs import *
 from zarr.convenience import (open, save, save_array, save_group, load, copy_store,
                               copy, copy_all, tree)
+from zarr.errors import CopyError, MetadataError, PermissionError
 from zarr.version import version as __version__
diff --git a/zarr/convenience.py b/zarr/convenience.py
@@ -684,32 +684,70 @@ def copy(source, dest, name=None, shallow=False, without_attrs=False, log=None,
 
     Examples
     --------
-    >>> import h5py
-    >>> import zarr
-    >>> import numpy as np
-    >>> source = h5py.File('data/example.h5', mode='w')
-    >>> foo = source.create_group('foo')
-    >>> baz = foo.create_dataset('bar/baz', data=np.arange(100), chunks=(50,))
-    >>> spam = source.create_dataset('spam', data=np.arange(100, 200), chunks=(30,))
-    >>> zarr.tree(source)
-    /
-     ├── foo
-     │   └── bar
-     │       └── baz (100,) int64
-     └── spam (100,) int64
-    >>> dest = zarr.group()
-    >>> from sys import stdout
-    >>> zarr.copy(source['foo'], dest, log=stdout)
-    copy /foo
-    copy /foo/bar
-    copy /foo/bar/baz (100,) int64
-    all done: 3 copied, 0 skipped, 800 bytes copied
-    (3, 0, 800)
-    >>> dest.tree()  # N.B., no spam
-    /
-     └── foo
-         └── bar
-             └── baz (100,) int64
+    Here's an example of copying a group named 'foo' from an HDF5 file to a
+    Zarr group::
+
+        >>> import h5py
+        >>> import zarr
+        >>> import numpy as np
+        >>> source = h5py.File('data/example.h5', mode='w')
+        >>> foo = source.create_group('foo')
+        >>> baz = foo.create_dataset('bar/baz', data=np.arange(100), chunks=(50,))
+        >>> spam = source.create_dataset('spam', data=np.arange(100, 200), chunks=(30,))
+        >>> zarr.tree(source)
+        /
+         ├── foo
+         │   └── bar
+         │       └── baz (100,) int64
+         └── spam (100,) int64
+        >>> dest = zarr.group()
+        >>> from sys import stdout
+        >>> zarr.copy(source['foo'], dest, log=stdout)
+        copy /foo
+        copy /foo/bar
+        copy /foo/bar/baz (100,) int64
+        all done: 3 copied, 0 skipped, 800 bytes copied
+        (3, 0, 800)
+        >>> dest.tree()  # N.B., no spam
+        /
+         └── foo
+             └── bar
+                 └── baz (100,) int64
+        >>> source.close()
+
+    The ``if_exists`` parameter provides options for how to handle pre-existing data in
+    the destination. Here are some examples of these options, also using
+    ``dry_run=True`` to find out what would happen without actually copying anything::
+
+        >>> source = zarr.group()
+        >>> dest = zarr.group()
+        >>> baz = source.create_dataset('foo/bar/baz', data=np.arange(100))
+        >>> spam = source.create_dataset('foo/spam', data=np.arange(1000))
+        >>> existing_spam = dest.create_dataset('foo/spam', data=np.arange(1000))
+        >>> from sys import stdout
+        >>> try:
+        ...     zarr.copy(source['foo'], dest, log=stdout, dry_run=True)
+        ... except zarr.CopyError as e:
+        ...     print(e)
+        ...
+        copy /foo
+        copy /foo/bar
+        copy /foo/bar/baz (100,) int64
+        an object 'spam' already exists in destination '/foo'
+        >>> zarr.copy(source['foo'], dest, log=stdout, if_exists='replace', dry_run=True)
+        copy /foo
+        copy /foo/bar
+        copy /foo/bar/baz (100,) int64
+        copy /foo/spam (1000,) int64
+        dry run: 4 copied, 0 skipped
+        (4, 0, 0)
+        >>> zarr.copy(source['foo'], dest, log=stdout, if_exists='skip', dry_run=True)
+        copy /foo
+        copy /foo/bar
+        copy /foo/bar/baz (100,) int64
+        skip /foo/spam (1000,) int64
+        dry run: 3 copied, 1 skipped
+        (3, 1, 0)
 
     """
 
@@ -978,6 +1016,7 @@ def copy_all(source, dest, shallow=False, without_attrs=False, log=None,
      │   └── bar
      │       └── baz (100,) int64
      └── spam (100,) int64
+    >>> source.close()
 
     """