@@ -802,6 +802,110 @@ raise an issue on the GitHub issue tracker with any profiling data you can provi
802
802
there may be opportunities to optimise further either within Zarr or within the mapping
803
803
interface to the storage.
804
804
805
+ .. _tutorial_copy:
806
+
807
+ Copying/migrating data
808
+ ----------------------
809
+
810
+ If you have some data in an HDF5 file and would like to copy some or all of it
811
+ into a Zarr group, or vice versa, the :func:`zarr.convenience.copy` and
812
+ :func:`zarr.convenience.copy_all` functions can be used. Here's an example
813
+ copying a group named 'foo' from an HDF5 file to a Zarr group::
814
+
815
+ >>> import h5py
816
+ >>> import zarr
817
+ >>> import numpy as np
818
+ >>> source = h5py.File('data/example.h5', mode='w')
819
+ >>> foo = source.create_group('foo')
820
+ >>> baz = foo.create_dataset('bar/baz', data=np.arange(100), chunks=(50,))
821
+ >>> spam = source.create_dataset('spam', data=np.arange(100, 200), chunks=(30,))
822
+ >>> zarr.tree(source)
823
+ /
824
+ ├── foo
825
+ │ └── bar
826
+ │ └── baz (100,) int64
827
+ └── spam (100,) int64
828
+ >>> dest = zarr.open_group('data/example.zarr', mode='w')
829
+ >>> from sys import stdout
830
+ >>> zarr.copy(source['foo'], dest, log=stdout)
831
+ copy /foo
832
+ copy /foo/bar
833
+ copy /foo/bar/baz (100,) int64
834
+ all done: 3 copied, 0 skipped, 800 bytes copied
835
+ (3, 0, 800)
836
+ >>> dest.tree() # N.B., no spam
837
+ /
838
+ └── foo
839
+ └── bar
840
+ └── baz (100,) int64
841
+ >>> source.close()
842
+
843
+ If, rather than copying a single group or dataset, you would like to copy all
844
+ groups and datasets, use :func:`zarr.convenience.copy_all`, e.g.::
845
+
846
+ >>> source = h5py.File('data/example.h5', mode='r')
847
+ >>> dest = zarr.open_group('data/example2.zarr', mode='w')
848
+ >>> zarr.copy_all(source, dest, log=stdout)
849
+ copy /foo
850
+ copy /foo/bar
851
+ copy /foo/bar/baz (100,) int64
852
+ copy /spam (100,) int64
853
+ all done: 4 copied, 0 skipped, 1,600 bytes copied
854
+ (4, 0, 1600)
855
+ >>> dest.tree()
856
+ /
857
+ ├── foo
858
+ │ └── bar
859
+ │ └── baz (100,) int64
860
+ └── spam (100,) int64
861
+
862
+ If you need to copy data between two Zarr groups, the
863
+ :func:`zarr.convenience.copy` and :func:`zarr.convenience.copy_all` functions can
864
+ be used and provide the most flexibility. However, if you want to copy data
865
+ in the most efficient way possible, without changing any configuration options,
866
+ the :func:`zarr.convenience.copy_store` function can be used. This function
867
+ copies data directly between the underlying stores, without any decompression or
868
+ re-compression, and so should be faster. E.g.::
869
+
870
+ >>> import zarr
871
+ >>> import numpy as np
872
+ >>> store1 = zarr.DirectoryStore('data/example.zarr')
873
+ >>> root = zarr.group(store1, overwrite=True)
874
+ >>> baz = root.create_dataset('foo/bar/baz', data=np.arange(100), chunks=(50,))
875
+ >>> spam = root.create_dataset('spam', data=np.arange(100, 200), chunks=(30,))
876
+ >>> root.tree()
877
+ /
878
+ ├── foo
879
+ │ └── bar
880
+ │ └── baz (100,) int64
881
+ └── spam (100,) int64
882
+ >>> from sys import stdout
883
+ >>> store2 = zarr.ZipStore('data/example.zip', mode='w')
884
+ >>> zarr.copy_store(store1, store2, log=stdout)
885
+ copy .zgroup
886
+ copy foo/.zgroup
887
+ copy foo/bar/.zgroup
888
+ copy foo/bar/baz/.zarray
889
+ copy foo/bar/baz/0
890
+ copy foo/bar/baz/1
891
+ copy spam/.zarray
892
+ copy spam/0
893
+ copy spam/1
894
+ copy spam/2
895
+ copy spam/3
896
+ all done: 11 copied, 0 skipped, 1,138 bytes copied
897
+ (11, 0, 1138)
898
+ >>> new_root = zarr.group(store2)
899
+ >>> new_root.tree()
900
+ /
901
+ ├── foo
902
+ │ └── bar
903
+ │ └── baz (100,) int64
904
+ └── spam (100,) int64
905
+ >>> new_root['foo/bar/baz'][:]
906
+ array([ 0, 1, 2, ..., 97, 98, 99])
907
+ >>> store2.close() # zip stores need to be closed
908
+
805
909
.. _tutorial_strings:
806
910
807
911
String arrays
0 commit comments