Skip to content

Commit 6002b0f

Browse files
committed
copy chunk by chunk
1 parent fd8d96a commit 6002b0f

File tree

2 files changed

+14
-3
lines changed

2 files changed

+14
-3
lines changed

zarr/convenience.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,10 @@
44
from collections import Mapping
55
import io
66
import re
7+
import itertools
8+
9+
10+
import numpy as np
711

812

913
from zarr.core import Array
@@ -611,6 +615,7 @@ def _copy(log, source, dest, name, root, shallow, without_attrs, overwrite, **cr
611615
else:
612616
if dest_h5py:
613617
# zarr -> h5py; use some vaguely sensible defaults
618+
kws.setdefault('chunks', True)
614619
kws.setdefault('compression', 'gzip')
615620
kws.setdefault('compression_opts', 1)
616621
kws.setdefault('shuffle', False)
@@ -621,9 +626,14 @@ def _copy(log, source, dest, name, root, shallow, without_attrs, overwrite, **cr
621626
# create new dataset in destination
622627
ds = dest.create_dataset(name, shape=source.shape, dtype=source.dtype, **kws)
623628

624-
# copy data - N.B., if dest is h5py this will load all data into memory
629+
# copy data - N.B., go chunk by chunk to avoid loading everything into memory
625630
log('{} -> {}'.format(source.name, ds.name))
626-
ds[:] = source
631+
shape = ds.shape
632+
chunks = ds.chunks
633+
chunk_offsets = [range(0, s, c) for s, c in zip(shape, chunks)]
634+
for offset in itertools.product(*chunk_offsets):
635+
sel = tuple(slice(o, min(s, o + c)) for o, s, c in zip(offset, shape, chunks))
636+
ds[sel] = source[sel]
627637

628638
# copy attributes
629639
if not without_attrs:

zarr/tests/test_convenience.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -274,7 +274,8 @@ def _test_copy(new_source, new_dest):
274274
foo.attrs['experiment'] = 'weird science'
275275
baz = foo.create_dataset('bar/baz', data=np.arange(100), chunks=(50,))
276276
baz.attrs['units'] = 'metres'
277-
source.create_dataset('spam', data=np.arange(100, 200), chunks=(30,))
277+
source.create_dataset('spam', data=np.arange(100, 200).reshape(20, 5),
278+
chunks=(10, 2))
278279

279280
# copy array with default options
280281
copy(source['foo/bar/baz'], dest)

0 commit comments

Comments
 (0)