Skip to content

Commit 184a64d

Browse files
committed
fix ZipStore performance; resolves #66
1 parent 96c34bb commit 184a64d

File tree

8 files changed

+774
-47
lines changed

8 files changed

+774
-47
lines changed

docs/api/storage.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,7 @@ can be used as a Zarr array store.
1313
.. autoclass:: DirectoryStore
1414
.. autoclass:: ZipStore
1515

16+
.. automethod:: close
17+
.. automethod:: flush
18+
1619
.. autofunction:: migrate_1to2

docs/spec/v2.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -442,6 +442,7 @@ Here is the same example using a Zip file as storage::
442442
>>> sub_grp = root_grp.create_group('foo')
443443
>>> a = sub_grp.create_dataset('bar', shape=(20, 20), chunks=(10, 10))
444444
>>> a[:] = 42
445+
>>> store.close()
445446

446447
What has been stored::
447448

docs/tutorial.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -515,6 +515,7 @@ Here is an example storing an array directly into a Zip file::
515515
nbytes: 3.8M; nbytes_stored: 21.8K; ratio: 179.2; initialized: 100/100
516516
compressor: Blosc(cname='lz4', clevel=5, shuffle=1)
517517
store: ZipStore
518+
>>> store.close()
518519
>>> import os
519520
>>> os.path.getsize('example.zip')
520521
30721
@@ -536,12 +537,17 @@ Re-open and check that data have been written::
536537
[42, 42, 42, ..., 42, 42, 42],
537538
[42, 42, 42, ..., 42, 42, 42],
538539
[42, 42, 42, ..., 42, 42, 42]], dtype=int32)
540+
>>> store.close()
539541

540542
Note that there are some restrictions on how Zip files can be used,
541543
because items within a Zip file cannot be updated in place. This means
542544
that data in the array should only be written once and write
543545
operations should be aligned with chunk boundaries.
544546

547+
Note also that the ``close()`` method must be called after writing any data to
548+
the store, otherwise essential records will not be written to the underlying
549+
zip file.
550+
545551
The Dask project has implementations of the ``MutableMapping``
546552
interface for distributed storage systems, see the `S3Map
547553
<http://s3fs.readthedocs.io/en/latest/api.html#s3fs.mapping.S3Map>`_
Lines changed: 343 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,343 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"metadata": {
7+
"collapsed": false
8+
},
9+
"outputs": [
10+
{
11+
"data": {
12+
"text/plain": [
13+
"'2.0.2.dev0+dirty'"
14+
]
15+
},
16+
"execution_count": 1,
17+
"metadata": {},
18+
"output_type": "execute_result"
19+
}
20+
],
21+
"source": [
22+
"import sys\n",
23+
"sys.path.insert(0, '..')\n",
24+
"import zarr\n",
25+
"zarr.__version__"
26+
]
27+
},
28+
{
29+
"cell_type": "code",
30+
"execution_count": 2,
31+
"metadata": {
32+
"collapsed": false
33+
},
34+
"outputs": [
35+
{
36+
"data": {
37+
"text/plain": [
38+
"Array(/3L/calldata/genotype, (7449486, 773, 2), int8, chunks=(13107, 40, 2), order=C)\n",
39+
" nbytes: 10.7G; nbytes_stored: 193.5M; ratio: 56.7; initialized: 11380/11380\n",
40+
" compressor: Blosc(cname='zstd', clevel=1, shuffle=2)\n",
41+
" store: ZipStore"
42+
]
43+
},
44+
"execution_count": 2,
45+
"metadata": {},
46+
"output_type": "execute_result"
47+
}
48+
],
49+
"source": [
50+
"store = zarr.ZipStore('/data/coluzzi/ag1000g/data/phase1/release/AR3.1/haplotypes/main/zarr2/zstd/ag1000g.phase1.ar3.1.haplotypes.zip',\n",
51+
" mode='r')\n",
52+
"grp = zarr.Group(store)\n",
53+
"z = grp['3L/calldata/genotype']\n",
54+
"z"
55+
]
56+
},
57+
{
58+
"cell_type": "code",
59+
"execution_count": 5,
60+
"metadata": {
61+
"collapsed": false
62+
},
63+
"outputs": [
64+
{
65+
"name": "stdout",
66+
"output_type": "stream",
67+
"text": [
68+
" 1832 function calls in 0.024 seconds\n",
69+
"\n",
70+
" Ordered by: cumulative time\n",
71+
"\n",
72+
" ncalls tottime percall cumtime percall filename:lineno(function)\n",
73+
" 1 0.000 0.000 0.024 0.024 {built-in method builtins.exec}\n",
74+
" 1 0.000 0.000 0.024 0.024 <string>:1(<module>)\n",
75+
" 1 0.000 0.000 0.024 0.024 core.py:292(__getitem__)\n",
76+
" 20 0.000 0.000 0.023 0.001 core.py:539(_chunk_getitem)\n",
77+
" 20 0.000 0.000 0.020 0.001 core.py:679(_decode_chunk)\n",
78+
" 20 0.000 0.000 0.020 0.001 codecs.py:355(decode)\n",
79+
" 20 0.020 0.001 0.020 0.001 {zarr.blosc.decompress}\n",
80+
" 20 0.000 0.000 0.002 0.000 storage.py:766(__getitem__)\n",
81+
" 20 0.000 0.000 0.001 0.000 zipfile.py:1235(open)\n",
82+
" 20 0.000 0.000 0.001 0.000 zipfile.py:821(read)\n",
83+
" 20 0.000 0.000 0.001 0.000 zipfile.py:901(_read1)\n",
84+
" 80 0.000 0.000 0.001 0.000 zipfile.py:660(read)\n",
85+
" 20 0.000 0.000 0.000 0.000 zipfile.py:854(_update_crc)\n",
86+
" 40 0.000 0.000 0.000 0.000 {built-in method zlib.crc32}\n",
87+
" 80 0.000 0.000 0.000 0.000 {method 'read' of '_io.BufferedReader' objects}\n",
88+
" 20 0.000 0.000 0.000 0.000 zipfile.py:937(_read2)\n",
89+
" 80 0.000 0.000 0.000 0.000 core.py:390(<genexpr>)\n",
90+
" 20 0.000 0.000 0.000 0.000 zipfile.py:953(close)\n",
91+
" 20 0.000 0.000 0.000 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n",
92+
" 20 0.000 0.000 0.000 0.000 util.py:106(is_total_slice)\n",
93+
" 20 0.000 0.000 0.000 0.000 zipfile.py:708(__init__)\n",
94+
" 20 0.000 0.000 0.000 0.000 {method 'decode' of 'bytes' objects}\n",
95+
" 20 0.000 0.000 0.000 0.000 core.py:676(_chunk_key)\n",
96+
" 80 0.000 0.000 0.000 0.000 {method 'seek' of '_io.BufferedReader' objects}\n",
97+
" 20 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.frombuffer}\n",
98+
" 80 0.000 0.000 0.000 0.000 core.py:398(<genexpr>)\n",
99+
" 20 0.000 0.000 0.000 0.000 {method 'join' of 'str' objects}\n",
100+
" 20 0.000 0.000 0.000 0.000 core.py:386(<listcomp>)\n",
101+
" 20 0.000 0.000 0.000 0.000 {built-in method builtins.all}\n",
102+
" 40 0.000 0.000 0.000 0.000 util.py:121(<genexpr>)\n",
103+
" 231 0.000 0.000 0.000 0.000 {built-in method builtins.isinstance}\n",
104+
" 20 0.000 0.000 0.000 0.000 cp437.py:14(decode)\n",
105+
" 80 0.000 0.000 0.000 0.000 {method 'tell' of '_io.BufferedReader' objects}\n",
106+
" 20 0.000 0.000 0.000 0.000 zipfile.py:667(close)\n",
107+
" 20 0.000 0.000 0.000 0.000 {built-in method _struct.unpack}\n",
108+
" 140 0.000 0.000 0.000 0.000 {built-in method builtins.max}\n",
109+
" 20 0.000 0.000 0.000 0.000 {function ZipExtFile.close at 0x7f8cd5ca2048}\n",
110+
" 20 0.000 0.000 0.000 0.000 zipfile.py:1194(getinfo)\n",
111+
" 140 0.000 0.000 0.000 0.000 {built-in method builtins.min}\n",
112+
" 20 0.000 0.000 0.000 0.000 threading.py:1224(current_thread)\n",
113+
" 20 0.000 0.000 0.000 0.000 zipfile.py:654(__init__)\n",
114+
" 1 0.000 0.000 0.000 0.000 util.py:195(get_chunk_range)\n",
115+
" 20 0.000 0.000 0.000 0.000 {built-in method _codecs.charmap_decode}\n",
116+
" 1 0.000 0.000 0.000 0.000 util.py:166(normalize_array_selection)\n",
117+
" 1 0.000 0.000 0.000 0.000 util.py:198(<listcomp>)\n",
118+
" 20 0.000 0.000 0.000 0.000 zipfile.py:1715(_fpclose)\n",
119+
" 20 0.000 0.000 0.000 0.000 {method 'get' of 'dict' objects}\n",
120+
" 63 0.000 0.000 0.000 0.000 {built-in method builtins.len}\n",
121+
" 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n",
122+
" 2 0.000 0.000 0.000 0.000 util.py:182(<genexpr>)\n",
123+
" 20 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n",
124+
" 20 0.000 0.000 0.000 0.000 {built-in method _thread.get_ident}\n",
125+
" 1 0.000 0.000 0.000 0.000 util.py:130(normalize_axis_selection)\n",
126+
" 20 0.000 0.000 0.000 0.000 zipfile.py:636(_get_decompressor)\n",
127+
" 20 0.000 0.000 0.000 0.000 threading.py:1298(main_thread)\n",
128+
" 4 0.000 0.000 0.000 0.000 core.py:373(<genexpr>)\n",
129+
" 3 0.000 0.000 0.000 0.000 util.py:187(<genexpr>)\n",
130+
" 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n",
131+
"\n",
132+
"\n"
133+
]
134+
}
135+
],
136+
"source": [
137+
"import cProfile\n",
138+
"cProfile.run('z[:10]', sort='cumtime')"
139+
]
140+
},
141+
{
142+
"cell_type": "code",
143+
"execution_count": 6,
144+
"metadata": {
145+
"collapsed": false
146+
},
147+
"outputs": [
148+
{
149+
"data": {
150+
"text/plain": [
151+
"'0.11.0'"
152+
]
153+
},
154+
"execution_count": 6,
155+
"metadata": {},
156+
"output_type": "execute_result"
157+
}
158+
],
159+
"source": [
160+
"import dask\n",
161+
"import dask.array as da\n",
162+
"dask.__version__"
163+
]
164+
},
165+
{
166+
"cell_type": "code",
167+
"execution_count": 7,
168+
"metadata": {
169+
"collapsed": false
170+
},
171+
"outputs": [
172+
{
173+
"data": {
174+
"text/plain": [
175+
"dask.array<array-f..., shape=(7449486, 773, 2), dtype=int8, chunksize=(13107, 40, 2)>"
176+
]
177+
},
178+
"execution_count": 7,
179+
"metadata": {},
180+
"output_type": "execute_result"
181+
}
182+
],
183+
"source": [
184+
"d = da.from_array(z, chunks=z.chunks)\n",
185+
"d"
186+
]
187+
},
188+
{
189+
"cell_type": "code",
190+
"execution_count": 8,
191+
"metadata": {
192+
"collapsed": false
193+
},
194+
"outputs": [
195+
{
196+
"name": "stdout",
197+
"output_type": "stream",
198+
"text": [
199+
"CPU times: user 3min 35s, sys: 4.36 s, total: 3min 40s\n",
200+
"Wall time: 29.5 s\n"
201+
]
202+
},
203+
{
204+
"data": {
205+
"text/plain": [
206+
"array([[3, 0],\n",
207+
" [1, 0],\n",
208+
" [2, 0],\n",
209+
" ..., \n",
210+
" [2, 8],\n",
211+
" [8, 8],\n",
212+
" [0, 1]])"
213+
]
214+
},
215+
"execution_count": 8,
216+
"metadata": {},
217+
"output_type": "execute_result"
218+
}
219+
],
220+
"source": [
221+
"%time d.sum(axis=1).compute()"
222+
]
223+
},
224+
{
225+
"cell_type": "code",
226+
"execution_count": 9,
227+
"metadata": {
228+
"collapsed": false
229+
},
230+
"outputs": [
231+
{
232+
"data": {
233+
"text/plain": [
234+
"Array(/3L/calldata/genotype, (7449486, 773, 2), int8, chunks=(13107, 40, 2), order=C)\n",
235+
" nbytes: 10.7G; nbytes_stored: 193.5M; ratio: 56.7; initialized: 11380/11380\n",
236+
" compressor: Blosc(cname='zstd', clevel=1, shuffle=2)\n",
237+
" store: DirectoryStore"
238+
]
239+
},
240+
"execution_count": 9,
241+
"metadata": {},
242+
"output_type": "execute_result"
243+
}
244+
],
245+
"source": [
246+
"# compare with same data via directory store\n",
247+
"store_dir = zarr.DirectoryStore('/data/coluzzi/ag1000g/data/phase1/release/AR3.1/haplotypes/main/zarr2/zstd/ag1000g.phase1.ar3.1.haplotypes')\n",
248+
"grp_dir = zarr.Group(store_dir)\n",
249+
"z_dir = grp_dir['3L/calldata/genotype']\n",
250+
"z_dir"
251+
]
252+
},
253+
{
254+
"cell_type": "code",
255+
"execution_count": 10,
256+
"metadata": {
257+
"collapsed": false
258+
},
259+
"outputs": [
260+
{
261+
"data": {
262+
"text/plain": [
263+
"dask.array<array-7..., shape=(7449486, 773, 2), dtype=int8, chunksize=(13107, 40, 2)>"
264+
]
265+
},
266+
"execution_count": 10,
267+
"metadata": {},
268+
"output_type": "execute_result"
269+
}
270+
],
271+
"source": [
272+
"d_dir = da.from_array(z_dir, chunks=z_dir.chunks)\n",
273+
"d_dir"
274+
]
275+
},
276+
{
277+
"cell_type": "code",
278+
"execution_count": 11,
279+
"metadata": {
280+
"collapsed": false
281+
},
282+
"outputs": [
283+
{
284+
"name": "stdout",
285+
"output_type": "stream",
286+
"text": [
287+
"CPU times: user 3min 39s, sys: 4.91 s, total: 3min 44s\n",
288+
"Wall time: 31.1 s\n"
289+
]
290+
},
291+
{
292+
"data": {
293+
"text/plain": [
294+
"array([[3, 0],\n",
295+
" [1, 0],\n",
296+
" [2, 0],\n",
297+
" ..., \n",
298+
" [2, 8],\n",
299+
" [8, 8],\n",
300+
" [0, 1]])"
301+
]
302+
},
303+
"execution_count": 11,
304+
"metadata": {},
305+
"output_type": "execute_result"
306+
}
307+
],
308+
"source": [
309+
"%time d_dir.sum(axis=1).compute()"
310+
]
311+
},
312+
{
313+
"cell_type": "code",
314+
"execution_count": null,
315+
"metadata": {
316+
"collapsed": true
317+
},
318+
"outputs": [],
319+
"source": []
320+
}
321+
],
322+
"metadata": {
323+
"kernelspec": {
324+
"display_name": "Python 3",
325+
"language": "python",
326+
"name": "python3"
327+
},
328+
"language_info": {
329+
"codemirror_mode": {
330+
"name": "ipython",
331+
"version": 3
332+
},
333+
"file_extension": ".py",
334+
"mimetype": "text/x-python",
335+
"name": "python",
336+
"nbconvert_exporter": "python",
337+
"pygments_lexer": "ipython3",
338+
"version": "3.5.1"
339+
}
340+
},
341+
"nbformat": 4,
342+
"nbformat_minor": 1
343+
}

0 commit comments

Comments
 (0)