Skip to content

Commit 8ad5dab

Browse files
nhz2sjkelly
andauthored
Add read only ZipStore (#123)
* Add zip storage * more testing * close zipfile in python * fix spacing * update version * Update src/Storage/Storage.jl Co-authored-by: Steve Kelly <[email protected]> --------- Co-authored-by: Steve Kelly <[email protected]>
1 parent 1dda5f1 commit 8ad5dab

File tree

7 files changed

+248
-3
lines changed

7 files changed

+248
-3
lines changed

Project.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
1717
OpenSSL = "4d8831e6-92b7-49fb-bdf8-b643e874388c"
1818
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
1919
URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4"
20+
ZipArchives = "49080126-0e18-4c2a-b176-c102e4b3760c"
2021

2122
[compat]
2223
AWSS3 = "0.10"
@@ -30,6 +31,7 @@ LRUCache = "1"
3031
OffsetArrays = "0.11, 1.0"
3132
OpenSSL = "1"
3233
URIs = "1"
34+
ZipArchives = "1"
3335
julia = "1.2"
3436

3537
[extras]

src/Storage/Storage.jl

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,9 @@ abstract type AbstractStore end
55

66
#Define the interface
77
"""
8-
storagesize(d::AbstractStore)
8+
storagesize(d::AbstractStore, p::AbstractString)
99
10-
This function shall return the size of all data files in a store.
10+
This function shall return the size of all data files in a store at path `p`.
1111
"""
1212
function storagesize end
1313

@@ -168,3 +168,4 @@ include("s3store.jl")
168168
include("gcstore.jl")
169169
include("consolidated.jl")
170170
include("http.jl")
171+
include("zipstore.jl")

src/Storage/zipstore.jl

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
import ZipArchives
2+
"""
    ZipStore{T <: AbstractVector{UInt8}} <: AbstractStore

A read only store that wraps an `AbstractVector{UInt8}` that contains a zip file.

The single field `r` holds the `ZipArchives.ZipBufferReader` built over those bytes.
"""
struct ZipStore{T <: AbstractVector{UInt8}} <: AbstractStore
    r::ZipArchives.ZipBufferReader{T}
end

# Convenience constructor: wrap the raw bytes in a `ZipBufferReader` first.
function ZipStore(data::AbstractVector{UInt8})
    return ZipStore(ZipArchives.ZipBufferReader(data))
end

# Fixed one-line description, independent of the archive contents.
function Base.show(io::IO, ::ZipStore)
    return print(io, "Read Only Zip Storage")
end
16+
17+
# Read the bytes stored under key `k`. Returns `nothing` when
# `ZipArchives.zip_findlast_entry` reports no entry for that name.
function Base.getindex(d::ZipStore, k::AbstractString)::Union{Nothing, Vector{UInt8}}
    entry = ZipArchives.zip_findlast_entry(d.r, k)
    entry === nothing && return nothing
    return ZipArchives.zip_readentry(d.r, entry)
end
25+
26+
# Normalize `p` for prefix matching: an empty string, or one that already ends
# in '/', is returned as is; anything else gets a trailing '/' appended.
function _make_prefix(p)::String
    if isempty(p) || endswith(p, '/')
        return p
    end
    return p * '/'
end
27+
28+
# Sum the uncompressed sizes of all entries whose name starts with path `p`,
# leaving out the `.zattrs`, `.zarray` and `.zgroup` files.
function storagesize(d::ZipStore, p)::Int64
    prefix::String = _make_prefix(p)
    skipped_names = (".zattrs", ".zarray", ".zgroup")
    # Accumulate as Int128; the declared return type converts the sum to Int64.
    total::Int128 = Int128(0)
    for idx in 1:ZipArchives.zip_nentries(d.r)
        entryname = ZipArchives.zip_name(d.r, idx)
        startswith(entryname, prefix) || continue
        # Only the part after the final '/' decides whether the entry is skipped.
        lastpart = last(split(entryname, '/'))
        in(lastpart, skipped_names) && continue
        total += ZipArchives.zip_uncompressed_size(d.r, idx)
    end
    return total
end
42+
43+
# List the names of the immediate sub-directories of path `p`.
#
# Every entry name that starts with the normalized prefix and does not itself
# end in '/' is inspected; if what remains after the prefix still contains a
# '/', its first path component is a sub-directory of `p`. Names are gathered
# in a `Set`, so each appears once and the order of the result is unspecified.
function subdirs(d::ZipStore, p)::Vector{String}
    prefix::String = _make_prefix(p)
    o = Set{String}()
    for i in 1:ZipArchives.zip_nentries(d.r)
        name = ZipArchives.zip_name(d.r, i)
        if startswith(name, prefix) && !endswith(name, '/')
            # Strip the prefix by code units so the remainder is relative to `p`.
            chopped_name = SubString(name, 1 + ncodeunits(prefix))
            # The membership operator was missing here, which left the
            # condition unparsable; a remaining '/' means a nested entry.
            if '/' in chopped_name
                push!(o, first(split(chopped_name, '/')))
            end
        end
    end
    return collect(o)
end
57+
# List the keys (files) that sit directly at path `p`.
#
# Every entry name that starts with the normalized prefix and does not itself
# end in '/' is inspected; if what remains after the prefix contains no '/',
# the entry is a direct child of `p` and that remainder is its key. Keys are
# gathered in a `Set`, so the order of the result is unspecified.
function subkeys(d::ZipStore, p)::Vector{String}
    prefix::String = _make_prefix(p)
    o = Set{String}()
    for i in 1:ZipArchives.zip_nentries(d.r)
        name = ZipArchives.zip_name(d.r, i)
        if startswith(name, prefix) && !endswith(name, '/')
            # Strip the prefix by code units so the remainder is relative to `p`.
            chopped_name = SubString(name, 1 + ncodeunits(prefix))
            # The "not in" operator was missing here, which left the condition
            # unparsable; entries with a remaining '/' belong to sub-directories.
            if !('/' in chopped_name)
                push!(o, chopped_name)
            end
        end
    end
    return collect(o)
end
71+
72+
# Zip archives are generally append only
73+
# so it doesn't quite work to make ZipStore writable.
74+
# The idea is if you want a zipfile, you should first use one of the
75+
# regular mutable stores, then save it to a zip archive.
"""
    writezip(io::IO, s::AbstractStore, p=""; kwargs...)

Write an AbstractStore to an IO as a zip archive.

Only the content found below path `p` is written. Keyword arguments are passed
on to `ZipArchives.ZipWriter`.
"""
function writezip(io::IO, s::AbstractStore, p=""; kwargs...)
    root = String(p)
    # Function-first call form; this is what the `do` block syntax expands to.
    return ZipArchives.ZipWriter(w -> _writezip(w, s, root), io; kwargs...)
end
86+
# Recursively copy everything below path `p` of store `s` into the zip writer `w`:
# first the keys directly at `p`, then each sub-directory in turn.
function _writezip(w::ZipArchives.ZipWriter, s::AbstractStore, p::String)
    prefix = _make_prefix(p)
    for key in subkeys(s, p)
        entryname = prefix * key
        bytes = s[entryname]
        # A lookup that returns `nothing` produces no archive entry.
        bytes === nothing && continue
        ZipArchives.zip_writefile(w, entryname, bytes)
    end
    for dir in subdirs(s, p)
        _writezip(w, s, prefix * dir)
    end
    return nothing
end

src/ZGroup.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,7 @@ function zcreate(::Type{T},g::ZGroup, name::AbstractString, addargs...; kwargs..
156156
end
157157

158158
HTTP.serve(s::Union{ZArray,ZGroup}, args...; kwargs...) = HTTP.serve(s.storage, s.path, args...; kwargs...)
159+
# Write a `ZArray` or `ZGroup` to `io` as a zip archive by forwarding its
# storage and path, together with any keyword arguments, to the store method.
function writezip(io::IO, s::Union{ZArray,ZGroup}; kwargs...)
    return writezip(io, s.storage, s.path; kwargs...)
end
159160
function consolidate_metadata(z::Union{ZArray,ZGroup})
160161
z.writeable || throw(Base.IOError("Zarr group is not writeable. Please re-open in write mode to create an array",0))
161162
consolidate_metadata(z.storage,z.path)

test/Project.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ Conda = "8f4d0f93-b110-5947-807f-2305c1781a2d"
33
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
44
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
55
Minio = "4281f0d9-7ae0-406e-9172-b7277c1efa20"
6+
Mmap = "a63ad114-7e13-5084-954f-fe012c677804"
67
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
78
PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0"
89
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"

test/python.jl

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
###
66
@testset "Python zarr implementation" begin
77

8+
import Mmap
89
using PyCall
910
import PyCall: @py_str
1011
#If we are on conda, import zarr
@@ -48,10 +49,16 @@ for t in dtypes, co in compressors
4849
a = zcreate(t, g,string("azerodim",t,compstr), compressor=comp)
4950
a[] = testzerodimarrays[t]
5051
end
52+
#Also save as zip file.
53+
open(pjulia*".zip";write=true) do io
54+
Zarr.writezip(io, g)
55+
end
56+
5157
# Test reading in python
58+
for julia_path in (pjulia, pjulia*".zip")
5259
py"""
5360
import zarr
54-
g = zarr.open_group($pjulia)
61+
g = zarr.open_group($julia_path)
5562
gatts = g.attrs
5663
"""
5764

@@ -111,6 +118,10 @@ for i=1:length(dtypes), co in compressors
111118
@test py"ar.shape" == ()
112119
@test convert(t, py"ar[()]") == testzerodimarrays[t]
113120
end
121+
py"""
122+
g.store.close()
123+
"""
124+
end
114125

115126
## Now the other way around, we create a zarr array using the python lib and read back into julia
116127
data = rand(Int32,2,6,10)
@@ -160,6 +171,37 @@ a1[:,1,1] = 1:10
160171
@test a1[:,1,1] == 1:10
161172
# Test reading the string array
162173
@test String(g["a2"][:])=="hallo"
174+
175+
176+
# Test zip file can be read
177+
ppythonzip = ppython*".zip"
178+
py"""
179+
import numcodecs
180+
import numpy as np
181+
store = zarr.ZipStore($ppythonzip, mode="w")
182+
g = zarr.group(store=store)
183+
g.attrs["groupatt"] = "Hi"
184+
z1 = g.create_dataset("a1", shape=(2,6,10),chunks=(1,2,3), dtype='i4')
185+
z1[:,:,:]=$data
186+
z1.attrs["test"]={"b": 6}
187+
z2 = g.create_dataset("a2", shape=(5,),chunks=(5,), dtype='S1', compressor=numcodecs.Zlib())
188+
z2[:]=[k for k in 'hallo']
189+
z3 = g.create_dataset('a3', shape=(2,), dtype=str)
190+
z3[:]=np.asarray(['test1', 'test234'], dtype='O')
191+
store.close()
192+
"""
193+
194+
g = zopen(Zarr.ZipStore(Mmap.mmap(ppythonzip)))
195+
@test g isa Zarr.ZGroup
196+
@test g.attrs["groupatt"] == "Hi"
197+
a1 = g["a1"]
198+
@test a1 isa ZArray
199+
@test a1[:,:,:]==permutedims(data,(3,2,1))
200+
@test a1.attrs["test"]==Dict("b"=>6)
201+
# Test reading the string array
202+
@test String(g["a2"][:])=="hallo"
203+
@test g["a3"] == ["test1", "test234"]
204+
163205
end
164206

165207
@testset "Python datetime types" begin

test/storage.jl

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,77 @@ function test_store_common(ds)
4949
@test !Zarr.isemptysub(ds,"bar/")
5050
end
5151

"""
    test_read_only_store_common(converter, closer)

Function to test the interface of a read only AbstractStore. Every complete implementation should pass this test.

`converter` is a function that takes a Zarr.DictStore, and converts it to a read only store.

`closer` is a function that gets called to close the read only store. It defaults to a
function that does nothing.

Because the store under test cannot be written to, every change is made on the
backing `DictStore` and the read only store is then closed and converted again.
"""
function test_read_only_store_common(converter, closer=((args...) -> nothing))
    # The default for `closer` is an anonymous function rather than
    # `Returns(nothing)`, because `Returns` is only available from Julia 1.7.
    ds = Zarr.DictStore()
    rs = converter(ds)
    @test !Zarr.is_zgroup(rs,"")

    closer(rs)
    ds[".zgroup"]=rand(UInt8,50)
    rs = converter(ds)

    @test haskey(rs,".zgroup")

    @test Zarr.is_zgroup(rs,"")
    @test !Zarr.is_zarray(rs,"")

    @test isempty(Zarr.subdirs(rs,""))
    @test sort(collect(Zarr.subkeys(rs,"")))==[".zgroup"]

    #Create an array below the root
    @test !Zarr.is_zarray(rs,"bar")

    closer(rs)
    ds["bar/.zarray"] = rand(UInt8,50)
    rs = converter(ds)

    @test Zarr.is_zarray(rs,"bar")
    @test Zarr.subdirs(rs,"") == ["bar"]
    @test Zarr.subdirs(rs,"bar") == String[]
    #Test getindex on a chunk written through the backing store
    data = rand(UInt8,50)

    closer(rs)
    ds["bar/0.0.0"] = data
    rs = converter(ds)

    @test rs["bar/0.0.0"]==data
    @test Zarr.storagesize(rs,"bar")==50
    @test Zarr.isinitialized(rs,"bar/0.0.0")
    @test !Zarr.isinitialized(rs,"bar/0.0.1")

    closer(rs)
    Zarr.writeattrs(ds,"bar",Dict("a"=>"b"))
    rs = converter(ds)

    @test Zarr.getattrs(rs,"bar")==Dict("a"=>"b")

    #A chunk deleted from the backing store must no longer be visible
    closer(rs)
    delete!(ds,"bar/0.0.0")
    rs = converter(ds)

    @test !Zarr.isinitialized(rs,"bar",CartesianIndex((0,0,0)))
    @test !Zarr.isinitialized(rs,"bar/0.0.0")

    closer(rs)
    ds["bar/0.0.0"] = data
    rs = converter(ds)

    #Add tests for empty storage
    @test Zarr.isemptysub(rs,"ba")
    @test Zarr.isemptysub(rs,"ba/")
    @test !Zarr.isemptysub(rs,"bar")
    @test !Zarr.isemptysub(rs,"bar/")
    closer(rs)
end
122+
52123
@testset "DirectoryStore" begin
53124
A = fill(1.0, 30, 20)
54125
chunks = (5,10)
@@ -145,6 +216,13 @@ end
145216
@test g2.attrs == Dict("groupatt"=>5)
146217
@test g2["a1"].attrs == Dict("arratt"=>2.5)
147218
@test g2["a1"][:,:] == reshape(1:200,10,20)
219+
220+
# The following test doesn't pass, but maybe should?
221+
# test_read_only_store_common() do ds
222+
# # This converts a DictStore to a read only ConsolidatedStore HTTPStore
223+
# @async HTTP.serve(ds,"",ip,port,server=server)
224+
# Zarr.ConsolidatedStore(Zarr.HTTPStore("http://$ip:$port"),"")
225+
# end
148226
close(server)
149227
#Test server that returns 403 instead of 404 for missing chunks
150228
server = Sockets.listen(0)
@@ -159,3 +237,26 @@ end
159237
@test all(==(-1),g3["a"][:,:])
160238
close(server)
161239
end
240+
241+
@testset "Zip Storage" begin
    # Build a small in-memory group holding one chunked array
    dictstore = Zarr.DictStore()
    group = zgroup(dictstore, attrs = Dict("groupatt"=>5))
    arr = zcreate(Int,group,"a1",10,20,chunks=(5,5),attrs=Dict("arratt"=>2.5))
    arr .= reshape(1:200,10,20)

    # Write the group to an in-memory zip archive and open it as a ZipStore
    buf = IOBuffer()
    Zarr.writezip(buf, group)
    zipstore = Zarr.ZipStore(take!(buf))
    @test sprint(show, zipstore) == "Read Only Zip Storage"

    # Attributes and data must match what was written
    reopened = zopen(zipstore)
    @test reopened.attrs == Dict("groupatt"=>5)
    @test reopened["a1"].attrs == Dict("arratt"=>2.5)
    @test reopened["a1"][:,:] == reshape(1:200,10,20)

    # Run the shared read only store checks against ZipStore
    test_read_only_store_common() do ds
        # This converts a DictStore to a read only ZipStore
        io = IOBuffer()
        Zarr.writezip(io, ds)
        Zarr.ZipStore(take!(io))
    end
end

0 commit comments

Comments
 (0)