From 0345a1cdef626f6aa42d59f965e49010011abdb4 Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Tue, 11 Mar 2025 11:45:14 -0400 Subject: [PATCH 01/39] Add dimension separator as a type parameter --- src/Storage/Storage.jl | 19 ++++++----- src/Storage/consolidated.jl | 7 +++-- src/Storage/dictstore.jl | 5 +-- src/Storage/directorystore.jl | 7 +++-- src/Storage/gcstore.jl | 7 +++-- src/Storage/http.jl | 5 +-- src/Storage/s3store.jl | 8 +++-- src/Storage/zipstore.jl | 8 ++--- src/ZArray.jl | 11 +++++-- src/metadata.jl | 59 ++++++++++++++++++++++++++++++----- 10 files changed, 99 insertions(+), 37 deletions(-) diff --git a/src/Storage/Storage.jl b/src/Storage/Storage.jl index 46c819a9..e23268c3 100644 --- a/src/Storage/Storage.jl +++ b/src/Storage/Storage.jl @@ -3,11 +3,13 @@ # and Dictionaries are supported """ - abstract type AbstractStore + abstract type AbstractStore{S} This the abstract supertype for all Zarr store implementations. Currently only regular files ([`DirectoryStore`](@ref)) and Dictionaries are supported. +S is the dimension separator + ## Interface All subtypes of `AbstractStore` must implement the following methods: @@ -24,7 +26,7 @@ They may optionally implement the following methods: - [`store_read_strategy(s::AbstractStore)`](@ref store_read_strategy): return the read strategy for the given store. See [`SequentialRead`](@ref) and [`ConcurrentRead`](@ref). """ -abstract type AbstractStore end +abstract type AbstractStore{S} end #Define the interface """ @@ -70,17 +72,18 @@ function subkeys end Deletes the given key from the store. """ -citostring(i::CartesianIndex) = join(reverse((i - oneunit(i)).I), '.') -citostring(::CartesianIndex{0}) = "0" +citostring(i::CartesianIndex, sep::Char='.') = join(reverse((i - oneunit(i)).I), sep) +citostring(::CartesianIndex{0}, _::Char) = "0" +citostring(i::CartesianIndex, s::AbstractStore{S}) where S = citostring(i, S) _concatpath(p,s) = isempty(p) ? 
s : rstrip(p,'/') * '/' * s -Base.getindex(s::AbstractStore, p, i::CartesianIndex) = s[p, citostring(i)] +Base.getindex(s::AbstractStore, p, i::CartesianIndex) = s[p, citostring(i, s)] Base.getindex(s::AbstractStore, p, i) = s[_concatpath(p,i)] -Base.delete!(s::AbstractStore, p, i::CartesianIndex) = delete!(s, p, citostring(i)) +Base.delete!(s::AbstractStore, p, i::CartesianIndex) = delete!(s, p, citostring(i, s)) Base.delete!(s::AbstractStore, p, i) = delete!(s, _concatpath(p,i)) Base.haskey(s::AbstractStore, k) = isinitialized(s,k) Base.setindex!(s::AbstractStore,v,p,i) = setindex!(s,v,_concatpath(p,i)) -Base.setindex!(s::AbstractStore,v,p,i::CartesianIndex) = s[p, citostring(i)]=v +Base.setindex!(s::AbstractStore,v,p,i::CartesianIndex) = s[p, citostring(i, s)]=v maybecopy(x) = copy(x) @@ -111,7 +114,7 @@ end is_zgroup(s::AbstractStore, p) = isinitialized(s,_concatpath(p,".zgroup")) is_zarray(s::AbstractStore, p) = isinitialized(s,_concatpath(p,".zarray")) -isinitialized(s::AbstractStore, p, i::CartesianIndex)=isinitialized(s,p,citostring(i)) +isinitialized(s::AbstractStore{S}, p, i::CartesianIndex) where S = isinitialized(s,p,citostring(i, S)) isinitialized(s::AbstractStore, p, i) = isinitialized(s,_concatpath(p,i)) isinitialized(s::AbstractStore, i) = s[i] !== nothing diff --git a/src/Storage/consolidated.jl b/src/Storage/consolidated.jl index 0b28f553..3ab49e8f 100644 --- a/src/Storage/consolidated.jl +++ b/src/Storage/consolidated.jl @@ -3,18 +3,19 @@ A store that wraps any other AbstractStore but has access to the consolidated me stored in the .zmetadata key. Whenever data attributes or metadata are accessed, the data will be read from the dictionary instead. 
""" -struct ConsolidatedStore{P} <: AbstractStore +struct ConsolidatedStore{S,P} <: AbstractStore{S} parent::P path::String cons::Dict{String,Any} end -function ConsolidatedStore(s::AbstractStore, p) +function ConsolidatedStore{S}(s::AbstractStore, p) where S d = s[p, ".zmetadata"] if d === nothing throw(ArgumentError("Could not find consolidated metadata for store $s")) end - ConsolidatedStore(s,p,JSON.parse(String(Zarr.maybecopy(d)))["metadata"]) + ConsolidatedStore{S, typeof(s)}(s,p,JSON.parse(String(Zarr.maybecopy(d)))["metadata"]) end +ConsolidatedStore(s::AbstractStore, p) = ConsolidateStore{'.'}(s, p) function Base.show(io::IO,d::ConsolidatedStore) b = IOBuffer() diff --git a/src/Storage/dictstore.jl b/src/Storage/dictstore.jl index 7815ed20..87f8af11 100644 --- a/src/Storage/dictstore.jl +++ b/src/Storage/dictstore.jl @@ -1,8 +1,9 @@ # Stores data in a simple dict in memory -struct DictStore <: AbstractStore +struct DictStore{S} <: AbstractStore{S} a::Dict{String,Vector{UInt8}} end -DictStore() = DictStore(Dict{String,Vector{UInt8}}()) +DictStore() = DictStore{'.'}(Dict{String,Vector{UInt8}}()) +DictStore{S}() where S = DictStore{S}(Dict{String,Vector{UInt8}}()) Base.show(io::IO,d::DictStore) = print(io,"Dictionary Storage") function _pdict(d::DictStore,p) diff --git a/src/Storage/directorystore.jl b/src/Storage/directorystore.jl index 6ded94fb..55b64e1e 100644 --- a/src/Storage/directorystore.jl +++ b/src/Storage/directorystore.jl @@ -9,12 +9,13 @@ function normalize_path(p::AbstractString) end # Stores files in a regular file system -struct DirectoryStore <: AbstractStore +struct DirectoryStore{S} <: AbstractStore{S} folder::String - function DirectoryStore(p) + function DirectoryStore{S}(p) where S mkpath(normalize_path(p)) - new(normalize_path(p)) + new{S}(normalize_path(p)) end + DirectoryStore(p) = DirectoryStore{'.'}(p) end function Base.getindex(d::DirectoryStore, i::String) diff --git a/src/Storage/gcstore.jl b/src/Storage/gcstore.jl index 
5f85820d..8e24cfe6 100644 --- a/src/Storage/gcstore.jl +++ b/src/Storage/gcstore.jl @@ -56,10 +56,10 @@ function _gcs_request_headers() return headers end -struct GCStore <: AbstractStore +struct GCStore{S} <: AbstractStore{S} bucket::String - function GCStore(url::String) + function GCStore{S}(url::String) where S uri = URI(url) if uri.scheme == "gs" @@ -71,6 +71,7 @@ struct GCStore <: AbstractStore @debug "GCS bucket: $bucket" new(bucket) end + GCStore(url::String) = GCStore{'.'}(url) end @@ -147,4 +148,4 @@ function storefromstring(::Type{<:GCStore}, url,_) return GCStore(url),p end -store_read_strategy(::GCStore) = ConcurrentRead(concurrent_io_tasks[]) \ No newline at end of file +store_read_strategy(::GCStore) = ConcurrentRead(concurrent_io_tasks[]) diff --git a/src/Storage/http.jl b/src/Storage/http.jl index 9b68cb14..223d4d4f 100644 --- a/src/Storage/http.jl +++ b/src/Storage/http.jl @@ -10,11 +10,12 @@ datasets being served through the [xpublish](https://xpublish.readthedocs.io/en/ python package. In case you experience performance issues, one can try to use `HTTP.set_default_connection_limit!` to increase the number of concurrent connections. 
""" -struct HTTPStore <: AbstractStore +struct HTTPStore{S} <: AbstractStore{S} url::String allowed_codes::Set{Int} + HTTPStore{S}(url, allowed_codes = Set((404,))) where S = new{S}(url, allowed_codes) end -HTTPStore(url) = HTTPStore(url,Set((404,))) +HTTPStore(url) = HTTPStore{'.'}(url) function Base.getindex(s::HTTPStore, k::String) r = HTTP.request("GET",string(s.url,"/",k),status_exception = false,socket_type_tls=OpenSSL.SSLStream) diff --git a/src/Storage/s3store.jl b/src/Storage/s3store.jl index aaab004f..cfc2e627 100644 --- a/src/Storage/s3store.jl +++ b/src/Storage/s3store.jl @@ -1,19 +1,21 @@ using AWSS3: AWSS3, s3_put, s3_get, s3_delete, s3_list_objects, s3_exists -struct S3Store <: AbstractStore +struct S3Store{S} <: AbstractStore{S} bucket::String aws::AWSS3.AWS.AbstractAWSConfig end -function S3Store(bucket::String; +function S3Store{S}(bucket::String; aws = nothing, - ) + ) where S if aws === nothing aws = AWSS3.AWS.global_aws_config() end S3Store(bucket, aws) end +S3Store(bucket, aws) = S3Store{'.'}(bucket, aws) +S3Store(bucket; aws = nothing) = S3Store{'.'}(bucket, aws) Base.show(io::IO,::S3Store) = print(io,"S3 Object Storage") diff --git a/src/Storage/zipstore.jl b/src/Storage/zipstore.jl index 8e8bbd27..9fd3ca25 100644 --- a/src/Storage/zipstore.jl +++ b/src/Storage/zipstore.jl @@ -5,12 +5,12 @@ import ZipArchives A read only store that wraps an `AbstractVector{UInt8}` that contains a zip file. 
""" -struct ZipStore{T <: AbstractVector{UInt8}} <: AbstractStore +struct ZipStore{S, T <: AbstractVector{UInt8}} <: AbstractStore{S} r::ZipArchives.ZipBufferReader{T} + ZipStore{S}(data::AbstractVector{UInt8}) where S = new{S, ZipArchives.ZipBufferReader}(ZipArchives.ZipBufferReader(data)) end - -ZipStore(data::AbstractVector{UInt8}) = ZipStore(ZipArchives.ZipBufferReader(data)) +ZipStore(data::AbstractVector{UInt8}) = ZipStore{'.'}(ZipArchives.ZipBufferReader(data)) Base.show(io::IO,::ZipStore) = print(io,"Read Only Zip Storage") @@ -94,4 +94,4 @@ function _writezip(w::ZipArchives.ZipWriter, s::AbstractStore, p::String) for subdir in subdirs(s, p) _writezip(w, s, _make_prefix(p)*subdir) end -end \ No newline at end of file +end diff --git a/src/ZArray.jl b/src/ZArray.jl index b0955687..de5a8ecd 100644 --- a/src/ZArray.jl +++ b/src/ZArray.jl @@ -311,6 +311,7 @@ Creates a new empty zarr array with element type `T` and array dimensions `dims` * `attrs=Dict()` a dict containing key-value pairs with metadata attributes associated to the array * `writeable=true` determines if the array is opened in read-only or write mode * `indent_json=false` determines if indents are added to format the json files `.zarray` and `.zattrs`. This makes them more readable, but increases file size. +* `dimension_separator='.'` sets how chunks are encoded. The Zarr v2 default is '.' such that the first 3D chunk would be `0.0.0`. The Zarr v3 default is `/`. """ function zcreate(::Type{T}, dims::Integer...; name="", @@ -335,14 +336,20 @@ function zcreate(::Type{T},storage::AbstractStore, filters = filterfromtype(T), attrs=Dict(), writeable=true, - indent_json=false + indent_json=false, + dimension_separator='.' 
) where T + + if dimension_separator isa AbstractString + # Convert AbstractString to Char + dimension_separator = only(dimension_separator) + end length(dims) == length(chunks) || throw(DimensionMismatch("Dims must have the same length as chunks")) N = length(dims) C = typeof(compressor) T2 = (fill_value === nothing || !fill_as_missing) ? T : Union{T,Missing} - metadata = Metadata{T2, N, C, typeof(filters)}( + metadata = Metadata{T2, N, C, typeof(filters), dimension_separator}( 2, dims, chunks, diff --git a/src/metadata.jl b/src/metadata.jl index d80e7c13..bfadfb03 100644 --- a/src/metadata.jl +++ b/src/metadata.jl @@ -91,9 +91,18 @@ Each array requires essential configuration metadata to be stored, enabling corr interpretation of the stored data. This metadata is encoded using JSON and stored as the value of the “.zarray” key within an array store. +# Type Parameters +* T - element type of the array +* N - dimensionality of the array +* C - compressor +* F - filters +* S - dimension separator + +# See Also + https://zarr.readthedocs.io/en/stable/spec/v2.html#metadata """ -struct Metadata{T, N, C, F} +struct Metadata{T, N, C, F, S} zarr_format::Int shape::Base.RefValue{NTuple{N, Int}} chunks::NTuple{N, Int} @@ -102,15 +111,46 @@ struct Metadata{T, N, C, F} fill_value::Union{T, Nothing} order::Char filters::F # not yet supported - function Metadata{T2, N, C, F}(zarr_format, shape, chunks, dtype, compressor,fill_value, order, filters) where {T2,N,C,F} + function Metadata{T2, N, C, F, S}(zarr_format, shape, chunks, dtype, compressor, fill_value, order, filters) where {T2,N,C,F,S} #We currently only support version zarr_format == 2 || throw(ArgumentError("Zarr.jl currently only support v2 of the protocol")) #Do some sanity checks to make sure we have a sane array any(<(0), shape) && throw(ArgumentError("Size must be positive")) any(<(1), chunks) && throw(ArgumentError("Chunk size must be >= 1 along each dimension")) order === 'C' || throw(ArgumentError("Currently only 
'C' storage order is supported"))
-        new{T2, N, C, F}(zarr_format, Base.RefValue{NTuple{N,Int}}(shape), chunks, dtype, compressor,fill_value, order, filters)
+        new{T2, N, C, F, S}(zarr_format, Base.RefValue{NTuple{N,Int}}(shape), chunks, dtype, compressor,fill_value, order, filters)
     end
+    function Metadata{T2, N, C, F}(
+        zarr_format,
+        shape,
+        chunks,
+        dtype,
+        compressor,
+        fill_value,
+        order,
+        filters,
+        dimension_separator::Char = '.'
+    ) where {T2,N,C,F}
+        return Metadata{T2, N, C, F, dimension_separator}(
+            zarr_format,
+            shape,
+            chunks,
+            dtype,
+            compressor,
+            fill_value,
+            order, filters
+        )
+    end
+
+end
+
+const DimensionSeparatedMetadata{S} = Metadata{<: Any, <: Any, <: Any, <: Any, S}
+
+function Base.getproperty(m::DimensionSeparatedMetadata{S}, name::Symbol) where S
+    if name == :dimension_separator
+        return S
+    end
+    return getfield(m, name)
 end

 #To make unit tests pass with ref shape
@@ -123,7 +163,8 @@ function ==(m1::Metadata, m2::Metadata)
         m1.compressor == m2.compressor &&
         m1.fill_value == m2.fill_value &&
         m1.order == m2.order &&
-        m1.filters == m2.filters
+        m1.filters == m2.filters &&
+        m1.dimension_separator == m2.dimension_separator
 end


@@ -135,9 +176,10 @@ function Metadata(A::AbstractArray{T, N}, chunks::NTuple{N, Int};
     order::Char='C',
     filters::Nothing=nothing,
     fill_as_missing = false,
+    dimension_separator::Char = '.'
     ) where {T, N, C}
   T2 = (fill_value === nothing || !fill_as_missing) ? T : Union{T,Missing}
-  Metadata{T2, N, C, typeof(filters)}(
+  Metadata{T2, N, C, typeof(filters), dimension_separator}(
     zarr_format,
     size(A),
     chunks,
@@ -175,7 +217,9 @@ function Metadata(d::AbstractDict, fill_as_missing)

     TU = (fv === nothing || !fill_as_missing) ?
T : Union{T,Missing} - Metadata{TU, N, C, F}( + S = only(get(d, "dimension_separator", '.')) + + Metadata{TU, N, C, F, S}( d["zarr_format"], NTuple{N, Int}(d["shape"]) |> reverse, NTuple{N, Int}(d["chunks"]) |> reverse, @@ -197,7 +241,8 @@ function JSON.lower(md::Metadata) "compressor" => md.compressor, "fill_value" => fill_value_encoding(md.fill_value), "order" => md.order, - "filters" => md.filters + "filters" => md.filters, + "dimension_separator" => md.dimension_separator ) end From 61786e3661c9c3d017fe0748e72f3e651e0566d3 Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Wed, 12 Mar 2025 17:24:42 -0400 Subject: [PATCH 02/39] Fix ZipStore constructor --- src/Storage/zipstore.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storage/zipstore.jl b/src/Storage/zipstore.jl index 9fd3ca25..f8a68f7a 100644 --- a/src/Storage/zipstore.jl +++ b/src/Storage/zipstore.jl @@ -8,9 +8,9 @@ A read only store that wraps an `AbstractVector{UInt8}` that contains a zip file struct ZipStore{S, T <: AbstractVector{UInt8}} <: AbstractStore{S} r::ZipArchives.ZipBufferReader{T} ZipStore{S}(data::AbstractVector{UInt8}) where S = new{S, ZipArchives.ZipBufferReader}(ZipArchives.ZipBufferReader(data)) + ZipStore(data::AbstractVector{UInt8}) = ZipStore{'.'}(data) end -ZipStore(data::AbstractVector{UInt8}) = ZipStore{'.'}(ZipArchives.ZipBufferReader(data)) Base.show(io::IO,::ZipStore) = print(io,"Read Only Zip Storage") From cbb23cebaed913929e8a343b8fbabf64d6ca02c1 Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Wed, 12 Mar 2025 17:25:48 -0400 Subject: [PATCH 03/39] Fix ConsolidatedStore --- src/Storage/consolidated.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storage/consolidated.jl b/src/Storage/consolidated.jl index 3ab49e8f..fa004033 100644 --- a/src/Storage/consolidated.jl +++ b/src/Storage/consolidated.jl @@ -15,7 +15,7 @@ function ConsolidatedStore{S}(s::AbstractStore, p) where S end ConsolidatedStore{S, 
typeof(s)}(s,p,JSON.parse(String(Zarr.maybecopy(d)))["metadata"]) end -ConsolidatedStore(s::AbstractStore, p) = ConsolidateStore{'.'}(s, p) +ConsolidatedStore(s::AbstractStore, p) = ConsolidatedStore{'.'}(s, p) function Base.show(io::IO,d::ConsolidatedStore) b = IOBuffer() From e4630a9899914d8ad5353f4301333edc026cf69f Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Wed, 12 Mar 2025 17:27:38 -0400 Subject: [PATCH 04/39] Fix S3Store constructor --- src/Storage/s3store.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storage/s3store.jl b/src/Storage/s3store.jl index cfc2e627..f6afd1c2 100644 --- a/src/Storage/s3store.jl +++ b/src/Storage/s3store.jl @@ -15,7 +15,7 @@ function S3Store{S}(bucket::String; S3Store(bucket, aws) end S3Store(bucket, aws) = S3Store{'.'}(bucket, aws) -S3Store(bucket; aws = nothing) = S3Store{'.'}(bucket, aws) +S3Store(bucket; aws = nothing) = S3Store{'.'}(bucket; aws) Base.show(io::IO,::S3Store) = print(io,"S3 Object Storage") From b9e175fb5915b922d622ae787bf7167ee81d1829 Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Thu, 13 Mar 2025 03:08:39 -0400 Subject: [PATCH 05/39] Add version as a type parameter --- src/Storage/Storage.jl | 27 ++++++++++++++++++----- src/Storage/consolidated.jl | 13 +++++++---- src/Storage/dictstore.jl | 7 +++--- src/Storage/directorystore.jl | 9 ++++---- src/Storage/gcstore.jl | 7 +++--- src/Storage/http.jl | 7 +++--- src/Storage/s3store.jl | 14 +++++++----- src/Storage/zipstore.jl | 7 +++--- test/runtests.jl | 12 +++++----- test/storage.jl | 41 ++++++++++++++++++++++++++++------- 10 files changed, 99 insertions(+), 45 deletions(-) diff --git a/src/Storage/Storage.jl b/src/Storage/Storage.jl index e23268c3..9a056b57 100644 --- a/src/Storage/Storage.jl +++ b/src/Storage/Storage.jl @@ -2,12 +2,27 @@ # Defines different storages for zarr arrays. 
Currently only regular files (DirectoryStore) # and Dictionaries are supported +# Default Zarr version +const DV = 2 + +# Default Zarr separator + +# Default Zarr v2 separator +const DS2 = '.' +# Default Zarr v3 separator +const DS3 = '/' + +default_sep(version) = version == 2 ? DS2 : DS3 +const DS = default_sep(DV) + """ - abstract type AbstractStore{S} + abstract type AbstractStore{V,S} This the abstract supertype for all Zarr store implementations. Currently only regular files ([`DirectoryStore`](@ref)) and Dictionaries are supported. +# Type Parameters +V is the version, either 2 or 3 S is the dimension separator ## Interface @@ -26,7 +41,7 @@ They may optionally implement the following methods: - [`store_read_strategy(s::AbstractStore)`](@ref store_read_strategy): return the read strategy for the given store. See [`SequentialRead`](@ref) and [`ConcurrentRead`](@ref). """ -abstract type AbstractStore{S} end +abstract type AbstractStore{V,S} end #Define the interface """ @@ -72,9 +87,9 @@ function subkeys end Deletes the given key from the store. """ -citostring(i::CartesianIndex, sep::Char='.') = join(reverse((i - oneunit(i)).I), sep) -citostring(::CartesianIndex{0}, _::Char) = "0" -citostring(i::CartesianIndex, s::AbstractStore{S}) where S = citostring(i, S) +@inline citostring(i::CartesianIndex, version::Int=DV, sep::Char=default_sep(version)) = (version == 3 ? "c$sep" : "" ) * join(reverse((i - oneunit(i)).I), sep) +@inline citostring(::CartesianIndex{0}, version::Int=DV, sep::Char=default_sep(version)) = (version == 3 ? "c$(sep)0" : "0" ) +citostring(i::CartesianIndex, s::AbstractStore{V, S}) where {V,S} = citostring(i, V, S) _concatpath(p,s) = isempty(p) ? 
s : rstrip(p,'/') * '/' * s Base.getindex(s::AbstractStore, p, i::CartesianIndex) = s[p, citostring(i, s)] @@ -114,7 +129,7 @@ end is_zgroup(s::AbstractStore, p) = isinitialized(s,_concatpath(p,".zgroup")) is_zarray(s::AbstractStore, p) = isinitialized(s,_concatpath(p,".zarray")) -isinitialized(s::AbstractStore{S}, p, i::CartesianIndex) where S = isinitialized(s,p,citostring(i, S)) +isinitialized(s::AbstractStore, p, i::CartesianIndex) = isinitialized(s,p,citostring(i, s)) isinitialized(s::AbstractStore, p, i) = isinitialized(s,_concatpath(p,i)) isinitialized(s::AbstractStore, i) = s[i] !== nothing diff --git a/src/Storage/consolidated.jl b/src/Storage/consolidated.jl index fa004033..2429622c 100644 --- a/src/Storage/consolidated.jl +++ b/src/Storage/consolidated.jl @@ -3,19 +3,24 @@ A store that wraps any other AbstractStore but has access to the consolidated me stored in the .zmetadata key. Whenever data attributes or metadata are accessed, the data will be read from the dictionary instead. 
""" -struct ConsolidatedStore{S,P} <: AbstractStore{S} +struct ConsolidatedStore{V,S,P} <: AbstractStore{V,S} parent::P path::String cons::Dict{String,Any} end -function ConsolidatedStore{S}(s::AbstractStore, p) where S +function ConsolidatedStore{V,S}(s::AbstractStore, p) where {V,S} d = s[p, ".zmetadata"] if d === nothing throw(ArgumentError("Could not find consolidated metadata for store $s")) end - ConsolidatedStore{S, typeof(s)}(s,p,JSON.parse(String(Zarr.maybecopy(d)))["metadata"]) + ConsolidatedStore{V, S, typeof(s)}(s,p,JSON.parse(String(Zarr.maybecopy(d)))["metadata"]) end -ConsolidatedStore(s::AbstractStore, p) = ConsolidatedStore{'.'}(s, p) +ConsolidatedStore{V}(s::AbstractStore, p) where V = ConsolidatedStore{V, default_sep(V)}(s, p) +ConsolidatedStore(s::AbstractStore, p) = ConsolidatedStore{DV,DS}(s, p) + +ConsolidatedStore(s::AbstractStore, p, d) = ConsolidatedStore{DV, DS}(s,p,d) +ConsolidatedStore{V}(s::AbstractStore, p, d) where V = ConsolidatedStore{V, default_sep(V)}(s,p,d) +ConsolidatedStore{V,S}(s::AbstractStore, p, d) where {V,S} = ConsolidatedStore{V, default_sep(V), typeof(s)}(s,p,d) function Base.show(io::IO,d::ConsolidatedStore) b = IOBuffer() diff --git a/src/Storage/dictstore.jl b/src/Storage/dictstore.jl index 87f8af11..f6598971 100644 --- a/src/Storage/dictstore.jl +++ b/src/Storage/dictstore.jl @@ -1,9 +1,10 @@ # Stores data in a simple dict in memory -struct DictStore{S} <: AbstractStore{S} +struct DictStore{V,S} <: AbstractStore{V,S} a::Dict{String,Vector{UInt8}} + DictStore{V,S}(a=Dict{String,Vector{UInt8}}()) where {V,S} = new{V,S}(a) end -DictStore() = DictStore{'.'}(Dict{String,Vector{UInt8}}()) -DictStore{S}() where S = DictStore{S}(Dict{String,Vector{UInt8}}()) +DictStore(a=Dict{String,Vector{UInt8}}()) = DictStore{DV,DS}(a) +DictStore{V}(a=Dict{String,Vector{UInt8}}()) where V = DictStore{V, default_sep(V)}(a) Base.show(io::IO,d::DictStore) = print(io,"Dictionary Storage") function _pdict(d::DictStore,p) diff --git 
a/src/Storage/directorystore.jl b/src/Storage/directorystore.jl index 55b64e1e..0b7f6434 100644 --- a/src/Storage/directorystore.jl +++ b/src/Storage/directorystore.jl @@ -9,13 +9,14 @@ function normalize_path(p::AbstractString) end # Stores files in a regular file system -struct DirectoryStore{S} <: AbstractStore{S} +struct DirectoryStore{V,S} <: AbstractStore{V,S} folder::String - function DirectoryStore{S}(p) where S + function DirectoryStore{V,S}(p) where {V,S} mkpath(normalize_path(p)) - new{S}(normalize_path(p)) + new{V,S}(normalize_path(p)) end - DirectoryStore(p) = DirectoryStore{'.'}(p) + DirectoryStore(p) = DirectoryStore{DV,DS}(p) + DirectoryStore{V}(p) where V = DirectoryStore{V, default_sep(V)}(p) end function Base.getindex(d::DirectoryStore, i::String) diff --git a/src/Storage/gcstore.jl b/src/Storage/gcstore.jl index 8e24cfe6..84dd6f6f 100644 --- a/src/Storage/gcstore.jl +++ b/src/Storage/gcstore.jl @@ -56,10 +56,10 @@ function _gcs_request_headers() return headers end -struct GCStore{S} <: AbstractStore{S} +struct GCStore{V,S} <: AbstractStore{V,S} bucket::String - function GCStore{S}(url::String) where S + function GCStore{V,S}(url::String) where {V,S} uri = URI(url) if uri.scheme == "gs" @@ -71,7 +71,8 @@ struct GCStore{S} <: AbstractStore{S} @debug "GCS bucket: $bucket" new(bucket) end - GCStore(url::String) = GCStore{'.'}(url) + GCStore(url::String) = GCStore{DV,DS}(url) + GCStore{V}(url::String) where V = GCStore{V, default_sep(V)}(url) end diff --git a/src/Storage/http.jl b/src/Storage/http.jl index 223d4d4f..f335e0f7 100644 --- a/src/Storage/http.jl +++ b/src/Storage/http.jl @@ -10,12 +10,13 @@ datasets being served through the [xpublish](https://xpublish.readthedocs.io/en/ python package. In case you experience performance issues, one can try to use `HTTP.set_default_connection_limit!` to increase the number of concurrent connections. 
""" -struct HTTPStore{S} <: AbstractStore{S} +struct HTTPStore{V,S} <: AbstractStore{V,S} url::String allowed_codes::Set{Int} - HTTPStore{S}(url, allowed_codes = Set((404,))) where S = new{S}(url, allowed_codes) + HTTPStore{V,S}(url, allowed_codes = Set((404,))) where {V,S} = new{V,S}(url, allowed_codes) end -HTTPStore(url) = HTTPStore{'.'}(url) +HTTPStore(url) = HTTPStore{DV, DS}(url) +HTTPStore{V}(url) where V = HTTPStore{V, default_sep(V)}(url) function Base.getindex(s::HTTPStore, k::String) r = HTTP.request("GET",string(s.url,"/",k),status_exception = false,socket_type_tls=OpenSSL.SSLStream) diff --git a/src/Storage/s3store.jl b/src/Storage/s3store.jl index f6afd1c2..f3d0b1fa 100644 --- a/src/Storage/s3store.jl +++ b/src/Storage/s3store.jl @@ -1,21 +1,23 @@ using AWSS3: AWSS3, s3_put, s3_get, s3_delete, s3_list_objects, s3_exists -struct S3Store{S} <: AbstractStore{S} +struct S3Store{V,S} <: AbstractStore{V,S} bucket::String aws::AWSS3.AWS.AbstractAWSConfig end -function S3Store{S}(bucket::String; +function S3Store{V,S}(bucket::String; aws = nothing, - ) where S + ) where {V,S} if aws === nothing aws = AWSS3.AWS.global_aws_config() end - S3Store(bucket, aws) + S3Store{V,S}(bucket, aws) end -S3Store(bucket, aws) = S3Store{'.'}(bucket, aws) -S3Store(bucket; aws = nothing) = S3Store{'.'}(bucket; aws) +S3Store(bucket, aws) = S3Store{DV,DS}(bucket, aws) +S3Store{V}(bucket, aws) where V = S3Store{V, default_sep(V)}(bucket, aws) +S3Store(bucket; aws = nothing) = S3Store{DV, DS}(bucket; aws) +S3Store{V}(bucket; aws = nothing) where V = S3Store{V, default_sep(V)}(bucket; aws) Base.show(io::IO,::S3Store) = print(io,"S3 Object Storage") diff --git a/src/Storage/zipstore.jl b/src/Storage/zipstore.jl index f8a68f7a..8fb5aca0 100644 --- a/src/Storage/zipstore.jl +++ b/src/Storage/zipstore.jl @@ -5,10 +5,11 @@ import ZipArchives A read only store that wraps an `AbstractVector{UInt8}` that contains a zip file. 
""" -struct ZipStore{S, T <: AbstractVector{UInt8}} <: AbstractStore{S} +struct ZipStore{V, S, T <: AbstractVector{UInt8}} <: AbstractStore{V, S} r::ZipArchives.ZipBufferReader{T} - ZipStore{S}(data::AbstractVector{UInt8}) where S = new{S, ZipArchives.ZipBufferReader}(ZipArchives.ZipBufferReader(data)) - ZipStore(data::AbstractVector{UInt8}) = ZipStore{'.'}(data) + ZipStore{V,S}(data::T) where {V,S,T} = new{V, S, T}(ZipArchives.ZipBufferReader(data)) + ZipStore{V}(data::AbstractVector{UInt8}) where V = ZipStore{V, default_sep(V)}(data) + ZipStore(data::AbstractVector{UInt8}) = ZipStore{DV,DS}(data) end diff --git a/test/runtests.jl b/test/runtests.jl index c472eb1f..34790c9b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -15,7 +15,7 @@ CondaPkg.add("zarr"; version="2.*") @testset "fields" begin z = zzeros(Int64, 2, 3) @test z isa ZArray{Int64, 2, Zarr.BloscCompressor, - Zarr.DictStore} + Zarr.DictStore{2, '.'}} @test length(z.storage.a) === 3 @test length(z.storage.a["0.0"]) === 64 @@ -40,7 +40,7 @@ CondaPkg.add("zarr"; version="2.*") @testset "methods" begin z = zzeros(Int64, 2, 3) @test z isa ZArray{Int64, 2, Zarr.BloscCompressor, - Zarr.DictStore} + Zarr.DictStore{2, '.'}} @test eltype(z) === Int64 @test ndims(z) === 2 @@ -69,12 +69,14 @@ CondaPkg.add("zarr"; version="2.*") @test JSON.parsefile("$dir/$name/.zarray") == Dict{String, Any}( "dtype" => " nothing, - "shape" => [3, 2], + "shape" => Any[3, 2], "order" => "C", "zarr_format" => 2, - "chunks" => [3, 2], + "chunks" => Any[3, 2], "fill_value" => nothing, - "compressor" => nothing) + "compressor" => nothing, + "dimension_separator" => "." 
+ ) # call gc to avoid unlink: operation not permitted (EPERM) on Windows # might be because files are left open # from https://github.com/JuliaLang/julia/blob/f6344d32d3ebb307e2b54a77e042559f42d2ebf6/stdlib/SharedArrays/test/runtests.jl#L146 diff --git a/test/storage.jl b/test/storage.jl index 9e4fac73..1a73ad10 100644 --- a/test/storage.jl +++ b/test/storage.jl @@ -8,10 +8,31 @@ @test Zarr.normalize_path("/path/to/a") == "/path/to/a" end +@testset "Version and Dimension Separator" begin + let ci = CartesianIndex() + @test Zarr.citostring(ci, 2, '.') == "0" + @test Zarr.citostring(ci, 2, '/') == "0" + @test Zarr.citostring(ci, 3, '.') == "c.0" + @test Zarr.citostring(ci, 3, '/') == "c/0" + end + let ci = CartesianIndex(1,1,1) + @test Zarr.citostring(ci, 2, '.') == "0.0.0" + @test Zarr.citostring(ci, 2, '/') == "0/0/0" + @test Zarr.citostring(ci, 3, '.') == "c.0.0.0" + @test Zarr.citostring(ci, 3, '/') == "c/0/0/0" + end + let ci = CartesianIndex(1,3,5) + @test Zarr.citostring(ci, 2, '.') == "4.2.0" + @test Zarr.citostring(ci, 2, '/') == "4/2/0" + @test Zarr.citostring(ci, 3, '.') == "c.4.2.0" + @test Zarr.citostring(ci, 3, '/') == "c/4/2/0" + end +end + """ Function to test the interface of AbstractStore. Every complete implementation should pass this test. 
""" -function test_store_common(ds) +function test_store_common(ds::Zarr.AbstractStore{V,S}) where {V,S} @test !Zarr.is_zgroup(ds,"") ds[".zgroup"]=rand(UInt8,50) @test haskey(ds,".zgroup") @@ -31,17 +52,21 @@ function test_store_common(ds) @test Zarr.subdirs(ds,"bar") == String[] #Test getindex and setindex data = rand(UInt8,50) - ds["bar/0.0.0"] = data + first_ci_str = Zarr.citostring(CartesianIndex(1,1,1), V, S) + second_ci_str = Zarr.citostring(CartesianIndex(2,1,1), V, S) + ds["bar/" * first_ci_str] = data @test ds["bar/0.0.0"]==data @test Zarr.storagesize(ds,"bar")==50 - @test Zarr.isinitialized(ds,"bar/0.0.0") - @test !Zarr.isinitialized(ds,"bar/0.0.1") + @test Zarr.isinitialized(ds,"bar/" * first_ci_str) + @test !Zarr.isinitialized(ds,"bar/" * second_ci_str) Zarr.writeattrs(ds,"bar",Dict("a"=>"b")) @test Zarr.getattrs(ds,"bar")==Dict("a"=>"b") - delete!(ds,"bar/0.0.0") - @test !Zarr.isinitialized(ds,"bar",CartesianIndex((0,0,0))) - @test !Zarr.isinitialized(ds,"bar/0.0.0") - ds["bar/0.0.0"] = data + delete!(ds,"bar/" * first_ci_str) + @test !Zarr.isinitialized(ds,"bar",CartesianIndex((1,1,1))) + @test !Zarr.isinitialized(ds,"bar/" * first_ci_str) + ds["bar/" * first_ci_str] = data + @test !Zarr.isinitialized(ds, "bar", CartesianIndex(0,0,0)) + @test Zarr.isinitialized(ds, "bar", CartesianIndex(1,1,1)) #Add tests for empty storage @test Zarr.isemptysub(ds,"ba") @test Zarr.isemptysub(ds,"ba/") From 362437677d10d09e73e11f7b4f5248159b149d8c Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Mon, 17 Mar 2025 23:21:34 -0400 Subject: [PATCH 06/39] Check metadata for dimension_separator and zarr_format --- src/Storage/Storage.jl | 61 ++++++++++++++++++++++++++++++++++++++++++ src/Storage/gcstore.jl | 1 + src/Storage/http.jl | 13 ++++++++- src/Storage/s3store.jl | 1 + src/ZArray.jl | 27 ++++++++++++------- src/ZGroup.jl | 28 ++++++++++++++----- 6 files changed, 115 insertions(+), 16 deletions(-) diff --git a/src/Storage/Storage.jl b/src/Storage/Storage.jl index 
9a056b57..de40d32e 100644
--- a/src/Storage/Storage.jl
+++ b/src/Storage/Storage.jl
@@ -215,6 +215,19 @@ isemptysub(s::AbstractStore, p) = isempty(subkeys(s,p)) && isempty(subdirs(s,p))
 #during auto-check of storage format when doing zopen
 storageregexlist = Pair[]

+function Base.getproperty(store::AbstractStore{V,S}, sym::Symbol) where {V,S}
+    if sym == :dimension_separator
+        return S
+    elseif sym == :zarr_format
+        return V
+    else
+        return getfield(store, sym)
+    end
+end
+function Base.propertynames(store::AbstractStore)
+    return (:dimension_separator, :zarr_format, fieldnames(typeof(store))...)
+end
+
 include("directorystore.jl")
 include("dictstore.jl")
 include("s3store.jl")
@@ -222,3 +235,51 @@ include("gcstore.jl")
 include("consolidated.jl")
 include("http.jl")
 include("zipstore.jl")
+
+# Itemize subtypes of AbstractStore for code generation below
+const KnownAbstractStores = (DirectoryStore, GCStore, S3Store, ConsolidatedStore, DictStore, HTTPStore, ZipStore)
+
+"""
+    Zarr.set_dimension_separator(::AbstractStore{V}, sep::Char)::AbstractStore{V,sep}
+
+Returns an AbstractStore of the same type with the same `zarr_format` parameter, `V`,
+but with a dimension separator of `sep`.
+
+# Examples
+
+```
+julia> Zarr.set_dimension_separator(Zarr.DictStore{2, '.'}(), '/') |> typeof
+Zarr.DictStore{2, '/'}
+```
+
+"""
+set_dimension_separator
+
+"""
+    set_zarr_format(::AbstractStore{<: Any, S}, zarr_format::Int)::AbstractStore{zarr_format,S}
+
+Returns an AbstractStore of the same type with the same `dimension_separator` parameter, `S`,
+but with the specified `zarr_format` parameter.
+
+# Examples
+
+```
+julia> Zarr.set_zarr_format(Zarr.DictStore{2, '.'}(), 3) |> typeof
+Zarr.DictStore{3, '.'}
+```
+
+"""
+set_zarr_format
+
+for T in KnownAbstractStores
+    e = quote
+        # copy constructor to change zarr_format and dimension_separator parameters
+        (::Type{$T{V,S}})(store::$T) where {V,S} =
+            $T{V,S}(ntuple(i->getfield(store, i), nfields(store))...)
+ set_dimension_separator(store::$T{V}, sep::Char) where V = + $T{V,sep}(ntuple(i->getfield(store, i), nfields(store))...) + set_zarr_format(store::$T{<: Any, S}, zarr_format::Int) where S = + $T{zarr_format,S}(ntuple(i->getfield(store, i), nfields(store))...) + end + eval(e) +end diff --git a/src/Storage/gcstore.jl b/src/Storage/gcstore.jl index 84dd6f6f..5f0860a2 100644 --- a/src/Storage/gcstore.jl +++ b/src/Storage/gcstore.jl @@ -137,6 +137,7 @@ pushfirst!(storageregexlist,r"^http://storage.googleapis.com"=>GCStore) push!(storageregexlist,r"^gs://"=>GCStore) function storefromstring(::Type{<:GCStore}, url,_) + # TODO: Check metadata for version and dimension separator uri = URI(url) if uri.scheme == "gs" p = lstrip(uri.path,'/') diff --git a/src/Storage/http.jl b/src/Storage/http.jl index f335e0f7..e213642a 100644 --- a/src/Storage/http.jl +++ b/src/Storage/http.jl @@ -41,7 +41,18 @@ end push!(storageregexlist,r"^https://"=>HTTPStore) push!(storageregexlist,r"^http://"=>HTTPStore) -storefromstring(::Type{<:HTTPStore}, s,_) = ConsolidatedStore(HTTPStore(s),""),"" +function storefromstring(::Type{<:HTTPStore}, s,_) + http_store = HTTPStore(s) + if is_zarray(http_store, "") + meta = getmetadata(http_store, "", false) + http_store = HTTPStore{meta.zarr_format, meta.dimension_separator}(s) + end + if http_store["", ".zmetadata"] !== nothing + return ConsolidatedStore(http_store,""),"" + else + return http_store,"" + end +end """ missing_chunk_return_code!(s::HTTPStore, code::Union{Int,AbstractVector{Int}}) diff --git a/src/Storage/s3store.jl b/src/Storage/s3store.jl index f3d0b1fa..03681fa3 100644 --- a/src/Storage/s3store.jl +++ b/src/Storage/s3store.jl @@ -78,6 +78,7 @@ allstrings(v,prefixkey) = [rstrip(String(v[prefixkey]),'/')] push!(storageregexlist,r"^s3://"=>S3Store) function storefromstring(::Type{<:S3Store}, s, _) + # TODO: Check metadata for version and dimension separator decomp = split(s,"/",keepempty=false) bucket = decomp[2] path = 
join(decomp[3:end],"/") diff --git a/src/ZArray.jl b/src/ZArray.jl index de5a8ecd..0aa472b4 100644 --- a/src/ZArray.jl +++ b/src/ZArray.jl @@ -316,17 +316,24 @@ Creates a new empty zarr array with element type `T` and array dimensions `dims` function zcreate(::Type{T}, dims::Integer...; name="", path=nothing, + dimension_separator='.', kwargs... ) where T + + if dimension_separator isa AbstractString + # Convert AbstractString to Char + dimension_separator = only(dimension_separator) + end + if path===nothing - store = DictStore() + store = DictStore{DV, dimension_separator}() else - store = DirectoryStore(joinpath(path,name)) + store = DirectoryStore{DV, dimension_separator}(joinpath(path,name)) end zcreate(T, store, dims...; kwargs...) end -function zcreate(::Type{T},storage::AbstractStore, +function zcreate(::Type{T},storage::AbstractStore{<: Any,S}, dims...; path = "", chunks=dims, @@ -337,12 +344,14 @@ function zcreate(::Type{T},storage::AbstractStore, attrs=Dict(), writeable=true, indent_json=false, - dimension_separator='.' 
- ) where T - - if dimension_separator isa AbstractString - # Convert AbstractString to Char - dimension_separator = only(dimension_separator) + dimension_separator=nothing + ) where {T,S} + + if isnothing(dimension_separator) + dimension_separator = S + elseif dimension_separator != S + error("The dimension separator keyword value, $dimension_separator, + must agree with the dimension separator type parameter, $S") end length(dims) == length(chunks) || throw(DimensionMismatch("Dims must have the same length as chunks")) diff --git a/src/ZGroup.jl b/src/ZGroup.jl index 35515ed1..d0752ec0 100644 --- a/src/ZGroup.jl +++ b/src/ZGroup.jl @@ -20,10 +20,16 @@ function ZGroup(s::T,mode="r",path="";fill_as_missing=false) where T <: Abstract for d in subdirs(s,path) dshort = split(d,'/')[end] - m = zopen_noerr(s,mode,path=_concatpath(path,dshort),fill_as_missing=fill_as_missing) - if isa(m, ZArray) + subpath = _concatpath(path,dshort) + if is_zarray(s, subpath) + meta = getmetadata(s, subpath, false) + if s.dimension_separator != meta.dimension_separator + s = set_dimension_separator(s, meta.dimension_separator) + end + m = zopen_noerr(s,mode,path=_concatpath(path,dshort),fill_as_missing=fill_as_missing) arrays[dshort] = m - elseif isa(m, ZGroup) + elseif is_zgroup(s, subpath) + m = zopen_noerr(s,mode,path=_concatpath(path,dshort),fill_as_missing=fill_as_missing) groups[dshort] = m end end @@ -39,7 +45,7 @@ Works like `zopen` with the single difference that no error is thrown when the path or store does not point to a valid zarr array or group, but nothing is returned instead. 
""" -function zopen_noerr(s::AbstractStore, mode="r"; +function zopen_noerr(s::AbstractStore, mode="r"; consolidated = false, path="", lru = 0, @@ -116,8 +122,18 @@ function storefromstring(s, create=true) return storefromstring(t,s,create) end end - if create || isdir(s) - return DirectoryStore(s), "" + if create + return DirectoryStore(s), "" + elseif isdir(s) + # parse metadata to determine store kind + temp_store = DirectoryStore(s) + if is_zarray(temp_store, "") + meta = getmetadata(temp_store, "", false) + store = DirectoryStore{meta.zarr_format, meta.dimension_separator}(s) + else + store = temp_store + end + return store, "" else throw(ArgumentError("Path $s is not a directory.")) end From 2b3bbb266a91a0e677513dcbdc39af9f24a87ca9 Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Tue, 25 Mar 2025 20:48:45 -0400 Subject: [PATCH 07/39] Implement VersionStorage wrapper rather than modifying AbstractStorage --- src/Storage/Storage.jl | 96 ++++------------------------------- src/Storage/consolidated.jl | 12 ++--- src/Storage/dictstore.jl | 6 +-- src/Storage/directorystore.jl | 8 ++- src/Storage/gcstore.jl | 9 ++-- src/Storage/http.jl | 25 ++++----- src/Storage/s3store.jl | 13 ++--- src/Storage/zipstore.jl | 9 ++-- src/ZArray.jl | 14 ++--- src/ZGroup.jl | 8 +-- test/runtests.jl | 12 ++--- test/storage.jl | 21 ++++++-- 12 files changed, 74 insertions(+), 159 deletions(-) diff --git a/src/Storage/Storage.jl b/src/Storage/Storage.jl index de40d32e..30ff1e15 100644 --- a/src/Storage/Storage.jl +++ b/src/Storage/Storage.jl @@ -2,29 +2,12 @@ # Defines different storages for zarr arrays. Currently only regular files (DirectoryStore) # and Dictionaries are supported -# Default Zarr version -const DV = 2 - -# Default Zarr separator - -# Default Zarr v2 separator -const DS2 = '.' -# Default Zarr v3 separator -const DS3 = '/' - -default_sep(version) = version == 2 ? 
DS2 : DS3 -const DS = default_sep(DV) - """ - abstract type AbstractStore{V,S} + abstract type AbstractStore This the abstract supertype for all Zarr store implementations. Currently only regular files ([`DirectoryStore`](@ref)) and Dictionaries are supported. -# Type Parameters -V is the version, either 2 or 3 -S is the dimension separator - ## Interface All subtypes of `AbstractStore` must implement the following methods: @@ -41,7 +24,7 @@ They may optionally implement the following methods: - [`store_read_strategy(s::AbstractStore)`](@ref store_read_strategy): return the read strategy for the given store. See [`SequentialRead`](@ref) and [`ConcurrentRead`](@ref). """ -abstract type AbstractStore{V,S} end +abstract type AbstractStore end #Define the interface """ @@ -87,18 +70,17 @@ function subkeys end Deletes the given key from the store. """ -@inline citostring(i::CartesianIndex, version::Int=DV, sep::Char=default_sep(version)) = (version == 3 ? "c$sep" : "" ) * join(reverse((i - oneunit(i)).I), sep) -@inline citostring(::CartesianIndex{0}, version::Int=DV, sep::Char=default_sep(version)) = (version == 3 ? "c$(sep)0" : "0" ) -citostring(i::CartesianIndex, s::AbstractStore{V, S}) where {V,S} = citostring(i, V, S) +citostring(i::CartesianIndex) = join(reverse((i - oneunit(i)).I), '.') +citostring(::CartesianIndex{0}) = "0" _concatpath(p,s) = isempty(p) ? 
s : rstrip(p,'/') * '/' * s -Base.getindex(s::AbstractStore, p, i::CartesianIndex) = s[p, citostring(i, s)] +Base.getindex(s::AbstractStore, p, i::CartesianIndex) = s[p, citostring(i)] Base.getindex(s::AbstractStore, p, i) = s[_concatpath(p,i)] -Base.delete!(s::AbstractStore, p, i::CartesianIndex) = delete!(s, p, citostring(i, s)) +Base.delete!(s::AbstractStore, p, i::CartesianIndex) = delete!(s, p, citostring(i)) Base.delete!(s::AbstractStore, p, i) = delete!(s, _concatpath(p,i)) Base.haskey(s::AbstractStore, k) = isinitialized(s,k) Base.setindex!(s::AbstractStore,v,p,i) = setindex!(s,v,_concatpath(p,i)) -Base.setindex!(s::AbstractStore,v,p,i::CartesianIndex) = s[p, citostring(i, s)]=v +Base.setindex!(s::AbstractStore,v,p,i::CartesianIndex) = s[p, citostring(i)]=v maybecopy(x) = copy(x) @@ -129,7 +111,7 @@ end is_zgroup(s::AbstractStore, p) = isinitialized(s,_concatpath(p,".zgroup")) is_zarray(s::AbstractStore, p) = isinitialized(s,_concatpath(p,".zarray")) -isinitialized(s::AbstractStore, p, i::CartesianIndex) = isinitialized(s,p,citostring(i, s)) +isinitialized(s::AbstractStore, p, i::CartesianIndex)=isinitialized(s,p,citostring(i)) isinitialized(s::AbstractStore, p, i) = isinitialized(s,_concatpath(p,i)) isinitialized(s::AbstractStore, i) = s[i] !== nothing @@ -215,19 +197,7 @@ isemptysub(s::AbstractStore, p) = isempty(subkeys(s,p)) && isempty(subdirs(s,p)) #during auto-check of storage format when doing zopen storageregexlist = Pair[] -function Base.getproperty(store::AbstractStore{V,S}, sym::Symbol) where {V,S} - if sym == :dimension_separator - return S - elseif sym == :zarr_format - return V - else - return getfield(store, sym) - end -end -function Base.propertynames(store::AbstractStore) - return (:dimension_separator, :version, getfields(store)...) 
-end - +include("versionedstore.jl") include("directorystore.jl") include("dictstore.jl") include("s3store.jl") @@ -235,51 +205,3 @@ include("gcstore.jl") include("consolidated.jl") include("http.jl") include("zipstore.jl") - -# Itemize subtypes of AbstractStore for code generation below -const KnownAbstractStores = (DirectoryStore, GCStore, S3Store, ConsolidatedStore, DictStore, HTTPStore, ZipStore) - -""" - Zarr.set_dimension_separator(::AbstractStore{V}, sep::Char)::AbstractStore{V,sep} - -Returns an AbstractStore of the same type with the same `zarr_format` parameter, `V`, -but with a dimension separator of `sep`. - -# Examples - -``` -julia> Zarr.set_dimension_separator(Zarr.DictStore{2, '.'}(), '/') |> typeof -Zarr.DictStore{2, '/'} -``` - -""" -set_dimension_separator - -""" - set_zarr_format(::AbstractStore{<: Any, S}, zarr_format::Int)::AbstractStore{zarr_format,S} - -Returns an AbstractStore of the same type with the same `dimension_separator` parameter, `S`, -but with the specified `zarr_format` parameter. - -# Examples - -``` -julia> Zarr.set_zarr_format(Zarr.DictStore{2, '.'}(), 3) |> typeof -Zarr.DictStore{3, '.'} -``` - -""" -set_zarr_format - -for T in KnownAbstractStores - e = quote - # copy constructor to change zarr_format and dimension_separator parameters - (::Type{$T{V,S}})(store::$T) where {V,S} = - $T{V,S}(ntuple(i->getfield(store, i), nfields(store))...) - set_dimension_separator(store::$T{V}, sep::Char) where V = - $T{V,sep}(ntuple(i->getfield(store, i), nfields(store))...) - set_zarr_format(store::$T{<: Any, S}, zarr_format::Int) where S = - $T{zarr_format,S}(ntuple(i->getfield(store, i), nfields(store))...) - end - eval(e) -end diff --git a/src/Storage/consolidated.jl b/src/Storage/consolidated.jl index 2429622c..0b28f553 100644 --- a/src/Storage/consolidated.jl +++ b/src/Storage/consolidated.jl @@ -3,24 +3,18 @@ A store that wraps any other AbstractStore but has access to the consolidated me stored in the .zmetadata key. 
Whenever data attributes or metadata are accessed, the data will be read from the dictionary instead. """ -struct ConsolidatedStore{V,S,P} <: AbstractStore{V,S} +struct ConsolidatedStore{P} <: AbstractStore parent::P path::String cons::Dict{String,Any} end -function ConsolidatedStore{V,S}(s::AbstractStore, p) where {V,S} +function ConsolidatedStore(s::AbstractStore, p) d = s[p, ".zmetadata"] if d === nothing throw(ArgumentError("Could not find consolidated metadata for store $s")) end - ConsolidatedStore{V, S, typeof(s)}(s,p,JSON.parse(String(Zarr.maybecopy(d)))["metadata"]) + ConsolidatedStore(s,p,JSON.parse(String(Zarr.maybecopy(d)))["metadata"]) end -ConsolidatedStore{V}(s::AbstractStore, p) where V = ConsolidatedStore{V, default_sep(V)}(s, p) -ConsolidatedStore(s::AbstractStore, p) = ConsolidatedStore{DV,DS}(s, p) - -ConsolidatedStore(s::AbstractStore, p, d) = ConsolidatedStore{DV, DS}(s,p,d) -ConsolidatedStore{V}(s::AbstractStore, p, d) where V = ConsolidatedStore{V, default_sep(V)}(s,p,d) -ConsolidatedStore{V,S}(s::AbstractStore, p, d) where {V,S} = ConsolidatedStore{V, default_sep(V), typeof(s)}(s,p,d) function Base.show(io::IO,d::ConsolidatedStore) b = IOBuffer() diff --git a/src/Storage/dictstore.jl b/src/Storage/dictstore.jl index f6598971..7815ed20 100644 --- a/src/Storage/dictstore.jl +++ b/src/Storage/dictstore.jl @@ -1,10 +1,8 @@ # Stores data in a simple dict in memory -struct DictStore{V,S} <: AbstractStore{V,S} +struct DictStore <: AbstractStore a::Dict{String,Vector{UInt8}} - DictStore{V,S}(a=Dict{String,Vector{UInt8}}()) where {V,S} = new{V,S}(a) end -DictStore(a=Dict{String,Vector{UInt8}}()) = DictStore{DV,DS}(a) -DictStore{V}(a=Dict{String,Vector{UInt8}}()) where V = DictStore{V, default_sep(V)}(a) +DictStore() = DictStore(Dict{String,Vector{UInt8}}()) Base.show(io::IO,d::DictStore) = print(io,"Dictionary Storage") function _pdict(d::DictStore,p) diff --git a/src/Storage/directorystore.jl b/src/Storage/directorystore.jl index 0b7f6434..6ded94fb 
100644 --- a/src/Storage/directorystore.jl +++ b/src/Storage/directorystore.jl @@ -9,14 +9,12 @@ function normalize_path(p::AbstractString) end # Stores files in a regular file system -struct DirectoryStore{V,S} <: AbstractStore{V,S} +struct DirectoryStore <: AbstractStore folder::String - function DirectoryStore{V,S}(p) where {V,S} + function DirectoryStore(p) mkpath(normalize_path(p)) - new{V,S}(normalize_path(p)) + new(normalize_path(p)) end - DirectoryStore(p) = DirectoryStore{DV,DS}(p) - DirectoryStore{V}(p) where V = DirectoryStore{V, default_sep(V)}(p) end function Base.getindex(d::DirectoryStore, i::String) diff --git a/src/Storage/gcstore.jl b/src/Storage/gcstore.jl index 5f0860a2..5f85820d 100644 --- a/src/Storage/gcstore.jl +++ b/src/Storage/gcstore.jl @@ -56,10 +56,10 @@ function _gcs_request_headers() return headers end -struct GCStore{V,S} <: AbstractStore{V,S} +struct GCStore <: AbstractStore bucket::String - function GCStore{V,S}(url::String) where {V,S} + function GCStore(url::String) uri = URI(url) if uri.scheme == "gs" @@ -71,8 +71,6 @@ struct GCStore{V,S} <: AbstractStore{V,S} @debug "GCS bucket: $bucket" new(bucket) end - GCStore(url::String) = GCStore{DV,DS}(url) - GCStore{V}(url::String) where V = GCStore{V, default_sep(V)}(url) end @@ -137,7 +135,6 @@ pushfirst!(storageregexlist,r"^http://storage.googleapis.com"=>GCStore) push!(storageregexlist,r"^gs://"=>GCStore) function storefromstring(::Type{<:GCStore}, url,_) - # TODO: Check metadata for version and dimension separator uri = URI(url) if uri.scheme == "gs" p = lstrip(uri.path,'/') @@ -150,4 +147,4 @@ function storefromstring(::Type{<:GCStore}, url,_) return GCStore(url),p end -store_read_strategy(::GCStore) = ConcurrentRead(concurrent_io_tasks[]) +store_read_strategy(::GCStore) = ConcurrentRead(concurrent_io_tasks[]) \ No newline at end of file diff --git a/src/Storage/http.jl b/src/Storage/http.jl index e213642a..523d701f 100644 --- a/src/Storage/http.jl +++ b/src/Storage/http.jl @@ 
-10,13 +10,11 @@ datasets being served through the [xpublish](https://xpublish.readthedocs.io/en/ python package. In case you experience performance issues, one can try to use `HTTP.set_default_connection_limit!` to increase the number of concurrent connections. """ -struct HTTPStore{V,S} <: AbstractStore{V,S} +struct HTTPStore <: AbstractStore url::String allowed_codes::Set{Int} - HTTPStore{V,S}(url, allowed_codes = Set((404,))) where {V,S} = new{V,S}(url, allowed_codes) + HTTPStore(url, allowed_codes = Set((404,))) = new(url, allowed_codes) end -HTTPStore(url) = HTTPStore{DV, DS}(url) -HTTPStore{V}(url) where V = HTTPStore{V, default_sep(V)}(url) function Base.getindex(s::HTTPStore, k::String) r = HTTP.request("GET",string(s.url,"/",k),status_exception = false,socket_type_tls=OpenSSL.SSLStream) @@ -43,15 +41,18 @@ push!(storageregexlist,r"^https://"=>HTTPStore) push!(storageregexlist,r"^http://"=>HTTPStore) function storefromstring(::Type{<:HTTPStore}, s,_) http_store = HTTPStore(s) - if is_zarray(http_store, "") - meta = getmetadata(http_store, "", false) - http_store = HTTPStore{meta.zarr_format, meta.dimension_separator}(s) - end - if http_store["", ".zmetadata"] !== nothing - return ConsolidatedStore(http_store,""),"" - else - return http_store,"" + try + if is_zarray(http_store, "") + meta = getmetadata(http_store, "", false) + http_store = VersionedStore{meta.zarr_format, meta.dimension_separator}(http_store) + end + if http_store["", ".zmetadata"] !== nothing + return ConsolidatedStore(http_store,""),"" + end + catch err + @warn exception=err "Additional metadata was not available for HTTPStore." 
end + return http_store,"" end """ diff --git a/src/Storage/s3store.jl b/src/Storage/s3store.jl index 03681fa3..aaab004f 100644 --- a/src/Storage/s3store.jl +++ b/src/Storage/s3store.jl @@ -1,23 +1,19 @@ using AWSS3: AWSS3, s3_put, s3_get, s3_delete, s3_list_objects, s3_exists -struct S3Store{V,S} <: AbstractStore{V,S} +struct S3Store <: AbstractStore bucket::String aws::AWSS3.AWS.AbstractAWSConfig end -function S3Store{V,S}(bucket::String; +function S3Store(bucket::String; aws = nothing, - ) where {V,S} + ) if aws === nothing aws = AWSS3.AWS.global_aws_config() end - S3Store{V,S}(bucket, aws) + S3Store(bucket, aws) end -S3Store(bucket, aws) = S3Store{DV,DS}(bucket, aws) -S3Store{V}(bucket, aws) where V = S3Store{V, default_sep(V)}(bucket, aws) -S3Store(bucket; aws = nothing) = S3Store{DV, DS}(bucket; aws) -S3Store{V}(bucket; aws = nothing) where V = S3Store{V, default_sep(V)}(bucket; aws) Base.show(io::IO,::S3Store) = print(io,"S3 Object Storage") @@ -78,7 +74,6 @@ allstrings(v,prefixkey) = [rstrip(String(v[prefixkey]),'/')] push!(storageregexlist,r"^s3://"=>S3Store) function storefromstring(::Type{<:S3Store}, s, _) - # TODO: Check metadata for version and dimension separator decomp = split(s,"/",keepempty=false) bucket = decomp[2] path = join(decomp[3:end],"/") diff --git a/src/Storage/zipstore.jl b/src/Storage/zipstore.jl index 8fb5aca0..8e8bbd27 100644 --- a/src/Storage/zipstore.jl +++ b/src/Storage/zipstore.jl @@ -5,14 +5,13 @@ import ZipArchives A read only store that wraps an `AbstractVector{UInt8}` that contains a zip file. 
""" -struct ZipStore{V, S, T <: AbstractVector{UInt8}} <: AbstractStore{V, S} +struct ZipStore{T <: AbstractVector{UInt8}} <: AbstractStore r::ZipArchives.ZipBufferReader{T} - ZipStore{V,S}(data::T) where {V,S,T} = new{V, S, T}(ZipArchives.ZipBufferReader(data)) - ZipStore{V}(data::AbstractVector{UInt8}) where V = ZipStore{V, default_sep(V)}(data) - ZipStore(data::AbstractVector{UInt8}) = ZipStore{DV,DS}(data) end +ZipStore(data::AbstractVector{UInt8}) = ZipStore(ZipArchives.ZipBufferReader(data)) + Base.show(io::IO,::ZipStore) = print(io,"Read Only Zip Storage") function Base.getindex(d::ZipStore, k::AbstractString)::Union{Nothing, Vector{UInt8}} @@ -95,4 +94,4 @@ function _writezip(w::ZipArchives.ZipWriter, s::AbstractStore, p::String) for subdir in subdirs(s, p) _writezip(w, s, _make_prefix(p)*subdir) end -end +end \ No newline at end of file diff --git a/src/ZArray.jl b/src/ZArray.jl index 0aa472b4..961c798f 100644 --- a/src/ZArray.jl +++ b/src/ZArray.jl @@ -326,14 +326,14 @@ function zcreate(::Type{T}, dims::Integer...; end if path===nothing - store = DictStore{DV, dimension_separator}() + store = VersionedStore{DV, dimension_separator}(DictStore()) else - store = DirectoryStore{DV, dimension_separator}(joinpath(path,name)) + store = VersionedStore{DV, dimension_separator}(DirectoryStore(joinpath(path,name))) end zcreate(T, store, dims...; kwargs...) 
end -function zcreate(::Type{T},storage::AbstractStore{<: Any,S}, +function zcreate(::Type{T},storage::AbstractStore, dims...; path = "", chunks=dims, @@ -345,13 +345,13 @@ function zcreate(::Type{T},storage::AbstractStore{<: Any,S}, writeable=true, indent_json=false, dimension_separator=nothing - ) where {T,S} + ) where {T} if isnothing(dimension_separator) - dimension_separator = S - elseif dimension_separator != S + dimension_separator = Zarr.dimension_separator(storage) + elseif dimension_separator != Zarr.dimension_separator(storage) error("The dimension separator keyword value, $dimension_separator, - must agree with the dimension separator type parameter, $S") + must agree with the dimension separator type parameter, $(Zarr.dimension_separator(storage))") end length(dims) == length(chunks) || throw(DimensionMismatch("Dims must have the same length as chunks")) diff --git a/src/ZGroup.jl b/src/ZGroup.jl index d0752ec0..e4d2100b 100644 --- a/src/ZGroup.jl +++ b/src/ZGroup.jl @@ -23,7 +23,7 @@ function ZGroup(s::T,mode="r",path="";fill_as_missing=false) where T <: Abstract subpath = _concatpath(path,dshort) if is_zarray(s, subpath) meta = getmetadata(s, subpath, false) - if s.dimension_separator != meta.dimension_separator + if dimension_separator(s) != meta.dimension_separator s = set_dimension_separator(s, meta.dimension_separator) end m = zopen_noerr(s,mode,path=_concatpath(path,dshort),fill_as_missing=fill_as_missing) @@ -123,15 +123,15 @@ function storefromstring(s, create=true) end end if create - return DirectoryStore(s), "" + return VersionedStore(DirectoryStore(s)), "" elseif isdir(s) # parse metadata to determine store kind temp_store = DirectoryStore(s) if is_zarray(temp_store, "") meta = getmetadata(temp_store, "", false) - store = DirectoryStore{meta.zarr_format, meta.dimension_separator}(s) + store = VersionedStore{meta.zarr_format, meta.dimension_separator}(temp_store) else - store = temp_store + store = VersionedStore(temp_store) end return 
store, "" else diff --git a/test/runtests.jl b/test/runtests.jl index 34790c9b..035ca33f 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -15,11 +15,11 @@ CondaPkg.add("zarr"; version="2.*") @testset "fields" begin z = zzeros(Int64, 2, 3) @test z isa ZArray{Int64, 2, Zarr.BloscCompressor, - Zarr.DictStore{2, '.'}} + Zarr.VersionedStore{2, '.', Zarr.DictStore}} - @test length(z.storage.a) === 3 - @test length(z.storage.a["0.0"]) === 64 - @test eltype(z.storage.a["0.0"]) === UInt8 + @test length(z.storage.parent.a) === 3 + @test length(z.storage.parent.a["0.0"]) === 64 + @test eltype(z.storage.parent.a["0.0"]) === UInt8 @test z.metadata.shape[] === (2, 3) @test z.metadata.order === 'C' @test z.metadata.chunks === (2, 3) @@ -40,7 +40,7 @@ CondaPkg.add("zarr"; version="2.*") @testset "methods" begin z = zzeros(Int64, 2, 3) @test z isa ZArray{Int64, 2, Zarr.BloscCompressor, - Zarr.DictStore{2, '.'}} + Zarr.VersionedStore{2, '.', Zarr.DictStore}} @test eltype(z) === Int64 @test ndims(z) === 2 @@ -60,7 +60,7 @@ CondaPkg.add("zarr"; version="2.*") compressor=Zarr.NoCompressor()) @test z.metadata.compressor === Zarr.NoCompressor() - @test z.storage === Zarr.DirectoryStore("$dir/$name") + @test z.storage === Zarr.VersionedStore{2 ,'.'}(Zarr.DirectoryStore("$dir/$name")) @test isdir("$dir/$name") @test ispath("$dir/$name/.zarray") @test ispath("$dir/$name/.zattrs") diff --git a/test/storage.jl b/test/storage.jl index 1a73ad10..be97f402 100644 --- a/test/storage.jl +++ b/test/storage.jl @@ -32,7 +32,7 @@ end """ Function to test the interface of AbstractStore. Every complete implementation should pass this test. 
""" -function test_store_common(ds::Zarr.AbstractStore{V,S}) where {V,S} +function test_store_common(ds::Zarr.AbstractStore) @test !Zarr.is_zgroup(ds,"") ds[".zgroup"]=rand(UInt8,50) @test haskey(ds,".zgroup") @@ -52,6 +52,8 @@ function test_store_common(ds::Zarr.AbstractStore{V,S}) where {V,S} @test Zarr.subdirs(ds,"bar") == String[] #Test getindex and setindex data = rand(UInt8,50) + V = Zarr.zarr_format(ds) + S = Zarr.dimension_separator(ds) first_ci_str = Zarr.citostring(CartesianIndex(1,1,1), V, S) second_ci_str = Zarr.citostring(CartesianIndex(2,1,1), V, S) ds["bar/" * first_ci_str] = data @@ -185,6 +187,15 @@ end run(s, wait=false) cfg = MinioConfig("http://localhost:9001") Zarr.AWSS3.global_aws_config(cfg) + # Try to communicate with the server for 10 seconds + for i in 1:10 + try + s3_list_objects(cfg) + break + catch err + sleep(1) + end + end Zarr.AWSS3.S3.create_bucket("zarrdata") ds = S3Store("zarrdata") test_store_common(ds) @@ -256,10 +267,10 @@ end g = zgroup(s, attrs = Dict("groupatt"=>5)) a = zcreate(Int,g,"a",10,20,chunks=(5,5),attrs=Dict("arratt"=>2.5),fill_value = -1) @async HTTP.serve(Zarr.zarr_req_handler(s,g.path,403),ip,port,server=server) - g3 = zopen("http://$ip:$port") - @test_throws "Received error code 403" g3["a"][:,:] - Zarr.missing_chunk_return_code!(g3.storage,403) - @test all(==(-1),g3["a"][:,:]) + @test_throws "Received error code 403" zopen("http://$ip:$port") + # @test_throws "Received error code 403" g3["a"][:,:] + # Zarr.missing_chunk_return_code!(g3.storage,403) + # @test all(==(-1),g3["a"][:,:]) close(server) end From 5f35ebf9b29456322812378b10d4b04cff73a10c Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Mon, 31 Mar 2025 18:29:28 -0400 Subject: [PATCH 08/39] Fix ConslidatedStore wrapper around HTTP This reduces the test diff --- src/Storage/http.jl | 6 +++--- test/storage.jl | 17 ++++------------- 2 files changed, 7 insertions(+), 16 deletions(-) diff --git a/src/Storage/http.jl b/src/Storage/http.jl index 
523d701f..1263f8c2 100644 --- a/src/Storage/http.jl +++ b/src/Storage/http.jl @@ -42,13 +42,13 @@ push!(storageregexlist,r"^http://"=>HTTPStore) function storefromstring(::Type{<:HTTPStore}, s,_) http_store = HTTPStore(s) try + if http_store["", ".zmetadata"] !== nothing + http_store = ConsolidatedStore(http_store,"") + end if is_zarray(http_store, "") meta = getmetadata(http_store, "", false) http_store = VersionedStore{meta.zarr_format, meta.dimension_separator}(http_store) end - if http_store["", ".zmetadata"] !== nothing - return ConsolidatedStore(http_store,""),"" - end catch err @warn exception=err "Additional metadata was not available for HTTPStore." end diff --git a/test/storage.jl b/test/storage.jl index be97f402..320239e9 100644 --- a/test/storage.jl +++ b/test/storage.jl @@ -187,15 +187,6 @@ end run(s, wait=false) cfg = MinioConfig("http://localhost:9001") Zarr.AWSS3.global_aws_config(cfg) - # Try to communicate with the server for 10 seconds - for i in 1:10 - try - s3_list_objects(cfg) - break - catch err - sleep(1) - end - end Zarr.AWSS3.S3.create_bucket("zarrdata") ds = S3Store("zarrdata") test_store_common(ds) @@ -267,10 +258,10 @@ end g = zgroup(s, attrs = Dict("groupatt"=>5)) a = zcreate(Int,g,"a",10,20,chunks=(5,5),attrs=Dict("arratt"=>2.5),fill_value = -1) @async HTTP.serve(Zarr.zarr_req_handler(s,g.path,403),ip,port,server=server) - @test_throws "Received error code 403" zopen("http://$ip:$port") - # @test_throws "Received error code 403" g3["a"][:,:] - # Zarr.missing_chunk_return_code!(g3.storage,403) - # @test all(==(-1),g3["a"][:,:]) + g3 = zopen("http://$ip:$port") + @test_throws "Received error code 403" g3["a"][:,:] + Zarr.missing_chunk_return_code!(g3.storage,403) + @test all(==(-1),g3["a"][:,:]) close(server) end From c685387a1b9c18e4377020b2742a231b4a62626a Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Mon, 31 Mar 2025 18:43:40 -0400 Subject: [PATCH 09/39] Add getproperty forwarding from VersionedStorage This also reduces the 
test diff --- test/runtests.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 035ca33f..a5454557 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -17,9 +17,9 @@ CondaPkg.add("zarr"; version="2.*") @test z isa ZArray{Int64, 2, Zarr.BloscCompressor, Zarr.VersionedStore{2, '.', Zarr.DictStore}} - @test length(z.storage.parent.a) === 3 - @test length(z.storage.parent.a["0.0"]) === 64 - @test eltype(z.storage.parent.a["0.0"]) === UInt8 + @test length(z.storage.a) === 3 + @test length(z.storage.a["0.0"]) === 64 + @test eltype(z.storage.a["0.0"]) === UInt8 @test z.metadata.shape[] === (2, 3) @test z.metadata.order === 'C' @test z.metadata.chunks === (2, 3) From 8d5606d6acc0e6b9fbc443bc13aded4456536063 Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Mon, 31 Mar 2025 19:05:10 -0400 Subject: [PATCH 10/39] Add some tests for propertynames --- src/metadata.jl | 1 + test/runtests.jl | 3 +++ 2 files changed, 4 insertions(+) diff --git a/src/metadata.jl b/src/metadata.jl index bfadfb03..1391a131 100644 --- a/src/metadata.jl +++ b/src/metadata.jl @@ -152,6 +152,7 @@ function Base.getproperty(m::DimensionSeparatedMetadata{S}, name::Symbol) where end return getfield(m, name) end +Base.propertynames(m::Metadata) = (fieldnames(Metadata)..., :dimension_separator) #To make unit tests pass with ref shape import Base.== diff --git a/test/runtests.jl b/test/runtests.jl index a5454557..ad434fe1 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -17,6 +17,7 @@ CondaPkg.add("zarr"; version="2.*") @test z isa ZArray{Int64, 2, Zarr.BloscCompressor, Zarr.VersionedStore{2, '.', Zarr.DictStore}} + @test :a ∈ propertynames(z.storage) @test length(z.storage.a) === 3 @test length(z.storage.a["0.0"]) === 64 @test eltype(z.storage.a["0.0"]) === UInt8 @@ -31,6 +32,8 @@ CondaPkg.add("zarr"; version="2.*") @test z.metadata.compressor.shuffle === 1 @test z.attrs == Dict{Any, Any}() @test z.writeable === true + @test 
z.metadata.dimension_separator === Zarr.DS + @test :dimension_separator ∈ propertynames(z.metadata) @test_throws ArgumentError zzeros(Int64,2,3, chunks = (0,1)) @test_throws ArgumentError zzeros(Int64,0,-1) @test_throws ArgumentError Zarr.Metadata(zeros(2,2), (2,2), zarr_format = 3) From a6fcc2b7837d0f45169760ce03ec67dce6786610 Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Mon, 31 Mar 2025 19:11:10 -0400 Subject: [PATCH 11/39] Add Storage/versionstore.jl --- src/Storage/versionedstore.jl | 124 ++++++++++++++++++++++++++++++++++ 1 file changed, 124 insertions(+) create mode 100644 src/Storage/versionedstore.jl diff --git a/src/Storage/versionedstore.jl b/src/Storage/versionedstore.jl new file mode 100644 index 00000000..cec72ee0 --- /dev/null +++ b/src/Storage/versionedstore.jl @@ -0,0 +1,124 @@ +# Default Zarr version +const DV = 2 + +# Default Zarr separator + +# Default Zarr v2 separator +const DS2 = '.' +# Default Zarr v3 separator +const DS3 = '/' + +default_sep(version) = version == 2 ? DS2 : DS3 +const DS = default_sep(DV) + +struct VersionedStore{V,SEP,STORE <: AbstractStore} <: AbstractStore + parent::STORE +end +VersionedStore(args...) = VersionedStore{DV,DS}(args...) +VersionedStore{V}(args...) where V = VersionedStore{V, default_sep(V)}(args...) +VersionedStore{<: Any, S}(args...) where S = VersionedStore{DV, S}(args...) +function VersionedStore{V,S}(store::AbstractStore) where {V,S} + return VersionedStore{V,S,typeof(store)}(store) +end +function VersionedStore{V,S}(store::VersionedStore) where {V,S} + p = parent(store) + return VersionedStore{V,S,typeof(p)}(p) +end + +Base.parent(store::VersionedStore) = store.parent + +@inline citostring(i::CartesianIndex, version::Int, sep::Char=default_sep(version)) = (version == 3 ? "c$sep" : "" ) * join(reverse((i - oneunit(i)).I), sep) +@inline citostring(::CartesianIndex{0}, version::Int, sep::Char=default_sep(version)) = (version == 3 ? 
"c$(sep)0" : "0" ) +citostring(i::CartesianIndex, s::VersionedStore{V, S}) where {V,S} = citostring(i, V, S) + +Base.getindex(s::VersionedStore, p, i::CartesianIndex) = s[p, citostring(i,s)] +Base.delete!(s::VersionedStore, p, i::CartesianIndex) = delete!(s, p, citostring(i,s)) +Base.setindex!(s::VersionedStore, v, p, i::CartesianIndex) = s[p, citostring(i,s)]=v + +isinitialized(s::VersionedStore, p, i::CartesianIndex) = isinitialized(s,p,citostring(i, s)) + +""" +- [`storagesize(d::AbstractStore, p::AbstractString)`](@ref storagesize) +- [`subdirs(d::AbstractStore, p::AbstractString)`](@ref subdirs) +- [`subkeys(d::AbstractStore, p::AbstractString)`](@ref subkeys) +- [`isinitialized(d::AbstractStore, p::AbstractString)`](@ref isinitialized) +- [`storefromstring(::Type{<: AbstractStore}, s, _)`](@ref storefromstring) +- `Base.getindex(d::AbstractStore, i::AbstractString)`: return the data stored in key `i` as a Vector{UInt8} +- `Base.setindex!(d::AbstractStore, v, i::AbstractString)`: write the values in `v` to the key `i` of the given store `d` +""" + +storagesize(d::VersionedStore, p::AbstractString) = storagesize(parent(d), p) +subdirs(d::VersionedStore, p::AbstractString) = subdirs(parent(d), p) +subkeys(d::VersionedStore, p::AbstractString) = subkeys(parent(d), p) +isinitialized(d::VersionedStore, p::AbstractString) = isinitialized(parent(d), p) +storefromstring(::Type{VersionedStore{<: Any, <: Any, STORE}}, s, _) where STORE = VersionedStore{DV,DS}(storefromstring(STORE, s)) +storefromstring(::Type{VersionedStore{V,S}}, s, _) where {V,S} = VersionedStore{DV,DS}(storefromstring(s)) +storefromstring(::Type{VersionedStore{V,S,STORE}}, s, _) where {V,S,STORE} = VersionedStore{V,S,STORE}(storefromstring(STORE, s)) +Base.getindex(d::VersionedStore, i::AbstractString) = getindex(parent(d), i) +Base.setindex!(d::VersionedStore, v, i::AbstractString) = setindex!(parent(d), v, i) +Base.delete!(d::VersionedStore, i::AbstractString) = delete!(parent(d), i) + + +function 
Base.getproperty(store::VersionedStore{V,S}, sym::Symbol) where {V,S} + if sym == :dimension_separator + return S + elseif sym == :zarr_format + return V + elseif sym ∈ propertynames(getfield(store, :parent)) + # Support forwarding of properties to parent + return getproperty(store.parent, sym) + else + getfield(store, sym) + end +end +function Base.propertynames(store::VersionedStore) + return (:dimension_separator, :zarr_format, fieldnames(typeof(store))..., propertynames(store.parent)...) +end + + +""" + Zarr.set_dimension_separator(store::VersionedStore{V}, sep::Char)::VersionedStore{V,sep} + +Returns a VersionedStore of the same type with the same `zarr_format` parameter, `V`, +but with a dimension separator of `sep`. Note that this does not mutate the original store. + +# Examples + +``` +julia> Zarr.set_dimension_separator(Zarr.VersionedStore{2, '.'}(Zarr.DictStore(), '/')) |> typeof +Zarr.VersionedStore{2, '/',Zarr.DictStore} +``` + +""" +function set_dimension_separator(store::VersionedStore{V}, sep::Char) where V + return VersionedStore{V,sep}(store) +end +function set_dimension_separator(store::AbstractStore, sep::Char) + return VersionedStore{<: Any,sep}(store) +end + +""" + set_zarr_format(::VersionedStore{<: Any, S}, zarr_format::Int)::VersionedStore{zarr_format,S} + +Returns a VersionedStore of the same type with the same `dimension_separator` parameter, `S`, +but with the specified `zarr_format` parameter. Note that this does not mutate the original store. 
+ +# Examples + +``` +julia> Zarr.set_zarr_format(Zarr.VersionedStore{2, '.'}(Zarr.DictStore(), 3)) |> typeof +Zarr.VersionedStore{3, '.', DictStore} +``` + +""" +function set_zarr_format(store::VersionedStore{<: Any, S}, zarr_format::Int) where S + return VersionedStore{zarr_format,S}(store) +end +function set_zarr_format(store::AbstractStore, zarr_format::Int) + return VersionedStore{zarr_format}(store) +end + +dimension_separator(::AbstractStore) = DS +dimension_separator(::VersionedStore{<: Any,S}) where S = S +zarr_format(::AbstractStore) = DV +zarr_format(::VersionedStore{V}) where V = V From f6883f808aef2481fa1191568540f6008f884af7 Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Mon, 31 Mar 2025 23:42:33 -0400 Subject: [PATCH 12/39] Add VersionedStorage param change constructors --- src/Storage/versionedstore.jl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/Storage/versionedstore.jl b/src/Storage/versionedstore.jl index cec72ee0..17aeaf0a 100644 --- a/src/Storage/versionedstore.jl +++ b/src/Storage/versionedstore.jl @@ -15,8 +15,11 @@ struct VersionedStore{V,SEP,STORE <: AbstractStore} <: AbstractStore parent::STORE end VersionedStore(args...) = VersionedStore{DV,DS}(args...) +VersionedStore(s::VersionedStore) = s VersionedStore{V}(args...) where V = VersionedStore{V, default_sep(V)}(args...) +VersionedStore{V}(s::VersionedStore{<:Any,S}) where {V,S} = VersionedStore{V, S}(s) VersionedStore{<: Any, S}(args...) where S = VersionedStore{DV, S}(args...) 
+VersionedStore{<: Any, S}(s::VersionedStore{V}) where {V,S} = VersionedStore{V, S}(s) function VersionedStore{V,S}(store::AbstractStore) where {V,S} return VersionedStore{V,S,typeof(store)}(store) end From 3cf746da8a5b9e7fa6c6a3ecfe17c0e6cf4eeb50 Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Tue, 6 May 2025 18:41:39 -0400 Subject: [PATCH 13/39] Add V2 chunk encoding support --- src/Storage/versionedstore.jl | 10 +++++++++- test/storage.jl | 8 ++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/src/Storage/versionedstore.jl b/src/Storage/versionedstore.jl index 17aeaf0a..45f2bad8 100644 --- a/src/Storage/versionedstore.jl +++ b/src/Storage/versionedstore.jl @@ -8,9 +8,16 @@ const DS2 = '.' # Default Zarr v3 separator const DS3 = '/' -default_sep(version) = version == 2 ? DS2 : DS3 +default_sep(version) = version == 2 ? DS2 : + version == 3 ? DS3 : + error("Unknown version: $version") const DS = default_sep(DV) +# Chunk Key Encodings for Zarr v3 +# A Char is the separator for the default chunk key encoding +struct V2ChunkKeyEncoding{SEP} end + +# Version store differentiates between Zarr format versions struct VersionedStore{V,SEP,STORE <: AbstractStore} <: AbstractStore parent::STORE end @@ -32,6 +39,7 @@ Base.parent(store::VersionedStore) = store.parent @inline citostring(i::CartesianIndex, version::Int, sep::Char=default_sep(version)) = (version == 3 ? "c$sep" : "" ) * join(reverse((i - oneunit(i)).I), sep) @inline citostring(::CartesianIndex{0}, version::Int, sep::Char=default_sep(version)) = (version == 3 ? 
"c$(sep)0" : "0" ) +@inline citostring(i::CartesianIndex, ::Int, ::Type{V2ChunkKeyEncoding{S}}) where S = citostring(i, 2, S) citostring(i::CartesianIndex, s::VersionedStore{V, S}) where {V,S} = citostring(i, V, S) Base.getindex(s::VersionedStore, p, i::CartesianIndex) = s[p, citostring(i,s)] diff --git a/test/storage.jl b/test/storage.jl index 320239e9..1ef34d72 100644 --- a/test/storage.jl +++ b/test/storage.jl @@ -9,21 +9,29 @@ end @testset "Version and Dimension Separator" begin + v2cke_period = Zarr.V2ChunkKeyEncoding{'.'} + v2cke_slash = Zarr.V2ChunkKeyEncoding{'/'} let ci = CartesianIndex() @test Zarr.citostring(ci, 2, '.') == "0" @test Zarr.citostring(ci, 2, '/') == "0" + @test Zarr.citostring(ci, 3, v2cke_period) == "0" + @test Zarr.citostring(ci, 3, v2cke_slash) == "0" @test Zarr.citostring(ci, 3, '.') == "c.0" @test Zarr.citostring(ci, 3, '/') == "c/0" end let ci = CartesianIndex(1,1,1) @test Zarr.citostring(ci, 2, '.') == "0.0.0" @test Zarr.citostring(ci, 2, '/') == "0/0/0" + @test Zarr.citostring(ci, 3, v2cke_period) == "0.0.0" + @test Zarr.citostring(ci, 3, v2cke_slash) == "0/0/0" @test Zarr.citostring(ci, 3, '.') == "c.0.0.0" @test Zarr.citostring(ci, 3, '/') == "c/0/0/0" end let ci = CartesianIndex(1,3,5) @test Zarr.citostring(ci, 2, '.') == "4.2.0" @test Zarr.citostring(ci, 2, '/') == "4/2/0" + @test Zarr.citostring(ci, 3, v2cke_period) == "4.2.0" + @test Zarr.citostring(ci, 3, v2cke_slash) == "4/2/0" @test Zarr.citostring(ci, 3, '.') == "c.4.2.0" @test Zarr.citostring(ci, 3, '/') == "c/4/2/0" end From d218dc249cd0f238fb364b479fbec675d5a4620a Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Tue, 6 May 2025 20:06:26 -0400 Subject: [PATCH 14/39] Fix Base.UInt8 constructor for ASCIIChar --- src/metadata.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/metadata.jl b/src/metadata.jl index 1391a131..fa564e37 100644 --- a/src/metadata.jl +++ b/src/metadata.jl @@ -17,7 +17,7 @@ using .MaxLengthStrings: MaxLengthString 
primitive type ASCIIChar <: AbstractChar 8 end ASCIIChar(x::UInt8) = reinterpret(ASCIIChar, x) ASCIIChar(x::Integer) = ASCIIChar(UInt8(x)) -UInt8(x::ASCIIChar) = reinterpret(UInt8, x) +Base.UInt8(x::ASCIIChar) = reinterpret(UInt8, x) Base.codepoint(x::ASCIIChar) = UInt8(x) Base.show(io::IO, x::ASCIIChar) = print(io, Char(x)) Base.zero(::Union{ASCIIChar,Type{ASCIIChar}}) = ASCIIChar(Base.zero(UInt8)) From 6f722b575d3fea64435139f2b2e70663eea8d31a Mon Sep 17 00:00:00 2001 From: nhz2 Date: Fri, 7 Mar 2025 10:12:22 -0500 Subject: [PATCH 15/39] Add ZstdCompressor --- Project.toml | 2 ++ docs/src/reference.md | 2 +- src/Compressors/Compressors.jl | 1 + src/Compressors/zstd.jl | 49 ++++++++++++++++++++++++++++++++++ test/python.jl | 6 +++-- 5 files changed, 57 insertions(+), 3 deletions(-) create mode 100644 src/Compressors/zstd.jl diff --git a/Project.toml b/Project.toml index 8303d7bb..54db8d2e 100644 --- a/Project.toml +++ b/Project.toml @@ -6,6 +6,7 @@ version = "0.9.4" [deps] AWSS3 = "1c724243-ef5b-51ab-93f4-b0a88ac62a95" Blosc = "a74b3585-a348-5f62-a45c-50e91977d574" +ChunkCodecLibZstd = "55437552-ac27-4d47-9aa3-63184e8fd398" CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" DateTimes64 = "b342263e-b350-472a-b1a9-8dfd21b51589" @@ -22,6 +23,7 @@ ZipArchives = "49080126-0e18-4c2a-b176-c102e4b3760c" [compat] AWSS3 = "0.10, 0.11" Blosc = "0.5, 0.6, 0.7" +ChunkCodecLibZstd = "0.1.1" CodecZlib = "0.6, 0.7" DataStructures = "0.17, 0.18" DateTimes64 = "1" diff --git a/docs/src/reference.md b/docs/src/reference.md index 4cf889ab..7d0e31c7 100644 --- a/docs/src/reference.md +++ b/docs/src/reference.md @@ -18,5 +18,5 @@ Pages = ["ZGroup.jl"] ```@autodocs Modules = [Zarr] -Pages = ["Compressors/Compressors.jl", "Compressors/blosc.jl", "Compressors/zlib.jl"] +Pages = ["Compressors/Compressors.jl", "Compressors/blosc.jl", "Compressors/zlib.jl", "Compressors/zstd.jl"] ``` diff --git a/src/Compressors/Compressors.jl 
b/src/Compressors/Compressors.jl index e676c950..18541286 100644 --- a/src/Compressors/Compressors.jl +++ b/src/Compressors/Compressors.jl @@ -48,6 +48,7 @@ const compressortypes = Dict{Union{String,Nothing}, Type{<: Compressor}}() # Include the compressor implementations include("blosc.jl") include("zlib.jl") +include("zstd.jl") # ## Fallback definitions for the compressor interface # Define fallbacks and generic methods for the compressor interface diff --git a/src/Compressors/zstd.jl b/src/Compressors/zstd.jl new file mode 100644 index 00000000..d852294a --- /dev/null +++ b/src/Compressors/zstd.jl @@ -0,0 +1,49 @@ +#= +# Zstd compression + +This file implements a Zstd compressor via ChunkCodecLibZstd.jl. + +=# + +using ChunkCodecLibZstd: ZstdEncodeOptions, encode, decode, ChunkCodecCore + + +""" + ZstdCompressor(;level=0, checksum=false) +Returns a `ZstdCompressor` struct that can serve as a Zarr array compressor. Keyword arguments are: +* `level=0`: the compression level, regular levels are 1 to 22, 0 is a special value for default, there are also even faster negative levels. +* `checksum=false`: flag to enable saving checksums. 
+""" +struct ZstdCompressor <: Compressor + config::ZstdEncodeOptions +end + +ZstdCompressor(;level=0, checksum::Bool=false) = ZstdCompressor(ZstdEncodeOptions(;compressionLevel=level, checksum)) + +function getCompressor(::Type{ZstdCompressor}, d::Dict) + ZstdCompressor(; + level=get(Returns(0), d, "level"), + checksum=Bool(get(Returns(false), d, "checksum")), + ) +end + +function zuncompress(a, ::ZstdCompressor, T) + result = decode(z.config.codec, a) + _reinterpret(Base.nonmissingtype(T),result) +end + +function zuncompress!(data::DenseArray, compressed, z::ZstdCompressor) + dst = reinterpret(UInt8, vec(data)) + n = length(dst) + n_decoded = something(ChunkCodecCore.try_decode!(z.config.codec, dst, compressed))::Int64 + n_decoded == n || error("expected to decode $n bytes, only got $n_decoded bytes") + data +end + +function zcompress(a, z::ZstdCompressor) + encode(z.config, reinterpret(UInt8, vec(a))) +end + +JSON.lower(z::ZstdCompressor) = Dict("id"=>"zstd", "level" => z.config.compressionLevel, "checksum" => z.config.checksum) + +Zarr.compressortypes["zstd"] = ZstdCompressor \ No newline at end of file diff --git a/test/python.jl b/test/python.jl index 9eb9f4ec..8160d016 100644 --- a/test/python.jl +++ b/test/python.jl @@ -21,7 +21,7 @@ groupattrs = Dict("String attribute"=>"One", "Int attribute"=>5, "Float attribut g = zgroup(pjulia,attrs=groupattrs) # Test all supported data types and compressors -import Zarr: NoCompressor, BloscCompressor, ZlibCompressor, MaxLengthString, +import Zarr: NoCompressor, BloscCompressor, ZlibCompressor, ZstdCompressor, MaxLengthString, Fletcher32Filter, FixedScaleOffsetFilter, ShuffleFilter, QuantizeFilter, DeltaFilter using Random: randstring numeric_dtypes = (UInt8, UInt16, UInt32, UInt64, @@ -38,7 +38,9 @@ compressors = ( "blosc_autoshuffle"=>BloscCompressor(cname="zstd",shuffle=-1), "blosc_noshuffle"=>BloscCompressor(cname="zstd",shuffle=0), "blosc_bitshuffle"=>BloscCompressor(cname="zstd",shuffle=2), - 
"zlib"=>ZlibCompressor()) + "zlib"=>ZlibCompressor(), + "zstd"=>ZstdCompressor(), +) filters = ( "fletcher32"=>Fletcher32Filter(), "scale_offset"=>FixedScaleOffsetFilter(offset=1000, scale=10^6, T=Float64, Tenc=Int32), From 865dac78d3440dcef6466d22c5cac03bad81348d Mon Sep 17 00:00:00 2001 From: nhz2 Date: Fri, 7 Mar 2025 15:29:43 -0500 Subject: [PATCH 16/39] fix typo --- src/Compressors/zstd.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Compressors/zstd.jl b/src/Compressors/zstd.jl index d852294a..0f28ec47 100644 --- a/src/Compressors/zstd.jl +++ b/src/Compressors/zstd.jl @@ -27,7 +27,7 @@ function getCompressor(::Type{ZstdCompressor}, d::Dict) ) end -function zuncompress(a, ::ZstdCompressor, T) +function zuncompress(a, z::ZstdCompressor, T) result = decode(z.config.codec, a) _reinterpret(Base.nonmissingtype(T),result) end From 6d7dc21d0ac196c5c8fda8ef06e3bbaf473981c5 Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Mon, 31 Mar 2025 23:56:06 -0400 Subject: [PATCH 17/39] Prototype Zarr v3 support --- src/Compressors/Compressors.jl | 6 +- src/Compressors/zstd.jl | 2 +- src/Storage/Storage.jl | 8 +- src/Storage/versionedstore.jl | 9 ++ src/ZArray.jl | 1 + src/ZGroup.jl | 10 +- src/Zarr.jl | 1 + src/metadata.jl | 24 +++- src/metadata3.jl | 212 +++++++++++++++++++++++++++++++++ test/runtests.jl | 4 +- 10 files changed, 267 insertions(+), 10 deletions(-) create mode 100644 src/metadata3.jl diff --git a/src/Compressors/Compressors.jl b/src/Compressors/Compressors.jl index 18541286..58a80109 100644 --- a/src/Compressors/Compressors.jl +++ b/src/Compressors/Compressors.jl @@ -52,7 +52,9 @@ include("zstd.jl") # ## Fallback definitions for the compressor interface # Define fallbacks and generic methods for the compressor interface -getCompressor(compdict::Dict) = getCompressor(compressortypes[compdict["id"]],compdict) +getCompressor(compdict::Dict) = haskey(compdict, "id") ? 
+ getCompressor(compressortypes[compdict["id"]], compdict) : + getCompressor(compressortypes[compdict["name"]], compdict["configuration"]) getCompressor(::Nothing) = NoCompressor() # Compression when no filter is given @@ -104,4 +106,4 @@ end JSON.lower(::NoCompressor) = nothing -compressortypes[nothing] = NoCompressor \ No newline at end of file +compressortypes[nothing] = NoCompressor diff --git a/src/Compressors/zstd.jl b/src/Compressors/zstd.jl index 0f28ec47..937736f4 100644 --- a/src/Compressors/zstd.jl +++ b/src/Compressors/zstd.jl @@ -46,4 +46,4 @@ end JSON.lower(z::ZstdCompressor) = Dict("id"=>"zstd", "level" => z.config.compressionLevel, "checksum" => z.config.checksum) -Zarr.compressortypes["zstd"] = ZstdCompressor \ No newline at end of file +Zarr.compressortypes["zstd"] = ZstdCompressor diff --git a/src/Storage/Storage.jl b/src/Storage/Storage.jl index 30ff1e15..c76f2cbd 100644 --- a/src/Storage/Storage.jl +++ b/src/Storage/Storage.jl @@ -108,8 +108,12 @@ function writeattrs(s::AbstractStore, p, att::Dict; indent_json::Bool= false) att end -is_zgroup(s::AbstractStore, p) = isinitialized(s,_concatpath(p,".zgroup")) -is_zarray(s::AbstractStore, p) = isinitialized(s,_concatpath(p,".zarray")) +is_zarr3(s::AbstractStore, p) = isinitialized(s,_concatpath(p,"zarr.json")) +is_zarr2(s::AbstractStore, p) = is_z2array(s, p) || is_z2group(s,p) +is_zgroup(s::AbstractStore, p) = is_z2group(s,p) +is_zarray(s::AbstractStore, p) = is_z2array(s,p) +is_z2group(s::AbstractStore, p) = isinitialized(s,_concatpath(p,".zgroup")) +is_z2array(s::AbstractStore, p) = isinitialized(s,_concatpath(p,".zarray")) isinitialized(s::AbstractStore, p, i::CartesianIndex)=isinitialized(s,p,citostring(i)) isinitialized(s::AbstractStore, p, i) = isinitialized(s,_concatpath(p,i)) diff --git a/src/Storage/versionedstore.jl b/src/Storage/versionedstore.jl index 45f2bad8..92900500 100644 --- a/src/Storage/versionedstore.jl +++ b/src/Storage/versionedstore.jl @@ -133,3 +133,12 @@ 
dimension_separator(::AbstractStore) = DS dimension_separator(::VersionedStore{<: Any,S}) where S = S zarr_format(::AbstractStore) = DV zarr_format(::VersionedStore{V}) where V = V + +is_zgroup(s::VersionedStore{3}, p, metadata=getmetadata(s, p, false)) = + isinitialized(s,_concatpath(p,"zarr.json")) && + metadata.node_type == "group" +is_zarray(s::VersionedStore{3}, p, metadata=getmetadata(s, p, false)) = + isinitialized(s,_concatpath(p,"zarr.json")) && + metadata.node_type == "array" + +getmetadata(s::VersionedStore{3}, p,fill_as_missing) = Metadata(String(maybecopy(s[p,"zarr.json"])),fill_as_missing) diff --git a/src/ZArray.jl b/src/ZArray.jl index 961c798f..d0623a2a 100644 --- a/src/ZArray.jl +++ b/src/ZArray.jl @@ -360,6 +360,7 @@ function zcreate(::Type{T},storage::AbstractStore, T2 = (fill_value === nothing || !fill_as_missing) ? T : Union{T,Missing} metadata = Metadata{T2, N, C, typeof(filters), dimension_separator}( 2, + "array", dims, chunks, typestr(T), diff --git a/src/ZGroup.jl b/src/ZGroup.jl index e4d2100b..8d30d43a 100644 --- a/src/ZGroup.jl +++ b/src/ZGroup.jl @@ -21,6 +21,11 @@ function ZGroup(s::T,mode="r",path="";fill_as_missing=false) where T <: Abstract for d in subdirs(s,path) dshort = split(d,'/')[end] subpath = _concatpath(path,dshort) + if is_zarr2(s, subpath) + # check for zarr2 first + elseif is_zarr3(s, subpath) + s = set_zarr_format(s, 3) + end if is_zarray(s, subpath) meta = getmetadata(s, subpath, false) if dimension_separator(s) != meta.dimension_separator @@ -43,7 +48,7 @@ end Works like `zopen` with the single difference that no error is thrown when the path or store does not point to a valid zarr array or group, but nothing -is returned instead. +is returned instead. 
""" function zopen_noerr(s::AbstractStore, mode="r"; consolidated = false, @@ -127,6 +132,9 @@ function storefromstring(s, create=true) elseif isdir(s) # parse metadata to determine store kind temp_store = DirectoryStore(s) + if is_zarr3(temp_store, "") + temp_store = set_zarr_format(temp_store, 3) + end if is_zarray(temp_store, "") meta = getmetadata(temp_store, "", false) store = VersionedStore{meta.zarr_format, meta.dimension_separator}(temp_store) diff --git a/src/Zarr.jl b/src/Zarr.jl index dbdeb9a9..ee40f3bf 100644 --- a/src/Zarr.jl +++ b/src/Zarr.jl @@ -4,6 +4,7 @@ import JSON import Blosc include("metadata.jl") +include("metadata3.jl") include("Compressors/Compressors.jl") include("Storage/Storage.jl") include("Filters/Filters.jl") diff --git a/src/metadata.jl b/src/metadata.jl index fa564e37..ff90adce 100644 --- a/src/metadata.jl +++ b/src/metadata.jl @@ -104,6 +104,7 @@ https://zarr.readthedocs.io/en/stable/spec/v2.html#metadata """ struct Metadata{T, N, C, F, S} zarr_format::Int + node_type::String shape::Base.RefValue{NTuple{N, Int}} chunks::NTuple{N, Int} dtype::String # structured data types not yet supported @@ -111,17 +112,21 @@ struct Metadata{T, N, C, F, S} fill_value::Union{T, Nothing} order::Char filters::F # not yet supported - function Metadata{T2, N, C, F, S}(zarr_format, shape, chunks, dtype, compressor, fill_value, order, filters) where {T2,N,C,F,S} + function Metadata{T2, N, C, F, S}(zarr_format, node_type, shape, chunks, dtype, compressor, fill_value, order, filters) where {T2,N,C,F,S} #We currently only support version - zarr_format == 2 || throw(ArgumentError("Zarr.jl currently only support v2 of the protocol")) + # zarr_format == 2 || throw(ArgumentError("Zarr.jl currently only support v2 of the protocol")) + zarr_format == 3 ? @warn("Zarr v3 support is experimental") : + zarr_format == 2 ? 
nothing : + throw(ArgumentError("Zarr.jl currently only supports v2 or v3 of the specification")) #Do some sanity checks to make sure we have a sane array any(<(0), shape) && throw(ArgumentError("Size must be positive")) any(<(1), chunks) && throw(ArgumentError("Chunk size must be >= 1 along each dimension")) order === 'C' || throw(ArgumentError("Currently only 'C' storage order is supported")) - new{T2, N, C, F, S}(zarr_format, Base.RefValue{NTuple{N,Int}}(shape), chunks, dtype, compressor,fill_value, order, filters) + new{T2, N, C, F, S}(zarr_format, node_type, Base.RefValue{NTuple{N,Int}}(shape), chunks, dtype, compressor,fill_value, order, filters) end function Metadata{T2, N, C, F}( zarr_format, + node_type, shape, chunks, dtype, @@ -133,6 +138,7 @@ struct Metadata{T, N, C, F, S} ) where {T2,N,C,F} return Metadata{T2, N, C, F, dimension_separator}( zarr_format, + node_type, shape, chunks, dtype, @@ -158,6 +164,7 @@ Base.propertynames(m::Metadata) = (fieldnames(Metadata)..., :dimension_separator import Base.== function ==(m1::Metadata, m2::Metadata) m1.zarr_format == m2.zarr_format && + m1.node_type == m2.node_type && m1.shape[] == m2.shape[] && m1.chunks == m2.chunks && m1.dtype == m2.dtype && @@ -172,6 +179,7 @@ end "Construct Metadata based on your data" function Metadata(A::AbstractArray{T, N}, chunks::NTuple{N, Int}; zarr_format::Integer=2, + node_type::String="array", compressor::C=BloscCompressor(), fill_value::Union{T, Nothing}=nothing, order::Char='C', @@ -182,6 +190,7 @@ function Metadata(A::AbstractArray{T, N}, chunks::NTuple{N, Int}; T2 = (fill_value === nothing || !fill_as_missing) ? 
T : Union{T,Missing} Metadata{T2, N, C, typeof(filters), dimension_separator}( zarr_format, + node_type, size(A), chunks, typestr(eltype(A)), @@ -198,6 +207,13 @@ Metadata(s::Union{AbstractString, IO},fill_as_missing) = Metadata(JSON.parse(s), function Metadata(d::AbstractDict, fill_as_missing) # create a Metadata struct from it + if d["zarr_format"] == 3 + return Metadata3(d, fill_as_missing) + end + + # Zarr v2 metadata is only for arrays + node_type = "array" + compdict = d["compressor"] if isnothing(compdict) # try the last filter, for Kerchunk compat @@ -222,6 +238,7 @@ function Metadata(d::AbstractDict, fill_as_missing) Metadata{TU, N, C, F, S}( d["zarr_format"], + node_type, NTuple{N, Int}(d["shape"]) |> reverse, NTuple{N, Int}(d["chunks"]) |> reverse, d["dtype"], @@ -236,6 +253,7 @@ end function JSON.lower(md::Metadata) Dict{String, Any}( "zarr_format" => md.zarr_format, + "node_type" => md.node_type, "shape" => md.shape[] |> reverse, "chunks" => md.chunks |> reverse, "dtype" => md.dtype, diff --git a/src/metadata3.jl b/src/metadata3.jl new file mode 100644 index 00000000..0db438ac --- /dev/null +++ b/src/metadata3.jl @@ -0,0 +1,212 @@ +""" +Prototype Zarr version 3 support +""" + +const typemap3 = Dict{String, DataType}() +foreach([Bool, Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64, Float16, Float32, Float64]) do t + typemap3[lowercase(string(t))] = t +end +typemap3["complex64"] = ComplexF32 +typemap3["complex128"] = ComplexF64 + +function typestr3(s::AbstractString, codecs=nothing) + if !haskey(typemap3, s) + if startswith(s, "r") + num_bits = tryparse(Int, s[2:end]) + if isnothing(num_bits) + raise(ArgumentError("$s is not a known type")) + end + if mod(num_bits, 8) == 0 + return NTuple{num_bits÷8,UInt8} + else + raise(ArgumentError("$s must describe a raw type with bit size that is a multiple of 8 bits")) + end + end + end + return typemap3[s] +end + +function check_keys(d::AbstractDict, keys) + for key in keys + if !haskey(d, key) + 
throw(ArgumentError("Zarr v3 metadata must have a key called $key")) + end + end +end + +function Metadata3(d::AbstractDict, fill_as_missing) + check_keys(d, ("zarr_format", "node_type")) + + zarr_format = d["zarr_format"]::Int + + node_type = d["node_type"]::String + if node_type ∉ ("group", "array") + throw(ArgumentError("Unknown node_type of $node_type")) + end + + zarr_format == 3 || throw(ArgumentError("Metadata3 only functions if zarr_format == 3")) + + # Groups + if node_type == "group" + # Groups only need zarr_format and node_type + # Optionally they can have attributes + for key in keys(d) + if key ∉ ("zarr_format", "node_type", "attributes") + throw(ArgumentError("Zarr v3 group metadata cannot have a key called $key")) + end + end + + return Metadata{Int,0,Nothing,Nothing,'/'}(zarr_format, node_type, (), (), "", nothing, 0, 'C', nothing) + end + + # Array keys + mandatory_keys = [ + "zarr_format", + "node_type", + "shape", + "data_type", + "chunk_grid", + "chunk_key_encoding", + "fill_value", + "codecs", + ] + optional_keys = [ + "attributes", + "storage_transformers", + "dimension_names", + ] + + check_keys(d, mandatory_keys) + for key in keys(d) + if key ∉ mandatory_keys && key ∉ optional_keys + throw(ArgumentError("Zarr v3 metadata cannot have a key called $key")) + end + end + + # Shape + shape = Int.(d["shape"]) + + # Datatype + data_type = d["data_type"]::String + + # Chunk Grid + chunk_grid = d["chunk_grid"] + if chunk_grid["name"] == "regular" + chunks = Int.(chunk_grid["configuration"]["chunk_shape"]) + if length(shape) != length(chunks) + throw(ArgumentError("Shape has rank $(length(shape)) which does not match the chunk_shape rank of $(length(chunk_shape))")) + end + else + throw(ArgumentError("Unknown chunk_grid of name, $(chunk_grid["name"])")) + end + + # Chunk Key Encoding + chunk_key_encoding = d["chunk_key_encoding"] + if chunk_key_encoding["name"] == "default" + elseif chunk_key_encoding["name"] == "v2" + # TODO: Implement v2 
chunk_key_encoding by creating a chunk_key_encoding wrapper + throw(ArgumentError("Unknown v2 chunk_key_encoding is unimplemented")) + else + throw(ArgumentError("Unknown chunk_key_encoding of name, $(chunk_key_encoding["name"])")) + end + + + # Codecs + compdict = nothing + + # For transpose codec permutation tracking + default_dim_perm = Tuple(1:length(shape)) + dim_perm = default_dim_perm + + codec_data_type = :array + + function check_codec_data_type(from, to) + codec_data_type == from || + throw(ArgumentError("$codec_name found by codec_data_type is $codec_data_type")) + codec_data_type = to + return nothing + end + + for codec in d["codecs"] + codec_name = codec["name"] + if codec_name == "bytes" + # array -> bytes + check_codec_data_type(:array, :bytes) + codec["configuration"]["endian"] == "little" || + throw(ArgumentError("Zarr.jl currently only supports little endian for the bytes codec")) + elseif codec_name == "zstd" + # bytes -> bytes + check_codec_data_type(:bytes, :bytes) + compdict = codec + elseif codec_name == "blosc" + # bytes -> bytes + check_codec_data_type(:bytes, :bytes) + compdict = codec + elseif codec_name == "gzip" + # bytes -> bytes + check_codec_data_type(:bytes, :bytes) + compdict = codec + elseif codec_name == "transpose" + # array -> array + check_codec_data_type(:array, :array) + _dim_order = codec["configuration"]["order"] + if _dim_order == "C" + @warn "Transpose codec dimension order of $_dim_order is deprecated" + _dim_order = 1:length(shape) + elseif _dim_order == "F" + @warn "Transpose codec dimension order of $_dim_order is deprecated" + _dim_order = reverse(1:length(shape)) + else + _dim_order = Int.(codec["configuration"]["order"]) .+ 1 + end + dim_perm = dim_perm[_dim_order] + elseif codec_name == "sharding_indexed" + # array -> bytes + check_codec_data_type(:array, :bytes) + throw(ArgumentError("Zarr.jl currently does not support the $(codec["name"]) codec")) + elseif codec_name == "crc32c" + # bytes -> bytes + 
check_codec_data_type(:bytes, :bytes) + throw(ArgumentError("Zarr.jl currently does not support the $(codec["name"]) codec")) + else + throw(ArgumentError("Zarr.jl currently does not support the $(codec["name"]) codec")) + end + end + + if dim_perm == default_dim_perm + order = 'C' + elseif dim_perm == reverse(default_dim_perm) + order = 'F' + else + throw(ArgumentError("Dimension permutation of $dim_perm is not implemented")) + end + + compressor = getCompressor(compdict) + + # Filters (NOT IMPLEMENTED) + filters = getfilters(d) + + # Type Parameters + T = typestr3(data_type) + N = length(shape) + C = typeof(compressor) + F = typeof(filters) + + fv = fill_value_decoding(d["fill_value"]::Int, T) + + TU = (fv === nothing || !fill_as_missing) ? T : Union{T,Missing} + + S = only(get(d, "dimension_separator", '/')) + + Metadata{TU, N, C, F, S}( + zarr_format, + node_type, + NTuple{N, Int}(shape) |> reverse, + NTuple{N, Int}(chunks) |> reverse, + data_type, + compressor, + fv, + order, + filters, + ) +end diff --git a/test/runtests.jl b/test/runtests.jl index ad434fe1..21f616f2 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -21,6 +21,8 @@ CondaPkg.add("zarr"; version="2.*") @test length(z.storage.a) === 3 @test length(z.storage.a["0.0"]) === 64 @test eltype(z.storage.a["0.0"]) === UInt8 + @test z.metadata.zarr_format === 2 + @test z.metadata.node_type === "array" @test z.metadata.shape[] === (2, 3) @test z.metadata.order === 'C' @test z.metadata.chunks === (2, 3) @@ -36,7 +38,6 @@ CondaPkg.add("zarr"; version="2.*") @test :dimension_separator ∈ propertynames(z.metadata) @test_throws ArgumentError zzeros(Int64,2,3, chunks = (0,1)) @test_throws ArgumentError zzeros(Int64,0,-1) - @test_throws ArgumentError Zarr.Metadata(zeros(2,2), (2,2), zarr_format = 3) @test_throws ArgumentError Zarr.Metadata(zeros(2,2), (2,2), order = 'F') end @@ -75,6 +76,7 @@ CondaPkg.add("zarr"; version="2.*") "shape" => Any[3, 2], "order" => "C", "zarr_format" => 2, + "node_type" => 
"array", "chunks" => Any[3, 2], "fill_value" => nothing, "compressor" => nothing, From b39445781c8c7a505b7ce2ea7c0b38571160bb09 Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Tue, 1 Apr 2025 00:19:52 -0400 Subject: [PATCH 18/39] Modify tutorial to match current storage display --- docs/src/tutorial.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/tutorial.md b/docs/src/tutorial.md index 6b9f6b37..ee705650 100644 --- a/docs/src/tutorial.md +++ b/docs/src/tutorial.md @@ -197,7 +197,7 @@ Order : C Read-Only : false Compressor : Zarr.BloscCompressor(0, 3, "zstd", 1) Filters : nothing -Store type : Dictionary Storage +Store type : Zarr.VersionedStore{2, '.', Zarr.DictStore}(Dictionary Storage) No. bytes : 400000000 No. bytes stored : 2412289 Storage ratio : 165.81761140559857 From 8e71a3348ffd80dedc0b8e3eb34613ed296551aa Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Thu, 8 May 2025 10:12:09 -0400 Subject: [PATCH 19/39] Ensure configuration key exists --- src/metadata3.jl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/metadata3.jl b/src/metadata3.jl index 0db438ac..48345a57 100644 --- a/src/metadata3.jl +++ b/src/metadata3.jl @@ -132,8 +132,10 @@ function Metadata3(d::AbstractDict, fill_as_missing) if codec_name == "bytes" # array -> bytes check_codec_data_type(:array, :bytes) - codec["configuration"]["endian"] == "little" || - throw(ArgumentError("Zarr.jl currently only supports little endian for the bytes codec")) + if haskey(codec, "configuration") + codec["configuration"]["endian"] == "little" || + throw(ArgumentError("Zarr.jl currently only supports little endian for the bytes codec")) + end elseif codec_name == "zstd" # bytes -> bytes check_codec_data_type(:bytes, :bytes) From 08288fda4e3d2f7281e9ad2ac8bbb22d6042c678 Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Mon, 19 May 2025 04:14:10 -0400 Subject: [PATCH 20/39] Change VersionedStore to FormattedStore --- src/Storage/Storage.jl 
| 2 +- src/Storage/formattedstore.jl | 174 ++++++++++++++++++++++++++++++++++ src/Storage/http.jl | 2 +- src/Storage/versionedstore.jl | 135 -------------------------- src/ZArray.jl | 4 +- src/ZGroup.jl | 6 +- test/runtests.jl | 6 +- 7 files changed, 184 insertions(+), 145 deletions(-) create mode 100644 src/Storage/formattedstore.jl delete mode 100644 src/Storage/versionedstore.jl diff --git a/src/Storage/Storage.jl b/src/Storage/Storage.jl index 30ff1e15..0d17bb1a 100644 --- a/src/Storage/Storage.jl +++ b/src/Storage/Storage.jl @@ -197,7 +197,7 @@ isemptysub(s::AbstractStore, p) = isempty(subkeys(s,p)) && isempty(subdirs(s,p)) #during auto-check of storage format when doing zopen storageregexlist = Pair[] -include("versionedstore.jl") +include("formattedstore.jl") include("directorystore.jl") include("dictstore.jl") include("s3store.jl") diff --git a/src/Storage/formattedstore.jl b/src/Storage/formattedstore.jl new file mode 100644 index 00000000..b60ab2da --- /dev/null +++ b/src/Storage/formattedstore.jl @@ -0,0 +1,174 @@ +# Default Zarr version +const DV = 2 + +# Default Zarr separator + +# Default Zarr v2 separator +const DS2 = '.' +# Default Zarr v3 separator +const DS3 = '/' + +default_sep(version) = version == 2 ? DS2 : + version == 3 ? DS3 : + error("Unknown version: $version") +const DS = default_sep(DV) + +# Chunk Key Encodings for Zarr v3 +# A Char is the separator for the default chunk key encoding +abstract type ChunkKeyEncoding end +struct V2ChunkKeyEncoding{SEP} <: ChunkKeyEncoding end + +""" + FormattedStore{V,CKE,STORE <: AbstractStore} <: AbstractStore + +FormattedStore wraps an AbstractStore to indicate a specific Zarr format. +The path of a chunk depends on the version and chunk key encoding. + +# Type Parameters + +- V: Zarr format version +- CKE: Chunk key encoding or dimension separator. + CKE could be a `Char` or a subtype of `ChunkKeyEncoding`. +- STORE: Type of AbstractStore wrapped + +# Chunk Path Formats + +## Zarr version 2 + +### '.' 
dimension separator (default) + +Chunks are encoded as "1.2.3" + +### '/' dimension separator + +Chunks are encoded as "1/2/3" + +## Zarr version 3 + +### '/' dimension separator (default) + +Chunks are encoded as "c/1/2/3" + +### '.' dimension separator + +Chunks are encoded as "c.1.2.3" + +### V2ChunkKeyEncoding{SEP} + +See Zarr version 2 +""" +struct FormattedStore{V,SEP,STORE <: AbstractStore} <: AbstractStore + parent::STORE +end +FormattedStore(args...) = FormattedStore{DV,DS}(args...) +FormattedStore(s::FormattedStore) = s +FormattedStore{V}(args...) where V = FormattedStore{V, default_sep(V)}(args...) +FormattedStore{V}(s::FormattedStore{<:Any,S}) where {V,S} = FormattedStore{V, S}(s) +FormattedStore{<: Any, S}(args...) where S = FormattedStore{DV, S}(args...) +FormattedStore{<: Any, S}(s::FormattedStore{V}) where {V,S} = FormattedStore{V, S}(s) +function FormattedStore{V,S}(store::AbstractStore) where {V,S} + return FormattedStore{V,S,typeof(store)}(store) +end +function FormattedStore{V,S}(store::FormattedStore) where {V,S} + p = parent(store) + return FormattedStore{V,S,typeof(p)}(p) +end + +Base.parent(store::FormattedStore) = store.parent + +@inline citostring(i::CartesianIndex, version::Int, sep::Char=default_sep(version)) = (version == 3 ? "c$sep" : "" ) * join(reverse((i - oneunit(i)).I), sep) +@inline citostring(::CartesianIndex{0}, version::Int, sep::Char=default_sep(version)) = (version == 3 ? 
"c$(sep)0" : "0" ) +@inline citostring(i::CartesianIndex, ::Int, ::Type{V2ChunkKeyEncoding{S}}) where S = citostring(i, 2, S) +citostring(i::CartesianIndex, s::FormattedStore{V, S}) where {V,S} = citostring(i, V, S) + +Base.getindex(s::FormattedStore, p, i::CartesianIndex) = s[p, citostring(i,s)] +Base.delete!(s::FormattedStore, p, i::CartesianIndex) = delete!(s, p, citostring(i,s)) +Base.setindex!(s::FormattedStore, v, p, i::CartesianIndex) = s[p, citostring(i,s)]=v + +isinitialized(s::FormattedStore, p, i::CartesianIndex) = isinitialized(s,p,citostring(i, s)) + +""" +- [`storagesize(d::AbstractStore, p::AbstractString)`](@ref storagesize) +- [`subdirs(d::AbstractStore, p::AbstractString)`](@ref subdirs) +- [`subkeys(d::AbstractStore, p::AbstractString)`](@ref subkeys) +- [`isinitialized(d::AbstractStore, p::AbstractString)`](@ref isinitialized) +- [`storefromstring(::Type{<: AbstractStore}, s, _)`](@ref storefromstring) +- `Base.getindex(d::AbstractStore, i::AbstractString)`: return the data stored in key `i` as a Vector{UInt8} +- `Base.setindex!(d::AbstractStore, v, i::AbstractString)`: write the values in `v` to the key `i` of the given store `d` +""" + +storagesize(d::FormattedStore, p::AbstractString) = storagesize(parent(d), p) +subdirs(d::FormattedStore, p::AbstractString) = subdirs(parent(d), p) +subkeys(d::FormattedStore, p::AbstractString) = subkeys(parent(d), p) +isinitialized(d::FormattedStore, p::AbstractString) = isinitialized(parent(d), p) +storefromstring(::Type{FormattedStore{<: Any, <: Any, STORE}}, s, _) where STORE = FormattedStore{DV,DS}(storefromstring(STORE, s)) +storefromstring(::Type{FormattedStore{V,S}}, s, _) where {V,S} = FormattedStore{DV,DS}(storefromstring(s)) +storefromstring(::Type{FormattedStore{V,S,STORE}}, s, _) where {V,S,STORE} = FormattedStore{V,S,STORE}(storefromstring(STORE, s)) +Base.getindex(d::FormattedStore, i::AbstractString) = getindex(parent(d), i) +Base.setindex!(d::FormattedStore, v, i::AbstractString) = 
setindex!(parent(d), v, i) +Base.delete!(d::FormattedStore, i::AbstractString) = delete!(parent(d), i) + + +function Base.getproperty(store::FormattedStore{V,S}, sym::Symbol) where {V,S} + if sym == :dimension_separator + return S + elseif sym == :zarr_format + return V + elseif sym ∈ propertynames(getfield(store, :parent)) + # Support forwarding of properties to parent + return getproperty(store.parent, sym) + else + getfield(store, sym) + end +end +function Base.propertynames(store::FormattedStore) + return (:dimension_separator, :zarr_format, fieldnames(typeof(store))..., propertynames(store.parent)...) +end + + +""" + Zarr.set_dimension_separator(store::FormattedStore{V}, sep::Char)::FormattedStore{V,sep} + +Returns a FormattedStore of the same type with the same `zarr_format` parameter, `V`, +but with a dimension separator of `sep`. Note that this does not mutate the original store. + +# Examples + +``` +julia> Zarr.set_dimension_separator(Zarr.FormattedStore{2, '.'}(Zarr.DictStore(), '/')) |> typeof +Zarr.FormattedStore{2, '/',Zarr.DictStore} +``` + +""" +function set_dimension_separator(store::FormattedStore{V}, sep::Char) where V + return FormattedStore{V,sep}(store) +end +function set_dimension_separator(store::AbstractStore, sep::Char) + return FormattedStore{<: Any,sep}(store) +end + +""" + set_zarr_format(::FormattedStore{<: Any, S}, zarr_format::Int)::FormattedStore{zarr_format,S} + +Returns a FormattedStore of the same type with the same `dimension_separator` parameter, `S`, +but with the specified `zarr_format` parameter. Note that this does not mutate the original store. 
+ +# Examples + +``` +julia> Zarr.set_zarr_format(Zarr.FormattedStore{2, '.'}(Zarr.DictStore(), 3)) |> typeof +Zarr.FormattedStore{3, '.', DictStore} +``` + +""" +function set_zarr_format(store::FormattedStore{<: Any, S}, zarr_format::Int) where S + return FormattedStore{zarr_format,S}(store) +end +function set_zarr_format(store::AbstractStore, zarr_format::Int) + return FormattedStore{zarr_format}(store) +end + +dimension_separator(::AbstractStore) = DS +dimension_separator(::FormattedStore{<: Any,S}) where S = S +zarr_format(::AbstractStore) = DV +zarr_format(::FormattedStore{V}) where V = V diff --git a/src/Storage/http.jl b/src/Storage/http.jl index 1263f8c2..980284f2 100644 --- a/src/Storage/http.jl +++ b/src/Storage/http.jl @@ -47,7 +47,7 @@ function storefromstring(::Type{<:HTTPStore}, s,_) end if is_zarray(http_store, "") meta = getmetadata(http_store, "", false) - http_store = VersionedStore{meta.zarr_format, meta.dimension_separator}(http_store) + http_store = FormattedStore{meta.zarr_format, meta.dimension_separator}(http_store) end catch err @warn exception=err "Additional metadata was not available for HTTPStore." diff --git a/src/Storage/versionedstore.jl b/src/Storage/versionedstore.jl deleted file mode 100644 index 45f2bad8..00000000 --- a/src/Storage/versionedstore.jl +++ /dev/null @@ -1,135 +0,0 @@ -# Default Zarr version -const DV = 2 - -# Default Zarr separator - -# Default Zarr v2 separator -const DS2 = '.' -# Default Zarr v3 separator -const DS3 = '/' - -default_sep(version) = version == 2 ? DS2 : - version == 3 ? DS3 : - error("Unknown version: $version") -const DS = default_sep(DV) - -# Chunk Key Encodings for Zarr v3 -# A Char is the separator for the default chunk key encoding -struct V2ChunkKeyEncoding{SEP} end - -# Version store differentiates between Zarr format versions -struct VersionedStore{V,SEP,STORE <: AbstractStore} <: AbstractStore - parent::STORE -end -VersionedStore(args...) = VersionedStore{DV,DS}(args...) 
-VersionedStore(s::VersionedStore) = s -VersionedStore{V}(args...) where V = VersionedStore{V, default_sep(V)}(args...) -VersionedStore{V}(s::VersionedStore{<:Any,S}) where {V,S} = VersionedStore{V, S}(s) -VersionedStore{<: Any, S}(args...) where S = VersionedStore{DV, S}(args...) -VersionedStore{<: Any, S}(s::VersionedStore{V}) where {V,S} = VersionedStore{V, S}(s) -function VersionedStore{V,S}(store::AbstractStore) where {V,S} - return VersionedStore{V,S,typeof(store)}(store) -end -function VersionedStore{V,S}(store::VersionedStore) where {V,S} - p = parent(store) - return VersionedStore{V,S,typeof(p)}(p) -end - -Base.parent(store::VersionedStore) = store.parent - -@inline citostring(i::CartesianIndex, version::Int, sep::Char=default_sep(version)) = (version == 3 ? "c$sep" : "" ) * join(reverse((i - oneunit(i)).I), sep) -@inline citostring(::CartesianIndex{0}, version::Int, sep::Char=default_sep(version)) = (version == 3 ? "c$(sep)0" : "0" ) -@inline citostring(i::CartesianIndex, ::Int, ::Type{V2ChunkKeyEncoding{S}}) where S = citostring(i, 2, S) -citostring(i::CartesianIndex, s::VersionedStore{V, S}) where {V,S} = citostring(i, V, S) - -Base.getindex(s::VersionedStore, p, i::CartesianIndex) = s[p, citostring(i,s)] -Base.delete!(s::VersionedStore, p, i::CartesianIndex) = delete!(s, p, citostring(i,s)) -Base.setindex!(s::VersionedStore, v, p, i::CartesianIndex) = s[p, citostring(i,s)]=v - -isinitialized(s::VersionedStore, p, i::CartesianIndex) = isinitialized(s,p,citostring(i, s)) - -""" -- [`storagesize(d::AbstractStore, p::AbstractString)`](@ref storagesize) -- [`subdirs(d::AbstractStore, p::AbstractString)`](@ref subdirs) -- [`subkeys(d::AbstractStore, p::AbstractString)`](@ref subkeys) -- [`isinitialized(d::AbstractStore, p::AbstractString)`](@ref isinitialized) -- [`storefromstring(::Type{<: AbstractStore}, s, _)`](@ref storefromstring) -- `Base.getindex(d::AbstractStore, i::AbstractString)`: return the data stored in key `i` as a Vector{UInt8} -- 
`Base.setindex!(d::AbstractStore, v, i::AbstractString)`: write the values in `v` to the key `i` of the given store `d` -""" - -storagesize(d::VersionedStore, p::AbstractString) = storagesize(parent(d), p) -subdirs(d::VersionedStore, p::AbstractString) = subdirs(parent(d), p) -subkeys(d::VersionedStore, p::AbstractString) = subkeys(parent(d), p) -isinitialized(d::VersionedStore, p::AbstractString) = isinitialized(parent(d), p) -storefromstring(::Type{VersionedStore{<: Any, <: Any, STORE}}, s, _) where STORE = VersionedStore{DV,DS}(storefromstring(STORE, s)) -storefromstring(::Type{VersionedStore{V,S}}, s, _) where {V,S} = VersionedStore{DV,DS}(storefromstring(s)) -storefromstring(::Type{VersionedStore{V,S,STORE}}, s, _) where {V,S,STORE} = VersionedStore{V,S,STORE}(storefromstring(STORE, s)) -Base.getindex(d::VersionedStore, i::AbstractString) = getindex(parent(d), i) -Base.setindex!(d::VersionedStore, v, i::AbstractString) = setindex!(parent(d), v, i) -Base.delete!(d::VersionedStore, i::AbstractString) = delete!(parent(d), i) - - -function Base.getproperty(store::VersionedStore{V,S}, sym::Symbol) where {V,S} - if sym == :dimension_separator - return S - elseif sym == :zarr_format - return V - elseif sym ∈ propertynames(getfield(store, :parent)) - # Support forwarding of properties to parent - return getproperty(store.parent, sym) - else - getfield(store, sym) - end -end -function Base.propertynames(store::VersionedStore) - return (:dimension_separator, :zarr_format, fieldnames(typeof(store))..., propertynames(store.parent)...) -end - - -""" - Zarr.set_dimension_separator(store::VersionedStore{V}, sep::Char)::VersionedStore{V,sep} - -Returns a VersionedStore of the same type with the same `zarr_format` parameter, `V`, -but with a dimension separator of `sep`. Note that this does not mutate the original store. 
- -# Examples - -``` -julia> Zarr.set_dimension_separator(Zarr.VersionedStore{2, '.'}(Zarr.DictStore(), '/')) |> typeof -Zarr.VersionedStore{2, '/',Zarr.DictStore} -``` - -""" -function set_dimension_separator(store::VersionedStore{V}, sep::Char) where V - return VersionedStore{V,sep}(store) -end -function set_dimension_separator(store::AbstractStore, sep::Char) - return VersionedStore{<: Any,sep}(store) -end - -""" - set_zarr_format(::VersionedStore{<: Any, S}, zarr_format::Int)::VersionedStore{zarr_format,S} - -Returns a VersionedStore of the same type with the same `dimension_separator` parameter, `S`, -but with the specified `zarr_format` parameter. Note that this does not mutate the original store. - -# Examples - -``` -julia> Zarr.set_zarr_format(Zarr.VersionedStore{2, '.'}(Zarr.DictStore(), 3)) |> typeof -Zarr.VersionedStore{3, '.', DictStore} -``` - -""" -function set_zarr_format(store::VersionedStore{<: Any, S}, zarr_format::Int) where S - return VersionedStore{zarr_format,S}(store) -end -function set_zarr_format(store::AbstractStore, zarr_format::Int) - return VersionedStore{zarr_format}(store) -end - -dimension_separator(::AbstractStore) = DS -dimension_separator(::VersionedStore{<: Any,S}) where S = S -zarr_format(::AbstractStore) = DV -zarr_format(::VersionedStore{V}) where V = V diff --git a/src/ZArray.jl b/src/ZArray.jl index 961c798f..951639f8 100644 --- a/src/ZArray.jl +++ b/src/ZArray.jl @@ -326,9 +326,9 @@ function zcreate(::Type{T}, dims::Integer...; end if path===nothing - store = VersionedStore{DV, dimension_separator}(DictStore()) + store = FormattedStore{DV, dimension_separator}(DictStore()) else - store = VersionedStore{DV, dimension_separator}(DirectoryStore(joinpath(path,name))) + store = FormattedStore{DV, dimension_separator}(DirectoryStore(joinpath(path,name))) end zcreate(T, store, dims...; kwargs...) 
end diff --git a/src/ZGroup.jl b/src/ZGroup.jl index e4d2100b..0164096f 100644 --- a/src/ZGroup.jl +++ b/src/ZGroup.jl @@ -123,15 +123,15 @@ function storefromstring(s, create=true) end end if create - return VersionedStore(DirectoryStore(s)), "" + return FormattedStore(DirectoryStore(s)), "" elseif isdir(s) # parse metadata to determine store kind temp_store = DirectoryStore(s) if is_zarray(temp_store, "") meta = getmetadata(temp_store, "", false) - store = VersionedStore{meta.zarr_format, meta.dimension_separator}(temp_store) + store = FormattedStore{meta.zarr_format, meta.dimension_separator}(temp_store) else - store = VersionedStore(temp_store) + store = FormattedStore(temp_store) end return store, "" else diff --git a/test/runtests.jl b/test/runtests.jl index ad434fe1..c01f441e 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -15,7 +15,7 @@ CondaPkg.add("zarr"; version="2.*") @testset "fields" begin z = zzeros(Int64, 2, 3) @test z isa ZArray{Int64, 2, Zarr.BloscCompressor, - Zarr.VersionedStore{2, '.', Zarr.DictStore}} + Zarr.FormattedStore{2, '.', Zarr.DictStore}} @test :a ∈ propertynames(z.storage) @test length(z.storage.a) === 3 @@ -43,7 +43,7 @@ CondaPkg.add("zarr"; version="2.*") @testset "methods" begin z = zzeros(Int64, 2, 3) @test z isa ZArray{Int64, 2, Zarr.BloscCompressor, - Zarr.VersionedStore{2, '.', Zarr.DictStore}} + Zarr.FormattedStore{2, '.', Zarr.DictStore}} @test eltype(z) === Int64 @test ndims(z) === 2 @@ -63,7 +63,7 @@ CondaPkg.add("zarr"; version="2.*") compressor=Zarr.NoCompressor()) @test z.metadata.compressor === Zarr.NoCompressor() - @test z.storage === Zarr.VersionedStore{2 ,'.'}(Zarr.DirectoryStore("$dir/$name")) + @test z.storage === Zarr.FormattedStore{2 ,'.'}(Zarr.DirectoryStore("$dir/$name")) @test isdir("$dir/$name") @test ispath("$dir/$name/.zarray") @test ispath("$dir/$name/.zattrs") From 0046e14250566dee65c06159d437a8d8bc26cf3c Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Mon, 2 Jun 2025 07:58:58 -0400 
Subject: [PATCH 21/39] Add {get,write}attrs for FormattedStore{3} --- src/Storage/formattedstore.jl | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/src/Storage/formattedstore.jl b/src/Storage/formattedstore.jl index 51fc6c23..df3e01a7 100644 --- a/src/Storage/formattedstore.jl +++ b/src/Storage/formattedstore.jl @@ -181,3 +181,36 @@ is_zarray(s::FormattedStore{3}, p, metadata=getmetadata(s, p, false)) = metadata.node_type == "array" getmetadata(s::FormattedStore{3}, p,fill_as_missing) = Metadata(String(maybecopy(s[p,"zarr.json"])),fill_as_missing) + +function getattrs(s::FormattedStore{3}) + md = s[p,"zarr.json"] + if md === nothing + error("zarr.json not found") + else + md = JSON.parse(replace(String(maybecopy(md)),": NaN,"=>": \"NaN\",")) + return get(md, "attributes", Dict{String, Any}()) + end +end + +function writeattrs(s::FormattedStore{3}, p, att::Dict; indent_json::Bool= false) + # This is messy, we need to open zarr.json and replace the attributes section + md = s[p,"zarr.json"] + if md === nothing + error("zarr.json not found") + else + md = JSON.parse(replace(String(maybecopy(md)),": NaN,"=>": \"NaN\",")) + end + md = Dict(md) + md["attributes"] = att + + b = IOBuffer() + + if indent_json + JSON.print(b,att,4) + else + JSON.print(b,att) + end + + s[p,"zarr.json"] = take!(b) + att +end From 34afb272cc817da66e095a5cc241767b990ab5f6 Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Mon, 2 Jun 2025 08:13:45 -0400 Subject: [PATCH 22/39] Add separator function for V2ChunkKeyEncoding --- src/Storage/formattedstore.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Storage/formattedstore.jl b/src/Storage/formattedstore.jl index df3e01a7..efd27ec2 100644 --- a/src/Storage/formattedstore.jl +++ b/src/Storage/formattedstore.jl @@ -17,6 +17,8 @@ const DS = default_sep(DV) # A Char is the separator for the default chunk key encoding abstract type ChunkKeyEncoding end struct V2ChunkKeyEncoding{SEP} <: ChunkKeyEncoding 
end +separator(c::Char) = c +separator(v2cke::V2ChunkKeyEncoding{SEP}) where SEP = SEP """ FormattedStore{V,CKE,STORE <: AbstractStore} <: AbstractStore From 514ba87582b086cb67441bbcc4b953f36c6bceef Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Mon, 2 Jun 2025 10:28:54 -0400 Subject: [PATCH 23/39] Fix formattedstore, add writemetadata --- src/Storage/formattedstore.jl | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/Storage/formattedstore.jl b/src/Storage/formattedstore.jl index efd27ec2..750ec6ec 100644 --- a/src/Storage/formattedstore.jl +++ b/src/Storage/formattedstore.jl @@ -183,6 +183,18 @@ is_zarray(s::FormattedStore{3}, p, metadata=getmetadata(s, p, false)) = metadata.node_type == "array" getmetadata(s::FormattedStore{3}, p,fill_as_missing) = Metadata(String(maybecopy(s[p,"zarr.json"])),fill_as_missing) +function writemetadata(s::FormattedStore{3}, p, m::Metadata; indent_json::Bool= false) + met = IOBuffer() + + if indent_json + JSON.print(met,m,4) + else + JSON.print(met,m) + end + + s[p,"zarr.json"] = take!(met) + m +end function getattrs(s::FormattedStore{3}) md = s[p,"zarr.json"] @@ -208,9 +220,9 @@ function writeattrs(s::FormattedStore{3}, p, att::Dict; indent_json::Bool= false b = IOBuffer() if indent_json - JSON.print(b,att,4) + JSON.print(b,md,4) else - JSON.print(b,att) + JSON.print(b,md) end s[p,"zarr.json"] = take!(b) From 4ce5895d2e43534c36ddffe17871d745f9e8d089 Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Mon, 2 Jun 2025 10:29:29 -0400 Subject: [PATCH 24/39] Attempt to allow for Zarr v3 array creation TODO: Fix Zarr v3 type strings --- src/Compressors/Compressors.jl | 1 + src/Compressors/v3.jl | 58 ++++++++++++++++++++++ src/ZArray.jl | 13 +++-- src/ZGroup.jl | 2 +- src/metadata.jl | 3 ++ src/metadata3.jl | 89 ++++++++++++++++++++++++++++++++-- test/storage.jl | 6 +++ 7 files changed, 163 insertions(+), 9 deletions(-) create mode 100644 src/Compressors/v3.jl diff --git 
a/src/Compressors/Compressors.jl b/src/Compressors/Compressors.jl index 58a80109..c647eff4 100644 --- a/src/Compressors/Compressors.jl +++ b/src/Compressors/Compressors.jl @@ -49,6 +49,7 @@ const compressortypes = Dict{Union{String,Nothing}, Type{<: Compressor}}() include("blosc.jl") include("zlib.jl") include("zstd.jl") +include("v3.jl") # ## Fallback definitions for the compressor interface # Define fallbacks and generic methods for the compressor interface diff --git a/src/Compressors/v3.jl b/src/Compressors/v3.jl new file mode 100644 index 00000000..6ee82385 --- /dev/null +++ b/src/Compressors/v3.jl @@ -0,0 +1,58 @@ +""" + Compressor v3{C <: Compressor} <: Compressor + +Wrapper to indicate Zarr v3 of a compressor +""" +struct Compressor_v3{C} <: Compressor + parent::C +end +Base.parent(c::Compressor_v3) = c.parent + +function zuncompress(a, z::Compressor_v3, T) + zuncompress(a, parent(z), T) +end + +function zuncompress!(data::DenseArray, compressed, z::Compressor_v3) + zuncompress!(data, compressed, parent(z)) +end + +function zcompress(a, z::Compressor_v3) + zcompress(a, parent(z)) +end + + +function JSON.lower(c::Compressor_v3{BloscCompressor}) + p = parent(c) + return Dict( + "name" => "blosc", + "configuration" => Dict( + "cname" => p.cname, + "clevel" => p.clevel, + "shuffle" => p.shuffle, +# TODO: Evalute if we can encode typesize +# "typesize" => p.typesize, + "blocksize" => p.blocksize + ) + ) +end + +function JSON.lower(c::Compressor_v3{ZlibCompressor}) + p = parent(c) + return Dict( + "name" => "gzip", + "configuration" => Dict( + "level" => p.clevel + ) + ) +end + +function JSON.lower(c::Compressor_v3{ZstdCompressor}) + p = parent(c) + return Dict( + "name" => "zstd", + "configuration" => Dict( + "level" => p.config.compressionlevel, + "checksum" => p.config.checksum + ) + ) +end diff --git a/src/ZArray.jl b/src/ZArray.jl index 7430c40b..3dab94c3 100644 --- a/src/ZArray.jl +++ b/src/ZArray.jl @@ -302,6 +302,7 @@ Creates a new empty zarr array with 
element type `T` and array dimensions `dims` * `path=""` directory name to store a persistent array. If left empty, an in-memory array will be created * `name=""` name of the zarr array, defaults to the directory name +* `zarr_format`=$(DV) Zarr format version (2 or 3) * `storagetype` determines the storage to use, current options are `DirectoryStore` or `DictStore` * `chunks=dims` size of the individual array chunks, must be a tuple of length `length(dims)` * `fill_value=nothing` value to represent missing values @@ -316,7 +317,8 @@ Creates a new empty zarr array with element type `T` and array dimensions `dims` function zcreate(::Type{T}, dims::Integer...; name="", path=nothing, - dimension_separator='.', + zarr_format=DV, + dimension_separator=default_sep(zarr_format), kwargs... ) where T @@ -326,16 +328,17 @@ function zcreate(::Type{T}, dims::Integer...; end if path===nothing - store = FormattedStore{DV, dimension_separator}(DictStore()) + store = FormattedStore{zarr_format, dimension_separator}(DictStore()) else - store = FormattedStore{DV, dimension_separator}(DirectoryStore(joinpath(path,name))) + store = FormattedStore{zarr_format, dimension_separator}(DirectoryStore(joinpath(path,name))) end - zcreate(T, store, dims...; kwargs...) + zcreate(T, store, dims...; zarr_format, kwargs...) end function zcreate(::Type{T},storage::AbstractStore, dims...; path = "", + zarr_format = DV, chunks=dims, fill_value=nothing, fill_as_missing=false, @@ -359,7 +362,7 @@ function zcreate(::Type{T},storage::AbstractStore, C = typeof(compressor) T2 = (fill_value === nothing || !fill_as_missing) ? 
T : Union{T,Missing} metadata = Metadata{T2, N, C, typeof(filters), dimension_separator}( - 2, + zarr_format, "array", dims, chunks, diff --git a/src/ZGroup.jl b/src/ZGroup.jl index 2db2b003..0d748038 100644 --- a/src/ZGroup.jl +++ b/src/ZGroup.jl @@ -153,7 +153,7 @@ end Create a new zgroup in the store `s` """ function zgroup(s::AbstractStore, path::String=""; attrs=Dict(), indent_json::Bool= false) - d = Dict("zarr_format"=>2) + d = Dict("zarr_format"=>DV) isemptysub(s, path) || error("Store is not empty") b = IOBuffer() diff --git a/src/metadata.jl b/src/metadata.jl index ff90adce..530b2965 100644 --- a/src/metadata.jl +++ b/src/metadata.jl @@ -251,6 +251,9 @@ end "Describes how to lower Metadata to JSON, used in json(::Metadata)" function JSON.lower(md::Metadata) + if md.zarr_format == 3 + return lower3(md) + end Dict{String, Any}( "zarr_format" => md.zarr_format, "node_type" => md.node_type, diff --git a/src/metadata3.jl b/src/metadata3.jl index 48345a57..5bc0b92f 100644 --- a/src/metadata3.jl +++ b/src/metadata3.jl @@ -104,8 +104,6 @@ function Metadata3(d::AbstractDict, fill_as_missing) chunk_key_encoding = d["chunk_key_encoding"] if chunk_key_encoding["name"] == "default" elseif chunk_key_encoding["name"] == "v2" - # TODO: Implement v2 chunk_key_encoding by creating a chunk_key_encoding wrapper - throw(ArgumentError("Unknown v2 chunk_key_encoding is unimplemented")) else throw(ArgumentError("Unknown chunk_key_encoding of name, $(chunk_key_encoding["name"])")) end @@ -198,7 +196,16 @@ function Metadata3(d::AbstractDict, fill_as_missing) TU = (fv === nothing || !fill_as_missing) ? T : Union{T,Missing} - S = only(get(d, "dimension_separator", '/')) + cke_configuration = get(chunk_key_encoding, "configuration") do + Dict{String,Any} + end + # V2 uses '.' 
while default CKE uses '/' by default + if chunk_key_encoding["name"] == "v2" + separator = only(get(cke_configuration, "separator", '.')) + S = V2ChunkKeyEncoding{separator}() + elseif chunk_key_encoding["name"] == "default" + S = only(get(cke_configuration, "separator", '/')) + end Metadata{TU, N, C, F, S}( zarr_format, @@ -212,3 +219,79 @@ function Metadata3(d::AbstractDict, fill_as_missing) filters, ) end + +function lower3(md::Metadata) + md.zarr_format == 3 || throw(ArgumentError("lower3 only applies when zarr_format is 3")) + + mandatory_keys = [ + "zarr_format", + "node_type", + "shape", + "data_type", + "chunk_grid", + "chunk_key_encoding", + "fill_value", + "codecs", + ] + optional_keys = [ + "attributes", + "storage_transformers", + "dimension_names", + ] + + chunk_grid = Dict{String,Any}( + "name" => "regular", + "configuration" => Dict{String,Any}( + "chunk_shape" => md.chunks |> reverse + ) + ) + + chunk_key_encoding = Dict{String,Any}( + "name" => isa(md.dimension_separator, Char) ? "default" : + isa(md.dimension_separator, V2ChunkKeyEncoding) ? "v2" : + error("Unknown encoding for $(md.dimension_separator)"), + "configuration" => Dict{String,Any}( + "separator" => separator(md.dimension_separator) + ) + ) + + # TODO: Incorporate filters + codecs = Dict{String,Any}[] + + default_dim_perm = Tuple(0:length(md.shape[])-1) + + # Encode the order as a single transpose codec (array to array) + push!(codecs, + Dict{String,Any}( + "name" => "transpose", + "configuration" => Dict( + "order" => md.order == 'C' ? default_dim_perm : + md.order == 'F' ? 
reverse(default_dim_perm) : + error("Unable to encode order $(md.order)") + ) + ) + ) + + # Convert from array to bytes + push!(codecs, + Dict{String,Any}( + "name" => "bytes", + "configuration" => Dict{String, Any}( + "endian" => "little" + ) + ) + ) + # Compress bytes to bytes + push!(codecs, JSON.lower(Compressor_v3(md.compressor))) + + Dict{String, Any}( + "zarr_format" => md.zarr_format, + "node_type" => md.node_type, + "shape" => md.shape[] |> reverse, + "data_type" => typestr3(md.dtype), + "chunk_grid" => chunk_grid, + "chunk_key_encoding" => chunk_key_encoding, + "fill_value" => fill_value_encoding(md.fill_value), + "codecs" => codecs + ) +end diff --git a/test/storage.jl b/test/storage.jl index 1ef34d72..162d3017 100644 --- a/test/storage.jl +++ b/test/storage.jl @@ -186,6 +186,7 @@ end @testset "Minio S3 storage" begin + @info "Testing Minio S3 storage" A = fill(1.0, 30, 20) chunks = (5,10) metadata = Zarr.Metadata(A, chunks; fill_value=-1.5) @@ -206,6 +207,7 @@ end end @testset "AWS S3 Storage" begin + @info "Testing AWS S3 storage" Zarr.AWSS3.AWS.global_aws_config(Zarr.AWSS3.AWS.AWSConfig(creds=nothing, region="us-west-2")) S3, p = Zarr.storefromstring("s3://mur-sst/zarr-v1") @test Zarr.is_zgroup(S3, p) @@ -218,6 +220,7 @@ end end @testset "GCS Storage" begin + @info "Testing GCS storage" for s in ( "gs://cmip6/CMIP6/HighResMIP/CMCC/CMCC-CM2-HR4/highresSST-present/r1i1p1f1/6hrPlev/psl/gn/v20170706", "https://storage.googleapis.com/cmip6/CMIP6/HighResMIP/CMCC/CMCC-CM2-HR4/highresSST-present/r1i1p1f1/6hrPlev/psl/gn/v20170706", @@ -239,6 +242,7 @@ end end @testset "HTTP Storage" begin + @info "Testing HTTP Storage" s = Zarr.DictStore() g = zgroup(s, attrs = Dict("groupatt"=>5)) a = zcreate(Int,g,"a1",10,20,chunks=(5,5),attrs=Dict("arratt"=>2.5)) @@ -274,6 +278,7 @@ end end @testset "Zip Storage" begin + @info "Testing Zip Storage" s = Zarr.DictStore() g = zgroup(s, attrs = Dict("groupatt"=>5)) a = 
zcreate(Int,g,"a1",10,20,chunks=(5,5),attrs=Dict("arratt"=>2.5)) @@ -294,4 +299,5 @@ end Zarr.writezip(io, ds) Zarr.ZipStore(take!(io)) end + @info "Finished testing ZipStore" end From 3298a5cca340a7c2cf307dc7fd48be1941b3affa Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Mon, 2 Jun 2025 12:48:17 -0400 Subject: [PATCH 25/39] Fix Zarr v3 array creation --- src/ZArray.jl | 3 +++ src/metadata3.jl | 16 ++++++++++++---- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/src/ZArray.jl b/src/ZArray.jl index 3dab94c3..c9ebc87e 100644 --- a/src/ZArray.jl +++ b/src/ZArray.jl @@ -360,6 +360,9 @@ function zcreate(::Type{T},storage::AbstractStore, length(dims) == length(chunks) || throw(DimensionMismatch("Dims must have the same length as chunks")) N = length(dims) C = typeof(compressor) + if fill_value === nothing && zarr_format == 3 + fill_value = zero(T) + end T2 = (fill_value === nothing || !fill_as_missing) ? T : Union{T,Missing} metadata = Metadata{T2, N, C, typeof(filters), dimension_separator}( zarr_format, diff --git a/src/metadata3.jl b/src/metadata3.jl index 5bc0b92f..5509b2d7 100644 --- a/src/metadata3.jl +++ b/src/metadata3.jl @@ -9,6 +9,14 @@ end typemap3["complex64"] = ComplexF32 typemap3["complex128"] = ComplexF64 +function typestr3(t::Type) + return lowercase(string(t)) +end +# TODO: Check raw types +function typestr3(::Type{NTuple{N,UInt8}}) where {N} + return "r$(N*8)" +end + function typestr3(s::AbstractString, codecs=nothing) if !haskey(typemap3, s) if startswith(s, "r") @@ -192,7 +200,7 @@ function Metadata3(d::AbstractDict, fill_as_missing) C = typeof(compressor) F = typeof(filters) - fv = fill_value_decoding(d["fill_value"]::Int, T) + fv = fill_value_decoding(d["fill_value"], T)::T TU = (fv === nothing || !fill_as_missing) ? 
T : Union{T,Missing} @@ -220,7 +228,7 @@ function Metadata3(d::AbstractDict, fill_as_missing) ) end -function lower3(md::Metadata) +function lower3(md::Metadata{T}) where T md.zarr_format == 3 || throw(ArgumentError("lower3 only applies when zarr_format is 3")) mandatory_keys = [ @@ -288,10 +296,10 @@ function lower3(md::Metadata) "zarr_format" => md.zarr_format, "node_type" => md.node_type, "shape" => md.shape[] |> reverse, - "data_type" => typestr3(md.dtype), + "data_type" => typestr3(T), "chunk_grid" => chunk_grid, "chunk_key_encoding" => chunk_key_encoding, - "fill_value" => fill_value_encoding(md.fill_value), + "fill_value" => fill_value_encoding(md.fill_value)::T, "codecs" => codecs ) end From 646ba9c5340824ffbc091e187a20b4814af23f72 Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Mon, 2 Jun 2025 19:52:00 -0400 Subject: [PATCH 26/39] Implement CRC32c Zarr v3 codec --- Project.toml | 2 + src/Codecs/Codecs.jl | 49 ++++++++++++++++++++ src/Codecs/V3/V3.jl | 103 +++++++++++++++++++++++++++++++++++++++++++ src/Zarr.jl | 1 + 4 files changed, 155 insertions(+) create mode 100644 src/Codecs/Codecs.jl create mode 100644 src/Codecs/V3/V3.jl diff --git a/Project.toml b/Project.toml index 54db8d2e..46f8ece5 100644 --- a/Project.toml +++ b/Project.toml @@ -6,6 +6,7 @@ version = "0.9.4" [deps] AWSS3 = "1c724243-ef5b-51ab-93f4-b0a88ac62a95" Blosc = "a74b3585-a348-5f62-a45c-50e91977d574" +CRC32c = "8bf52ea8-c179-5cab-976a-9e18b702a9bc" ChunkCodecLibZstd = "55437552-ac27-4d47-9aa3-63184e8fd398" CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" @@ -23,6 +24,7 @@ ZipArchives = "49080126-0e18-4c2a-b176-c102e4b3760c" [compat] AWSS3 = "0.10, 0.11" Blosc = "0.5, 0.6, 0.7" +CRC32c = "1.11.0" ChunkCodecLibZstd = "0.1.1" CodecZlib = "0.6, 0.7" DataStructures = "0.17, 0.18" diff --git a/src/Codecs/Codecs.jl b/src/Codecs/Codecs.jl new file mode 100644 index 00000000..ec6e6205 --- /dev/null +++ b/src/Codecs/Codecs.jl @@ -0,0 
+1,49 @@ +module Codecs + +using JSON: JSON + +""" + abstract type Codec + +The abstract supertype for all Zarr codecs + +## Interface + +All subtypes of `Codec` SHALL implement the following methods: + +- `zencode(a, c::Codec)`: compress the array `a` using the codec `c`. +- `zdecode(a, c::Codec, T)`: decode the array `a` using the codec `c` + and return an array of type `T`. +- `JSON.lower(c::Codec)`: return a JSON representation of the codec `c`, which + follows the Zarr specification for that codec. +- `getCodec(::Type{<:Codec}, d::Dict)`: return a codec object from a given + dictionary `d` which contains the codec's parameters according to the Zarr spec. + +Subtypes of `Codec` MAY also implement the following methods: + +- `zencode!(encoded, data, c::Codec)`: encode the array `data` using the + codec `c` and store the result in the array `encoded`. +- `zdecode!(data, encoded, c::Codec)`: decode the array `encoded` + using the codec `c` and store the result in the array `data`. + +Finally, an entry MUST be added to the `VN.codectypes` dictionary for each codec type where N is the +Zarr format version. +This must also follow the Zarr specification's name for that compressor. The name of the compressor +is the key, and the value is the compressor type (e.g. `BloscCodec` or `NoCodec`). + +For example, the Blosc codec is named "blosc" in the Zarr spec, so the entry for [`BloscCodec`](@ref) +must be added to `codectypes` as `codectypes["blosc"] = BloscCodec`. 
+""" + +abstract type Codec end + +zencode(a, c::Codec) = error("Unimplemented") +zencode!(encoded, data, c::Codec) = error("Unimplemented") +zdecode(a, c::Codec, T::Type) = error("Unimplemented") +zdecode!(data, encoded, c::Codec) = error("Unimplemented") +JSON.lower(c::Codec) = error("Unimplemented") +getCodec(::Type{<:Codec}, d::Dict) = error("Unimplemented") + +include("V3/V3.jl") + +end diff --git a/src/Codecs/V3/V3.jl b/src/Codecs/V3/V3.jl new file mode 100644 index 00000000..710946c3 --- /dev/null +++ b/src/Codecs/V3/V3.jl @@ -0,0 +1,103 @@ +module V3Codecs + +import ..Codecs: zencode, zdecode, zencode!, zdecode! +using CRC32c: CRC32c + +abstract type V3Codec{In,Out} end +const codectypes = Dict{String, V3Codec}() + +@enum BloscCompressor begin + lz4 + lz4hc + blosclz + zstd + snappy + zlib +end + +@enum BloscShuffle begin + noshuffle + shuffle + bitshuffle +end + +struct BloscCodec <: V3Codec{:bytes, :bytes} + cname::BloscCompressor + clevel::Int64 + shuffle::BloscShuffle + typesize::UInt8 + blocksize::UInt +end +name(::BloscCodec) = "blosc" + +struct BytesCodec <: V3Codec{:array, :bytes} +end +name(::BytesCodec) = "bytes" + +struct CRC32cCodec <: V3Codec{:bytes, :bytes} +end +name(::CRC32cCodec) = "crc32c" + +struct GzipCodec <: V3Codec{:bytes, :bytes} +end +name(::GzipCodec) = "gzip" + + +#= +zencode(a, c::Codec) = error("Unimplemented") +zencode!(encoded, data, c::Codec) = error("Unimplemented") +zdecode(a, c::Codec, T::Type) = error("Unimplemented") +zdecode!(data, encoded, c::Codec) = error("Unimplemented") +=# + +function crc32c_stream!(output::IO, input::IO; buffer = Vector{UInt8}(undef, 1024*32)) + hash::UInt32 = 0x00000000 + while(bytesavailable(input) > 0) + sized_buffer = @view(buffer[1:min(length(buffer), bytesavailable(input))]) + read!(input, sized_buffer) + write(output, sized_buffer) + hash = CRC32c.crc32c(sized_buffer, hash) + end + return hash +end +function zencode!(encoded::Vector{UInt8}, data::Vector{UInt8}, c::CRC32cCodec) + output = 
IOBuffer(encoded, read=false, write=true) + input = IOBuffer(data, read=true, write=false) + zencode!(output, input, c) + return take!(output) +end +function zencode!(output::IO, input::IO, c::CRC32cCodec) + hash = crc32c_stream!(output, input) + write(output, hash) + return output +end +function zdecode!(encoded::Vector{UInt8}, data::Vector{UInt8}, c::CRC32cCodec) + output = IOBuffer(encoded, read=false, write=true) + input = IOBuffer(data, read=true, write=true) + zdecode!(output, input, c) + return take!(output) +end +function zdecode!(output::IOBuffer, input::IOBuffer, c::CRC32cCodec) + input_vec = take!(input) + truncated_input = IOBuffer(@view(input_vec[1:end-4]); read=true, write=false) + hash = crc32c_stream!(output, truncated_input) + if input_vec[end-3:end] != reinterpret(UInt8, [hash]) + throw(IOError("CRC32c hash does not match")) + end + return output +end + +struct ShardingCodec{N} <: V3Codec{:array, :bytes} + chunk_shape::NTuple{N,Int} + codecs::Vector{V3Codec} + index_codecs::Vector{V3Codec} + index_location::Symbol +end +name(::ShardingCodec) = "sharding_indexed" + +struct TransposeCodec <: V3Codec{:array, :array} +end +name(::TransposeCodec) = "transpose" + + +end diff --git a/src/Zarr.jl b/src/Zarr.jl index ee40f3bf..1783bdf9 100644 --- a/src/Zarr.jl +++ b/src/Zarr.jl @@ -6,6 +6,7 @@ import Blosc include("metadata.jl") include("metadata3.jl") include("Compressors/Compressors.jl") +include("Codecs/Codecs.jl") include("Storage/Storage.jl") include("Filters/Filters.jl") include("ZArray.jl") From 07352f3adf3f1bc69a78e0fcdf9e002390154c1a Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Tue, 3 Jun 2025 20:12:45 -0400 Subject: [PATCH 27/39] Fix spelling of Evaluate in comment --- src/Compressors/v3.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Compressors/v3.jl b/src/Compressors/v3.jl index 6ee82385..955c116a 100644 --- a/src/Compressors/v3.jl +++ b/src/Compressors/v3.jl @@ -29,7 +29,7 @@ function 
JSON.lower(c::Compressor_v3{BloscCompressor}) "cname" => p.cname, "clevel" => p.clevel, "shuffle" => p.shuffle, -# TODO: Evalute if we can encode typesize +# TODO: Evaluate if we can encode typesize # "typesize" => p.typesize, "blocksize" => p.blocksize ) From 42b25190d7f79a272711c05070e0987cbb91df21 Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Wed, 27 Aug 2025 15:43:14 -0400 Subject: [PATCH 28/39] Fix default chunk_key_encoding --- src/metadata3.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/metadata3.jl b/src/metadata3.jl index 5509b2d7..1678927b 100644 --- a/src/metadata3.jl +++ b/src/metadata3.jl @@ -205,7 +205,7 @@ function Metadata3(d::AbstractDict, fill_as_missing) TU = (fv === nothing || !fill_as_missing) ? T : Union{T,Missing} cke_configuration = get(chunk_key_encoding, "configuration") do - Dict{String,Any} + Dict{String,Any}() end # V2 uses '.' while default CKE uses '/' by default if chunk_key_encoding["name"] == "v2" From 32da02317f2c9dfe50c5f18cf30dca540b43b215 Mon Sep 17 00:00:00 2001 From: Lazaro Alonso Date: Sun, 16 Nov 2025 16:58:05 +0100 Subject: [PATCH 29/39] dont --- .gitignore | 2 ++ test/CondaPkg.toml | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) delete mode 100644 test/CondaPkg.toml diff --git a/.gitignore b/.gitignore index 1929269e..da919e29 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,5 @@ Manifest.toml docs/build *.zarr .CondaPkg +.vscode +test/CondaPkg.toml \ No newline at end of file diff --git a/test/CondaPkg.toml b/test/CondaPkg.toml deleted file mode 100644 index 4daad6a2..00000000 --- a/test/CondaPkg.toml +++ /dev/null @@ -1,3 +0,0 @@ -[deps] -zarr = ">=2.13,<3" -python = ">=3.7,<4" From 1df9efe4e25729596058cb8cdeaffde49a0e8898 Mon Sep 17 00:00:00 2001 From: Lazaro Alonso Date: Sun, 16 Nov 2025 17:40:27 +0100 Subject: [PATCH 30/39] adds type AbstractMetadata --- src/Storage/Storage.jl | 2 +- src/Storage/formattedstore.jl | 2 +- src/ZArray.jl | 51 +++++++++++------ 
src/metadata.jl | 103 ++++++++++++++++++++++++---------- src/metadata3.jl | 36 ++++++------ 5 files changed, 126 insertions(+), 68 deletions(-) diff --git a/src/Storage/Storage.jl b/src/Storage/Storage.jl index 2fb13287..d45b600c 100644 --- a/src/Storage/Storage.jl +++ b/src/Storage/Storage.jl @@ -120,7 +120,7 @@ isinitialized(s::AbstractStore, p, i) = isinitialized(s,_concatpath(p,i)) isinitialized(s::AbstractStore, i) = s[i] !== nothing getmetadata(s::AbstractStore, p,fill_as_missing) = Metadata(String(maybecopy(s[p,".zarray"])),fill_as_missing) -function writemetadata(s::AbstractStore, p, m::Metadata; indent_json::Bool= false) +function writemetadata(s::AbstractStore, p, m::AbstractMetadata; indent_json::Bool= false) met = IOBuffer() if indent_json diff --git a/src/Storage/formattedstore.jl b/src/Storage/formattedstore.jl index 750ec6ec..782c1806 100644 --- a/src/Storage/formattedstore.jl +++ b/src/Storage/formattedstore.jl @@ -183,7 +183,7 @@ is_zarray(s::FormattedStore{3}, p, metadata=getmetadata(s, p, false)) = metadata.node_type == "array" getmetadata(s::FormattedStore{3}, p,fill_as_missing) = Metadata(String(maybecopy(s[p,"zarr.json"])),fill_as_missing) -function writemetadata(s::FormattedStore{3}, p, m::Metadata; indent_json::Bool= false) +function writemetadata(s::FormattedStore{3}, p, m::AbstractMetadata; indent_json::Bool= false) met = IOBuffer() if indent_json diff --git a/src/ZArray.jl b/src/ZArray.jl index bb200044..45c0eb98 100644 --- a/src/ZArray.jl +++ b/src/ZArray.jl @@ -33,7 +33,7 @@ Base.IndexStyle(::Type{<:SenMissArray})=Base.IndexLinear() # Currently this is not an AbstractArray, because indexing single elements is # would be really slow, although most AbstractArray interface functions are implemented struct ZArray{T, N, C<:Compressor, S<:AbstractStore} <: AbstractDiskArray{T,N} - metadata::Metadata{T, N, C} + metadata::AbstractMetadata{T, N, C} storage::S path::String attrs::Dict @@ -42,11 +42,11 @@ end Base.eltype(::ZArray{T}) where {T} = T 
Base.ndims(::ZArray{<:Any,N}) where {N} = N -Base.size(z::ZArray) = z.metadata.shape[] -Base.size(z::ZArray,i) = z.metadata.shape[][i] -Base.length(z::ZArray) = prod(z.metadata.shape[]) -Base.lastindex(z::ZArray,n) = size(z,n) -Base.lastindex(z::ZArray{<:Any,1}) = size(z,1) +Base.size(z::ZArray{<:Any,N}) where {N} = z.metadata.shape[]::NTuple{N, Int} +Base.size(z::ZArray{<:Any,N}, i::Integer) where {N} = z.metadata.shape[][i]::Int +Base.length(z::ZArray) = prod(z.metadata.shape[])::Int +Base.lastindex(z::ZArray{<:Any,N}, n::Integer) where {N} = size(z, n)::Int +Base.lastindex(z::ZArray{<:Any,1}) = size(z, 1)::Int function Base.show(io::IO,z::ZArray) print(io, "ZArray{", eltype(z) ,"} of size ",join(string.(size(z)), " x ")) @@ -365,17 +365,34 @@ function zcreate(::Type{T},storage::AbstractStore, fill_value = zero(T) end T2 = (fill_value === nothing || !fill_as_missing) ? T : Union{T,Missing} - metadata = Metadata{T2, N, C, typeof(filters), dimension_separator}( - zarr_format, - "array", - dims, - chunks, - typestr(T), - compressor, - fill_value, - 'C', - filters, - ) + metadata = if zarr_format == 2 + MetadataV2{T2, N, C, typeof(filters), dimension_separator}( + zarr_format, + "array", + dims, + chunks, + typestr(T), + compressor, + fill_value, + 'C', + filters, + ) + elseif zarr_format == 3 + @warn("Zarr v3 support is experimental") + MetadataV3{T2, N, C, typeof(filters), dimension_separator}( + zarr_format, + "array", + dims, + chunks, + typestr(T), + compressor, + fill_value, + 'C', + filters, + ) + else + throw(ArgumentError("Zarr.jl currently only supports v2 or v3 of the specification")) + end isemptysub(storage,path) || error("$storage $path is not empty") diff --git a/src/metadata.jl b/src/metadata.jl index 733018ec..81debcca 100644 --- a/src/metadata.jl +++ b/src/metadata.jl @@ -88,7 +88,7 @@ end Each array requires essential configuration metadata to be stored, enabling correct interpretation of the stored data. 
This metadata is encoded using JSON and stored as the -value of the “.zarray” key within an array store. +value of the ".zarray" key within an array store. # Type Parameters * T - element type of the array @@ -101,7 +101,10 @@ value of the “.zarray” key within an array store. https://zarr.readthedocs.io/en/stable/spec/v2.html#metadata """ -struct Metadata{T, N, C, F, S} +abstract type AbstractMetadata{T, N, C, F, S} end + +"""Metadata for Zarr version 2 arrays""" +struct MetadataV2{T, N, C, F, S} <: AbstractMetadata{T, N, C, F, S} zarr_format::Int node_type::String shape::Base.RefValue{NTuple{N, Int}} @@ -111,19 +114,15 @@ struct Metadata{T, N, C, F, S} fill_value::Union{T, Nothing} order::Char filters::F # not yet supported - function Metadata{T2, N, C, F, S}(zarr_format, node_type, shape, chunks, dtype, compressor, fill_value, order, filters) where {T2,N,C,F,S} - #We currently only support version - # zarr_format == 2 || throw(ArgumentError("Zarr.jl currently only support v2 of the protocol")) - zarr_format == 3 ? @warn("Zarr v3 support is experimental") : - zarr_format == 2 ? 
nothing : - throw(ArgumentError("Zarr.jl currently only supports v2 or v3 of the specification")) + function MetadataV2{T2, N, C, F, S}(zarr_format, node_type, shape, chunks, dtype, compressor, fill_value, order, filters) where {T2,N,C,F,S} + zarr_format == 2 || throw(ArgumentError("MetadataV2 only functions if zarr_format == 2")) #Do some sanity checks to make sure we have a sane array any(<(0), shape) && throw(ArgumentError("Size must be positive")) any(<(1), chunks) && throw(ArgumentError("Chunk size must be >= 1 along each dimension")) order === 'C' || throw(ArgumentError("Currently only 'C' storage order is supported")) new{T2, N, C, F, S}(zarr_format, node_type, Base.RefValue{NTuple{N,Int}}(shape), chunks, dtype, compressor,fill_value, order, filters) end - function Metadata{T2, N, C, F}( + function MetadataV2{T2, N, C, F}( zarr_format, node_type, shape, @@ -135,7 +134,7 @@ struct Metadata{T, N, C, F, S} filters, dimension_separator::Char = '.' ) where {T2,N,C,F} - return Metadata{T2, N, C, F, dimension_separator}( + return MetadataV2{T2, N, C, F, dimension_separator}( zarr_format, node_type, shape, @@ -143,13 +142,37 @@ struct Metadata{T, N, C, F, S} dtype, compressor, fill_value, - order + order, + filters ) end +end +"""Metadata for Zarr version 3 arrays""" +struct MetadataV3{T, N, C, F, S} <: AbstractMetadata{T, N, C, F, S} + zarr_format::Int + node_type::String + shape::Base.RefValue{NTuple{N, Int}} + chunks::NTuple{N, Int} + dtype::String # data_type in v3 + compressor::C + fill_value::Union{T, Nothing} + order::Char + filters::F # not yet supported + function MetadataV3{T2, N, C, F, S}(zarr_format, node_type, shape, chunks, dtype, compressor, fill_value, order, filters) where {T2,N,C,F,S} + zarr_format == 3 || throw(ArgumentError("MetadataV3 only functions if zarr_format == 3")) + #Do some sanity checks to make sure we have a sane array + any(<(0), shape) && throw(ArgumentError("Size must be positive")) + any(<(1), chunks) && throw(ArgumentError("Chunk 
size must be >= 1 along each dimension")) + order === 'C' || throw(ArgumentError("Currently only 'C' storage order is supported")) + new{T2, N, C, F, S}(zarr_format, node_type, Base.RefValue{NTuple{N,Int}}(shape), chunks, dtype, compressor,fill_value, order, filters) + end end -const DimensionSeparatedMetadata{S} = Metadata{<: Any, <: Any, <: Any, <: Any, S} +# Type alias for backward compatibility +const Metadata = AbstractMetadata + +const DimensionSeparatedMetadata{S} = AbstractMetadata{<: Any, <: Any, <: Any, <: Any, S} function Base.getproperty(m::DimensionSeparatedMetadata{S}, name::Symbol) where S if name == :dimension_separator @@ -157,11 +180,11 @@ function Base.getproperty(m::DimensionSeparatedMetadata{S}, name::Symbol) where end return getfield(m, name) end -Base.propertynames(m::Metadata) = (fieldnames(Metadata)..., :dimension_separator) +Base.propertynames(m::AbstractMetadata) = (fieldnames(typeof(m))..., :dimension_separator) #To make unit tests pass with ref shape import Base.== -function ==(m1::Metadata, m2::Metadata) +function ==(m1::AbstractMetadata, m2::AbstractMetadata) m1.zarr_format == m2.zarr_format && m1.node_type == m2.node_type && m1.shape[] == m2.shape[] && @@ -187,17 +210,34 @@ function Metadata(A::AbstractArray{T, N}, chunks::NTuple{N, Int}; dimension_separator::Char = '.' ) where {T, N, C} T2 = (fill_value === nothing || !fill_as_missing) ? 
T : Union{T,Missing} - Metadata{T2, N, C, typeof(filters), dimension_separator}( - zarr_format, - node_type, - size(A), - chunks, - typestr(eltype(A)), - compressor, - fill_value, - order, - filters - ) + if zarr_format == 2 + MetadataV2{T2, N, C, typeof(filters), dimension_separator}( + zarr_format, + node_type, + size(A), + chunks, + typestr(eltype(A)), + compressor, + fill_value, + order, + filters + ) + elseif zarr_format == 3 + @warn("Zarr v3 support is experimental") + MetadataV3{T2, N, C, typeof(filters), dimension_separator}( + zarr_format, + node_type, + size(A), + chunks, + typestr(eltype(A)), + compressor, + fill_value, + order, + filters + ) + else + throw(ArgumentError("Zarr.jl currently only supports v2 or v3 of the specification")) + end end Metadata(s::Union{AbstractString, IO},fill_as_missing) = Metadata(JSON.parse(s),fill_as_missing) @@ -235,7 +275,7 @@ function Metadata(d::AbstractDict, fill_as_missing) S = only(get(d, "dimension_separator", '.')) - Metadata{TU, N, C, F, S}( + MetadataV2{TU, N, C, F, S}( d["zarr_format"], node_type, NTuple{N, Int}(d["shape"]) |> reverse, @@ -249,10 +289,7 @@ function Metadata(d::AbstractDict, fill_as_missing) end "Describes how to lower Metadata to JSON, used in json(::Metadata)" -function JSON.lower(md::Metadata) - if md.zarr_format == 3 - return lower3(md) - end +function JSON.lower(md::MetadataV2) Dict{String, Any}( "zarr_format" => md.zarr_format, "node_type" => md.node_type, @@ -267,6 +304,10 @@ function JSON.lower(md::Metadata) ) end +function JSON.lower(md::MetadataV3) + return lower3(md) +end + # Fill value encoding and decoding as described in # https://zarr.readthedocs.io/en/stable/spec/v2.html#fill-value-encoding @@ -282,7 +323,7 @@ function fill_value_encoding(v::AbstractFloat) end end -Base.eltype(::Metadata{T}) where T = T +Base.eltype(::AbstractMetadata{T}) where T = T # this correctly parses "NaN" and "Infinity" fill_value_decoding(v::AbstractString, T::Type{<:Number}) = parse(T, v) diff --git 
a/src/metadata3.jl b/src/metadata3.jl index 1678927b..93913d8b 100644 --- a/src/metadata3.jl +++ b/src/metadata3.jl @@ -64,7 +64,7 @@ function Metadata3(d::AbstractDict, fill_as_missing) end end - return Metadata{Int,0,Nothing,Nothing,'/'}(zarr_format, node_type, (), (), "", nothing, 0, 'C', nothing) + return MetadataV3{Int,0,Nothing,Nothing,'/'}(zarr_format, node_type, (), (), "", nothing, 0, 'C', nothing) end # Array keys @@ -102,7 +102,7 @@ function Metadata3(d::AbstractDict, fill_as_missing) if chunk_grid["name"] == "regular" chunks = Int.(chunk_grid["configuration"]["chunk_shape"]) if length(shape) != length(chunks) - throw(ArgumentError("Shape has rank $(length(shape)) which does not match the chunk_shape rank of $(length(chunk_shape))")) + throw(ArgumentError("Shape has rank $(length(shape)) which does not match the chunk_shape rank of $(length(chunks))")) end else throw(ArgumentError("Unknown chunk_grid of name, $(chunk_grid["name"])")) @@ -124,12 +124,12 @@ function Metadata3(d::AbstractDict, fill_as_missing) default_dim_perm = Tuple(1:length(shape)) dim_perm = default_dim_perm - codec_data_type = :array + codec_data_type = Ref(:array) - function check_codec_data_type(from, to) - codec_data_type == from || - throw(ArgumentError("$codec_name found by codec_data_type is $codec_data_type")) - codec_data_type = to + function check_codec_data_type(codec_name, from, to) + codec_data_type[] == from || + throw(ArgumentError("$codec_name found by codec_data_type is $(codec_data_type[])")) + codec_data_type[] = to return nothing end @@ -137,26 +137,26 @@ function Metadata3(d::AbstractDict, fill_as_missing) codec_name = codec["name"] if codec_name == "bytes" # array -> bytes - check_codec_data_type(:array, :bytes) + check_codec_data_type(codec_name, :array, :bytes) if haskey(codec, "configuration") codec["configuration"]["endian"] == "little" || throw(ArgumentError("Zarr.jl currently only supports little endian for the bytes codec")) end elseif codec_name == "zstd" # 
bytes -> bytes - check_codec_data_type(:bytes, :bytes) + check_codec_data_type(codec_name, :bytes, :bytes) compdict = codec elseif codec_name == "blosc" # bytes -> bytes - check_codec_data_type(:bytes, :bytes) + check_codec_data_type(codec_name, :bytes, :bytes) compdict = codec elseif codec_name == "gzip" # bytes -> bytes - check_codec_data_type(:bytes, :bytes) + check_codec_data_type(codec_name, :bytes, :bytes) compdict = codec elseif codec_name == "transpose" # array -> array - check_codec_data_type(:array, :array) + check_codec_data_type(codec_name, :array, :array) _dim_order = codec["configuration"]["order"] if _dim_order == "C" @warn "Transpose codec dimension order of $_dim_order is deprecated" @@ -170,11 +170,11 @@ function Metadata3(d::AbstractDict, fill_as_missing) dim_perm = dim_perm[_dim_order] elseif codec_name == "sharding_indexed" # array -> bytes - check_codec_data_type(:array, :bytes) + check_codec_data_type(codec_name, :array, :bytes) throw(ArgumentError("Zarr.jl currently does not support the $(codec["name"]) codec")) elseif codec_name == "crc32c" # bytes -> bytes - check_codec_data_type(:bytes, :bytes) + check_codec_data_type(codec_name, :bytes, :bytes) throw(ArgumentError("Zarr.jl currently does not support the $(codec["name"]) codec")) else throw(ArgumentError("Zarr.jl currently does not support the $(codec["name"]) codec")) @@ -192,7 +192,8 @@ function Metadata3(d::AbstractDict, fill_as_missing) compressor = getCompressor(compdict) # Filters (NOT IMPLEMENTED) - filters = getfilters(d) + # For v3, filters are not yet implemented, so we return nothing + filters = nothing # Type Parameters T = typestr3(data_type) @@ -215,7 +216,7 @@ function Metadata3(d::AbstractDict, fill_as_missing) S = only(get(cke_configuration, "separator", '/')) end - Metadata{TU, N, C, F, S}( + MetadataV3{TU, N, C, F, S}( zarr_format, node_type, NTuple{N, Int}(shape) |> reverse, @@ -228,8 +229,7 @@ function Metadata3(d::AbstractDict, fill_as_missing) ) end -function 
lower3(md::Metadata{T}) where T - md.zarr_format == 3 || throw(ArgumentError("lower3 only applies when zarr_format is 3")) +function lower3(md::MetadataV3{T}) where T mandatory_keys = [ "zarr_format", From db8a08c01a4c985bd0ac441c23ae00760be3ae57 Mon Sep 17 00:00:00 2001 From: Lazaro Alonso Date: Sun, 16 Nov 2025 18:16:18 +0100 Subject: [PATCH 31/39] dispatch --- src/ZArray.jl | 47 +++++++--------------- src/metadata.jl | 102 ++++++++++++++++++++++++++++++++--------------- src/metadata3.jl | 28 +++++++++++++ 3 files changed, 112 insertions(+), 65 deletions(-) diff --git a/src/ZArray.jl b/src/ZArray.jl index 45c0eb98..19f2dcad 100644 --- a/src/ZArray.jl +++ b/src/ZArray.jl @@ -361,38 +361,21 @@ function zcreate(::Type{T},storage::AbstractStore, length(dims) == length(chunks) || throw(DimensionMismatch("Dims must have the same length as chunks")) N = length(dims) C = typeof(compressor) - if fill_value === nothing && zarr_format == 3 - fill_value = zero(T) - end - T2 = (fill_value === nothing || !fill_as_missing) ? T : Union{T,Missing} - metadata = if zarr_format == 2 - MetadataV2{T2, N, C, typeof(filters), dimension_separator}( - zarr_format, - "array", - dims, - chunks, - typestr(T), - compressor, - fill_value, - 'C', - filters, - ) - elseif zarr_format == 3 - @warn("Zarr v3 support is experimental") - MetadataV3{T2, N, C, typeof(filters), dimension_separator}( - zarr_format, - "array", - dims, - chunks, - typestr(T), - compressor, - fill_value, - 'C', - filters, - ) - else - throw(ArgumentError("Zarr.jl currently only supports v2 or v3 of the specification")) - end + + # Create a dummy array to use with Metadata constructor + # This allows us to leverage the multiple dispatch in Metadata constructors + dummy_array = Array{T,N}(undef, dims...) 
+ metadata = Metadata(dummy_array, chunks; + zarr_format=zarr_format, + compressor=compressor, + fill_value=fill_value, + filters=filters, + fill_as_missing=fill_as_missing, + dimension_separator=dimension_separator + ) + + # Extract the element type from the metadata (handles T2 calculation) + T2 = eltype(metadata) isemptysub(storage,path) || error("$storage $path is not empty") diff --git a/src/metadata.jl b/src/metadata.jl index 81debcca..095fc323 100644 --- a/src/metadata.jl +++ b/src/metadata.jl @@ -205,51 +205,82 @@ function Metadata(A::AbstractArray{T, N}, chunks::NTuple{N, Int}; compressor::C=BloscCompressor(), fill_value::Union{T, Nothing}=nothing, order::Char='C', - filters::Nothing=nothing, + filters=nothing, fill_as_missing = false, dimension_separator::Char = '.' ) where {T, N, C} + return Metadata(A, chunks, Val(zarr_format); + node_type=node_type, + compressor=compressor, + fill_value=fill_value, + order=order, + filters=filters, + fill_as_missing=fill_as_missing, + dimension_separator=dimension_separator + ) +end + +# V2 constructor +function Metadata(A::AbstractArray{T, N}, chunks::NTuple{N, Int}, ::Val{2}; + node_type::String="array", + compressor::C=BloscCompressor(), + fill_value::Union{T, Nothing}=nothing, + order::Char='C', + filters::F=nothing, + fill_as_missing = false, + dimension_separator::Char = '.' + ) where {T, N, C, F} T2 = (fill_value === nothing || !fill_as_missing) ? 
T : Union{T,Missing} - if zarr_format == 2 - MetadataV2{T2, N, C, typeof(filters), dimension_separator}( - zarr_format, - node_type, - size(A), - chunks, - typestr(eltype(A)), - compressor, - fill_value, - order, - filters - ) - elseif zarr_format == 3 - @warn("Zarr v3 support is experimental") - MetadataV3{T2, N, C, typeof(filters), dimension_separator}( - zarr_format, - node_type, - size(A), - chunks, - typestr(eltype(A)), - compressor, - fill_value, - order, - filters - ) - else - throw(ArgumentError("Zarr.jl currently only supports v2 or v3 of the specification")) - end + MetadataV2{T2, N, C, typeof(filters), dimension_separator}( + 2, + node_type, + size(A), + chunks, + typestr(eltype(A)), + compressor, + fill_value, + order, + filters + ) +end + +# V3 constructor - delegate to metadata3.jl +function Metadata(A::AbstractArray{T, N}, chunks::NTuple{N, Int}, ::Val{3}; + node_type::String="array", + compressor::C=BloscCompressor(), + fill_value::Union{T, Nothing}=nothing, + order::Char='C', + filters::F=nothing, + fill_as_missing = false, + dimension_separator::Char = '.' 
+ ) where {T, N, C, F} + return Metadata3(A, chunks; + node_type=node_type, + compressor=compressor, + fill_value=fill_value, + order=order, + filters=filters, + fill_as_missing=fill_as_missing, + dimension_separator=dimension_separator + ) end Metadata(s::Union{AbstractString, IO},fill_as_missing) = Metadata(JSON.parse(s),fill_as_missing) "Construct Metadata from Dict" function Metadata(d::AbstractDict, fill_as_missing) - # create a Metadata struct from it - - if d["zarr_format"] == 3 - return Metadata3(d, fill_as_missing) + zarr_format = d["zarr_format"]::Int + if zarr_format == 2 + return Metadata(d, fill_as_missing, Val(2)) + elseif zarr_format == 3 + return Metadata(d, fill_as_missing, Val(3)) + else + throw(ArgumentError("Zarr.jl currently only supports v2 or v3 of the specification")) end +end +# V2 constructor from Dict +function Metadata(d::AbstractDict, fill_as_missing, ::Val{2}) # Zarr v2 metadata is only for arrays node_type = "array" @@ -288,6 +319,11 @@ function Metadata(d::AbstractDict, fill_as_missing) ) end +# V3 constructor from Dict - delegate to metadata3.jl +function Metadata(d::AbstractDict, fill_as_missing, ::Val{3}) + return Metadata3(d, fill_as_missing) +end + "Describes how to lower Metadata to JSON, used in json(::Metadata)" function JSON.lower(md::MetadataV2) Dict{String, Any}( diff --git a/src/metadata3.jl b/src/metadata3.jl index 93913d8b..cef65576 100644 --- a/src/metadata3.jl +++ b/src/metadata3.jl @@ -229,6 +229,34 @@ function Metadata3(d::AbstractDict, fill_as_missing) ) end +"Construct MetadataV3 based on your data" +function Metadata3(A::AbstractArray{T, N}, chunks::NTuple{N, Int}; + node_type::String="array", + compressor::C=BloscCompressor(), + fill_value::Union{T, Nothing}=nothing, + order::Char='C', + filters::F=nothing, + fill_as_missing = false, + dimension_separator::Char = '/' + ) where {T, N, C, F} + @warn("Zarr v3 support is experimental") + T2 = (fill_value === nothing || !fill_as_missing) ? 
T : Union{T,Missing} + if fill_value === nothing + fill_value = zero(T) + end + MetadataV3{T2, N, C, typeof(filters), dimension_separator}( + 3, + node_type, + size(A), + chunks, + typestr3(eltype(A)), + compressor, + fill_value, + order, + filters + ) +end + function lower3(md::MetadataV3{T}) where T mandatory_keys = [ From 136470e25749a6bf8fbc60805252a41995073445 Mon Sep 17 00:00:00 2001 From: Lazaro Alonso Date: Sun, 16 Nov 2025 19:01:19 +0100 Subject: [PATCH 32/39] fix tests --- docs/src/tutorial.md | 2 +- test/Filters.jl | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/src/tutorial.md b/docs/src/tutorial.md index ee705650..b42c2020 100644 --- a/docs/src/tutorial.md +++ b/docs/src/tutorial.md @@ -197,7 +197,7 @@ Order : C Read-Only : false Compressor : Zarr.BloscCompressor(0, 3, "zstd", 1) Filters : nothing -Store type : Zarr.VersionedStore{2, '.', Zarr.DictStore}(Dictionary Storage) +Store type : Zarr.FormattedStore{2, '.', Zarr.DictStore}(Dictionary Storage) No. bytes : 400000000 No. 
bytes stored : 2412289 Storage ratio : 165.81761140559857 diff --git a/test/Filters.jl b/test/Filters.jl index f46cf4ad..5c09a8ca 100644 --- a/test/Filters.jl +++ b/test/Filters.jl @@ -27,7 +27,8 @@ using Zarr: Fletcher32Filter, FixedScaleOffsetFilter, ShuffleFilter, QuantizeFil data = rand(100) enc = zencode(data, Fletcher32Filter()) - enc[begin] += 1 + # Corrupt the checksum by modifying a byte (handle overflow safely) + enc[begin] = UInt8((enc[begin] + 1) % 256) @test_throws "Checksum mismatch in Fletcher32 decoding" zdecode(enc, Fletcher32Filter()) end From 616f5638cdea710408e5373d7f28d60d9287f3d6 Mon Sep 17 00:00:00 2001 From: Lazaro Alonso Date: Sun, 16 Nov 2025 22:03:40 +0100 Subject: [PATCH 33/39] py v3 baseline --- test/v3_julia.jl | 1 + test/v3_python.jl | 479 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 480 insertions(+) create mode 100644 test/v3_julia.jl create mode 100644 test/v3_python.jl diff --git a/test/v3_julia.jl b/test/v3_julia.jl new file mode 100644 index 00000000..5b6aeb7b --- /dev/null +++ b/test/v3_julia.jl @@ -0,0 +1 @@ +using Zarr \ No newline at end of file diff --git a/test/v3_python.jl b/test/v3_python.jl new file mode 100644 index 00000000..247d1815 --- /dev/null +++ b/test/v3_python.jl @@ -0,0 +1,479 @@ +# Julia script to generate Zarr v3 fixtures using PythonCall + CondaPkg +# Adapted from: https://github.com/manzt/zarrita.js/blob/23abb3bee9094aabbe60985626caef2802360963/scripts/generate-v3.py + +using CondaPkg +using JSON + +# Install Python deps into Conda env used by PythonCall (zarr v3 and numpy) +CondaPkg.add("numpy") +CondaPkg.add("zarr"; version="3.*") +CondaPkg.add("numcodecs") + +using PythonCall +# Import Python modules +np = pyimport("numpy") +zarr = pyimport("zarr") +codecs = pyimport("zarr.codecs") +storage = pyimport("zarr.storage") +json = pyimport("json") +shutil = pyimport("shutil") +pathlib = pyimport("pathlib") +builtins = pyimport("builtins") + +# Paths +path_v3 = joinpath(@__DIR__, 
"v3_python", "data.zarr") + +# deterministic RNG for numpy +np.random.seed(42) + +# remove existing +try + shutil.rmtree(path_v3) +catch + # ignore +end + +# create store and path_v3 group +store = storage.LocalStore(path_v3) +zarr.create_group(store) + +# helper: create array and set data (value should be a numpy array or convertible) +function create_and_fill(store; name, dtype=nothing, shape=nothing, chunks=nothing, + serializer=nothing, compressors=nothing, filters=nothing, shards=nothing, data) + # Build NamedTuple of only non-nothing keyword arguments + kwargs = (; name=name) + if dtype !== nothing + kwargs = merge(kwargs, (; dtype=dtype)) + end + if shape !== nothing + kwargs = merge(kwargs, (; shape=shape)) + end + if chunks !== nothing + kwargs = merge(kwargs, (; chunks=chunks)) + end + if serializer !== nothing + kwargs = merge(kwargs, (; serializer=serializer)) + end + if compressors !== nothing + kwargs = merge(kwargs, (; compressors=compressors)) + end + if filters !== nothing + kwargs = merge(kwargs, (; filters=filters)) + end + if shards !== nothing + kwargs = merge(kwargs, (; shards=shards)) + end + + # create the array + a = zarr.create_array(store; kwargs...) + + # ensure numpy array + arr = data isa Py ? 
data : np.array(data) + + # assign content + a.__setitem__(builtins.Ellipsis, arr) + + return a +end + +# 1d.contiguous.gzip.i2 +create_and_fill(store; + name="1d.contiguous.gzip.i2", + dtype="int16", + shape=(4,), + chunks=(4,), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=[1,2,3,4], +) + +# 1d.contiguous.blosc.i2 +create_and_fill(store; + name="1d.contiguous.blosc.i2", + dtype="int16", + shape=(4,), + chunks=(4,), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=[1,2,3,4], +) + +# 1d.contiguous.raw.i2 +create_and_fill(store; + name="1d.contiguous.raw.i2", + dtype="int16", + shape=(4,), + chunks=(4,), + serializer=codecs.BytesCodec(endian="little"), + compressors=nothing, + data=[1,2,3,4], +) + +# 1d.contiguous.i4 +create_and_fill(store; + name="1d.contiguous.i4", + dtype="int32", + shape=(4,), + chunks=(4,), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=[1,2,3,4], +) + +# 1d.contiguous.u1 +create_and_fill(store; + name="1d.contiguous.u1", + dtype="uint8", + shape=(4,), + chunks=(4,), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=np.array([255,0,255,0], dtype="u1") +) + +# 1d.contiguous.f2.le +create_and_fill(store; + name="1d.contiguous.f2.le", + dtype="float16", + shape=(4,), + chunks=(4,), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=np.array([-1000.5, 0.0, 1000.5, 0.0], dtype="f2"), +) + +# 1d.contiguous.f4.le +create_and_fill(store; + name="1d.contiguous.f4.le", + dtype="float32", + shape=(4,), + chunks=(4,), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=np.array([-1000.5, 0.0, 1000.5, 0.0], dtype="f4"), +) + +# 1d.contiguous.f4.be +create_and_fill(store; + 
name="1d.contiguous.f4.be", + dtype="float32", + shape=(4,), + chunks=(4,), + serializer=codecs.BytesCodec(endian="big"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=np.array([-1000.5, 0.0, 1000.5, 0.0], dtype="f4"), +) + +# 1d.contiguous.f8 +create_and_fill(store; + name="1d.contiguous.f8", + dtype="float64", + shape=(4,), + chunks=(4,), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=np.array([1.5,2.5,3.5,4.5], dtype="f8"), +) + +# 1d.contiguous.b1 +create_and_fill(store; + name="1d.contiguous.b1", + dtype="bool", + shape=(4,), + chunks=(4,), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=np.array([true,false,true,false], dtype="bool"), +) + +# 1d.chunked.i2 +create_and_fill(store; + name="1d.chunked.i2", + dtype="int16", + shape=(4,), + chunks=(2,), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=np.array([1,2,3,4], dtype="i2"), +) + +# adjust zarr.json to set dimension_names = null +meta_path = joinpath(path_v3, "1d.chunked.i2", "zarr.json") +meta = JSON.parsefile(meta_path; dicttype = Dict{String,Any}) +meta["dimension_names"] = nothing +open(meta_path, "w") do io + JSON.print(io, meta) +end + +# 1d.chunked.ragged.i2 +create_and_fill(store; + name="1d.chunked.ragged.i2", + dtype="int16", + shape=(5,), + chunks=(2,), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=np.array([1,2,3,4,5], dtype="i2"), +) + +# 2d.contiguous.i2 +create_and_fill(store; + name="2d.contiguous.i2", + dtype="int16", + shape=(2,2), + chunks=(2,2), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data= np.array([ [1,2], [3,4] ] |> pylist, dtype="i2"), +) + +# 2d.chunked.i2 +create_and_fill(store; + name="2d.chunked.i2", + dtype="int16", + 
shape=(2,2), + chunks=(1,1), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=np.array([[1,2],[3,4]] |> pylist, dtype="i2"), +) + +# 2d.chunked.ragged.i2 +create_and_fill(store; + name="2d.chunked.ragged.i2", + dtype="int16", + shape=(3,3), + chunks=(2,2), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=np.array([[1,2,3],[4,5,6],[7,8,9]] |> pylist, dtype="i2"), +) + +# 3d.contiguous.i2 +create_and_fill(store; + name="3d.contiguous.i2", + dtype="int16", + shape=(3,3,3), + chunks=(3,3,3), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=np.arange(27).reshape(3,3,3), +) + +# 3d.chunked.i2 +create_and_fill(store; + name="3d.chunked.i2", + dtype="int16", + shape=(3,3,3), + chunks=(1,1,1), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=np.arange(27).reshape(3,3,3), +) + +# 3d.chunked.mixed.i2.C +create_and_fill(store; + name="3d.chunked.mixed.i2.C", + dtype="int16", + shape=(3,3,3), + chunks=(3,3,1), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=np.arange(27).reshape(3,3,3), +) + +# 3d.chunked.mixed.i2.F (with transpose filter to simulate column-major) +transpose_filter = codecs.TransposeCodec(order=[2,1,0]) +create_and_fill(store; + name="3d.chunked.mixed.i2.F", + dtype="int16", + shape=(3,3,3), + chunks=(3,3,1), + filters=[transpose_filter], + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=np.arange(27).reshape(3,3,3), +) + +##### Sharded/compressed examples +# 1d.contiguous.compressed.sharded.i2 +create_and_fill(store; + name="1d.contiguous.compressed.sharded.i2", + shape=(4,), + dtype=np.array([1,2,3,4], 
dtype="i2").dtype, + chunks=(4,), + shards=(4,), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=np.array([1,2,3,4], dtype="i2"), +) + +# 1d.contiguous.compressed.sharded.i4 +create_and_fill(store; + name="1d.contiguous.compressed.sharded.i4", + shape=(4,), + dtype=np.array([1,2,3,4], dtype="i4").dtype, + chunks=(4,), + shards=(4,), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=np.array([1,2,3,4], dtype="i4"), +) + +# 1d.contiguous.compressed.sharded.u1 +create_and_fill(store; + name="1d.contiguous.compressed.sharded.u1", + shape=(4,), + dtype=np.array([255,0,255,0], dtype="u1").dtype, + chunks=(4,), + shards=(4,), + compressors=[codecs.GzipCodec()], + data=np.array([255,0,255,0], dtype="u1"), +) + +# 1d.contiguous.compressed.sharded.f4 +create_and_fill(store; + name="1d.contiguous.compressed.sharded.f4", + shape=(4,), + dtype=np.array([-1000.5,0,1000.5,0], dtype="f4").dtype, + chunks=(4,), + shards=(4,), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=np.array([-1000.5,0,1000.5,0], dtype="f4"), +) + +# 1d.contiguous.compressed.sharded.f8 +create_and_fill(store; + name="1d.contiguous.compressed.sharded.f8", + shape=(4,), + dtype=np.array([1.5,2.5,3.5,4.5], dtype="f8").dtype, + chunks=(4,), + shards=(4,), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=np.array([1.5,2.5,3.5,4.5], dtype="f8"), +) + +# 1d.contiguous.compressed.sharded.b1 +create_and_fill(store; + name="1d.contiguous.compressed.sharded.b1", + shape=(4,), + dtype="bool", + chunks=(4,), + shards=(4,), + compressors=[codecs.GzipCodec()], + data=np.array([true,false,true,false], dtype="bool"), +) + +# 1d.chunked.compressed.sharded.i2 +create_and_fill(store; + name="1d.chunked.compressed.sharded.i2", + shape=(4,), + dtype=np.array([1,2,3,4], dtype="i2").dtype, + chunks=(1,), + shards=(2,), + 
serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=np.array([1,2,3,4], dtype="i2"), +) + +# 1d.chunked.filled.compressed.sharded.i2 +create_and_fill(store; + name="1d.chunked.filled.compressed.sharded.i2", + shape=(4,), + dtype=np.array([1,2,0,0], dtype="i2").dtype, + chunks=(1,), + shards=(2,), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=np.array([1,2,0,0], dtype="i2"), +) + +# 2d.contiguous.compressed.sharded.i2 +create_and_fill(store; + name="2d.contiguous.compressed.sharded.i2", + shape=(2,2), + dtype=np.arange(1,5, dtype="i2").dtype, + chunks=(2,2), + shards=(2,2), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=np.arange(1,5, dtype="i2").reshape(2,2), +) + +# 2d.chunked.compressed.sharded.filled.i2 +create_and_fill(store; + name="2d.chunked.compressed.sharded.filled.i2", + shape=(4,4), + dtype=np.arange(16, dtype="i2").dtype, + chunks=(1,1), + shards=(2,2), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=np.arange(16, dtype="i2").reshape(4,4), +) + +# 2d.chunked.compressed.sharded.i2 +create_and_fill(store; + name="2d.chunked.compressed.sharded.i2", + shape=(4,4), + dtype=np.arange(16, dtype="i2").dtype, + chunks=(1,1), + shards=(2,2), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=(np.arange(16, dtype="i2").reshape(4,4) + 1), +) + +# 2d.chunked.ragged.compressed.sharded.i2 +create_and_fill(store; + name="2d.chunked.ragged.compressed.sharded.i2", + shape=(3,3), + dtype=np.arange(1,10, dtype="i2").dtype, + chunks=(1,1), + shards=(2,2), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=np.arange(1,10, dtype="i2").reshape(3,3), +) + +# 3d.contiguous.compressed.sharded.i2 +create_and_fill(store; + name="3d.contiguous.compressed.sharded.i2", + shape=(3,3,3), + dtype=np.arange(27, dtype="i2").dtype, + 
chunks=(3,3,3), + shards=(3,3,3), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=np.arange(27, dtype="i2").reshape(3,3,3), +) + +# 3d.chunked.compressed.sharded.i2 +create_and_fill(store; + name="3d.chunked.compressed.sharded.i2", + shape=(4,4,4), + dtype=np.arange(64, dtype="i2").dtype, + chunks=(1,1,1), + shards=(2,2,2), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=np.arange(64, dtype="i2").reshape(4,4,4), +) + +# 3d.chunked.mixed.compressed.sharded.i2 +create_and_fill(store; + name="3d.chunked.mixed.compressed.sharded.i2", + shape=(3,3,3), + dtype=np.arange(27, dtype="i2").dtype, + chunks=(3,3,1), + shards=(3,3,3), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=np.arange(27, dtype="i2").reshape(3,3,3), +) + +# Group with spaces in the name +g = zarr.create_group(store, path="my group with spaces") +g.attrs["description"] = "A group with spaces in the name" + +@info "Zarr v3 fixtures generated at: $path_v3" From df7cbf4734c96180c0efe06fa21c08a8f7c24d14 Mon Sep 17 00:00:00 2001 From: Lazaro Alonso Date: Sun, 16 Nov 2025 23:20:19 +0100 Subject: [PATCH 34/39] julia version --- src/Compressors/v3.jl | 2 +- src/metadata3.jl | 6 +- test/v3_julia.jl | 309 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 313 insertions(+), 4 deletions(-) diff --git a/src/Compressors/v3.jl b/src/Compressors/v3.jl index 955c116a..fa8c1ef2 100644 --- a/src/Compressors/v3.jl +++ b/src/Compressors/v3.jl @@ -41,7 +41,7 @@ function JSON.lower(c::Compressor_v3{ZlibCompressor}) return Dict( "name" => "gzip", "configuration" => Dict( - "level" => p.clevel + "level" => p.config.level ) ) end diff --git a/src/metadata3.jl b/src/metadata3.jl index cef65576..374cc5d5 100644 --- a/src/metadata3.jl +++ b/src/metadata3.jl @@ -317,8 +317,10 @@ function lower3(md::MetadataV3{T}) where T ) ) ) - # Compress bytes to bytes - push!(codecs, 
JSON.lower(Compressor_v3(md.compressor))) + # Compress bytes to bytes (only if not NoCompressor) + if !(md.compressor isa NoCompressor) + push!(codecs, JSON.lower(Compressor_v3(md.compressor))) + end Dict{String, Any}( "zarr_format" => md.zarr_format, diff --git a/test/v3_julia.jl b/test/v3_julia.jl index 5b6aeb7b..ec735021 100644 --- a/test/v3_julia.jl +++ b/test/v3_julia.jl @@ -1 +1,308 @@ -using Zarr \ No newline at end of file +# Julia script to generate Zarr v3 fixtures using pure Julia +# Mirrors the examples from v3_python.jl + +using Zarr +using JSON + +# Paths +path_v3 = joinpath(@__DIR__, "v3_julia", "data.zarr") + +# Remove existing +if isdir(path_v3) + rm(path_v3, recursive=true) +end + +# Create store and root group for v3 +store = Zarr.FormattedStore{3, '/'}(Zarr.DirectoryStore(path_v3)) +# Manually create v3 group metadata (zgroup defaults to v2) # TODO: we need to fix this! +group_meta = Dict("zarr_format" => 3, "node_type" => "group") +b = IOBuffer() +JSON.print(b, group_meta) +store["", "zarr.json"] = take!(b) + +# Helper: create array and set data +function create_and_fill(store, name, data; + dtype=nothing, + shape=nothing, + chunks=nothing, + compressor=Zarr.BloscCompressor(), + fill_value=nothing, + zarr_format=3, + dimension_separator='/') + + # Create the array + z = zcreate(eltype(data), store, shape...; + path=name, + chunks=chunks, + compressor=compressor, + fill_value=fill_value, + zarr_format=zarr_format, + dimension_separator=dimension_separator) + # Fill the array with the data + z[:] = data + return z +end + +# 1d.contiguous.gzip.i2 +create_and_fill(store, "1d.contiguous.gzip.i2", Int16[1,2,3,4]; + shape=(4,), + chunks=(4,), + compressor=Zarr.ZlibCompressor(), +) + +# 1d.contiguous.blosc.i2 +create_and_fill(store, "1d.contiguous.blosc.i2", Int16[1,2,3,4]; + shape=(4,), + chunks=(4,), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 1d.contiguous.raw.i2 +create_and_fill(store, "1d.contiguous.raw.i2", Int16[1,2,3,4]; + 
shape=(4,), + chunks=(4,), + compressor=Zarr.NoCompressor(), +) + +# 1d.contiguous.i4 +create_and_fill(store, "1d.contiguous.i4", Int32[1,2,3,4]; + shape=(4,), + chunks=(4,), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 1d.contiguous.u1 +create_and_fill(store, "1d.contiguous.u1", UInt8[255,0,255,0]; + shape=(4,), + chunks=(4,), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 1d.contiguous.f2.le +create_and_fill(store, "1d.contiguous.f2.le", Float16[-1000.5, 0.0, 1000.5, 0.0]; + shape=(4,), + chunks=(4,), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 1d.contiguous.f4.le +create_and_fill(store, "1d.contiguous.f4.le", Float32[-1000.5, 0.0, 1000.5, 0.0]; + shape=(4,), + chunks=(4,), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 1d.contiguous.f4.be +# Note: Big endian is not directly supported in Julia, but we can create the array +# The actual endianness is handled by the bytes codec in v3 +create_and_fill(store, "1d.contiguous.f4.be", Float32[-1000.5, 0.0, 1000.5, 0.0]; + shape=(4,), + chunks=(4,), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 1d.contiguous.f8 +create_and_fill(store, "1d.contiguous.f8", Float64[1.5,2.5,3.5,4.5]; + shape=(4,), + chunks=(4,), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 1d.contiguous.b1 +create_and_fill(store, "1d.contiguous.b1", Bool[true,false,true,false]; + shape=(4,), + chunks=(4,), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 1d.chunked.i2 +z = create_and_fill(store, "1d.chunked.i2", Int16[1,2,3,4]; + shape=(4,), + chunks=(2,), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# Adjust zarr.json to set dimension_names = null +meta_path = joinpath(path_v3, "1d.chunked.i2", "zarr.json") +meta = JSON.parsefile(meta_path; dicttype = Dict{String,Any}) +meta["dimension_names"] = nothing +open(meta_path, "w") do io + JSON.print(io, meta) +end + +# 1d.chunked.ragged.i2 
+create_and_fill(store, "1d.chunked.ragged.i2", Int16[1,2,3,4,5]; + shape=(5,), + chunks=(2,), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 2d.contiguous.i2 +create_and_fill(store, "2d.contiguous.i2", Int16[1 2; 3 4]; + shape=(2,2), + chunks=(2,2), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 2d.chunked.i2 +create_and_fill(store, "2d.chunked.i2", Int16[1 2; 3 4]; + shape=(2,2), + chunks=(1,1), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 2d.chunked.ragged.i2 +create_and_fill(store, "2d.chunked.ragged.i2", Int16[1 2 3; 4 5 6; 7 8 9]; + shape=(3,3), + chunks=(2,2), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 3d.contiguous.i2 +create_and_fill(store, "3d.contiguous.i2", reshape(Int16.(0:26), 3, 3, 3); + shape=(3,3,3), + chunks=(3,3,3), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 3d.chunked.i2 +create_and_fill(store, "3d.chunked.i2", reshape(Int16.(0:26), 3, 3, 3); + shape=(3,3,3), + chunks=(1,1,1), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 3d.chunked.mixed.i2.C +create_and_fill(store, "3d.chunked.mixed.i2.C", reshape(Int16.(0:26), 3, 3, 3); + shape=(3,3,3), + chunks=(3,3,1), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 3d.chunked.mixed.i2.F +# Note: Column-major order (F) is simulated with transpose filter in Python +# In Julia, we create with C order as that's what's currently supported +create_and_fill(store, "3d.chunked.mixed.i2.F", reshape(Int16.(0:26), 3, 3, 3); + shape=(3,3,3), + chunks=(3,3,1), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +##### Sharded/compressed examples +# Note: Sharding is not yet fully implemented in Zarr.jl, so these examples +# may not produce the exact same structure as the Python version. +# They are included for completeness but may need adjustment once sharding is supported. 
+ +# 1d.contiguous.compressed.sharded.i2 +create_and_fill(store, "1d.contiguous.compressed.sharded.i2", Int16[1,2,3,4]; + shape=(4,), + chunks=(4,), + compressor=Zarr.ZlibCompressor(), +) + +# 1d.contiguous.compressed.sharded.i4 +create_and_fill(store, "1d.contiguous.compressed.sharded.i4", Int32[1,2,3,4]; + shape=(4,), + chunks=(4,), + compressor=Zarr.ZlibCompressor(), +) + +# 1d.contiguous.compressed.sharded.u1 +create_and_fill(store, "1d.contiguous.compressed.sharded.u1", UInt8[255,0,255,0]; + shape=(4,), + chunks=(4,), + compressor=Zarr.ZlibCompressor(), +) + +# 1d.contiguous.compressed.sharded.f4 +create_and_fill(store, "1d.contiguous.compressed.sharded.f4", Float32[-1000.5,0,1000.5,0]; + shape=(4,), + chunks=(4,), + compressor=Zarr.ZlibCompressor(), +) + +# 1d.contiguous.compressed.sharded.f8 +create_and_fill(store, "1d.contiguous.compressed.sharded.f8", Float64[1.5,2.5,3.5,4.5]; + shape=(4,), + chunks=(4,), + compressor=Zarr.ZlibCompressor(), +) + +# 1d.contiguous.compressed.sharded.b1 +create_and_fill(store, "1d.contiguous.compressed.sharded.b1", Bool[true,false,true,false]; + shape=(4,), + chunks=(4,), + compressor=Zarr.ZlibCompressor(), +) + +# 1d.chunked.compressed.sharded.i2 +create_and_fill(store, "1d.chunked.compressed.sharded.i2", Int16[1,2,3,4]; + shape=(4,), + chunks=(1,), + compressor=Zarr.ZlibCompressor(), +) + +# 1d.chunked.filled.compressed.sharded.i2 +create_and_fill(store, "1d.chunked.filled.compressed.sharded.i2", Int16[1,2,0,0]; + shape=(4,), + chunks=(1,), + compressor=Zarr.ZlibCompressor(), +) + +# 2d.contiguous.compressed.sharded.i2 +create_and_fill(store, "2d.contiguous.compressed.sharded.i2", Int16[1 2; 3 4]; + shape=(2,2), + chunks=(2,2), + compressor=Zarr.ZlibCompressor(), +) + +# 2d.chunked.compressed.sharded.filled.i2 +create_and_fill(store, "2d.chunked.compressed.sharded.filled.i2", reshape(Int16.(0:15), 4, 4); + shape=(4,4), + chunks=(1,1), + compressor=Zarr.ZlibCompressor(), +) + +# 2d.chunked.compressed.sharded.i2 
+create_and_fill(store, "2d.chunked.compressed.sharded.i2", reshape(Int16.(1:16), 4, 4); + shape=(4,4), + chunks=(1,1), + compressor=Zarr.ZlibCompressor(), +) + +# 2d.chunked.ragged.compressed.sharded.i2 +create_and_fill(store, "2d.chunked.ragged.compressed.sharded.i2", reshape(Int16.(1:9), 3, 3); + shape=(3,3), + chunks=(1,1), + compressor=Zarr.ZlibCompressor(), +) + +# 3d.contiguous.compressed.sharded.i2 +create_and_fill(store, "3d.contiguous.compressed.sharded.i2", reshape(Int16.(0:26), 3, 3, 3); + shape=(3,3,3), + chunks=(3,3,3), + compressor=Zarr.ZlibCompressor(), +) + +# 3d.chunked.compressed.sharded.i2 +create_and_fill(store, "3d.chunked.compressed.sharded.i2", reshape(Int16.(0:63), 4, 4, 4); + shape=(4,4,4), + chunks=(1,1,1), + compressor=Zarr.ZlibCompressor(), +) + +# 3d.chunked.mixed.compressed.sharded.i2 +create_and_fill(store, "3d.chunked.mixed.compressed.sharded.i2", reshape(Int16.(0:26), 3, 3, 3); + shape=(3,3,3), + chunks=(3,3,1), + compressor=Zarr.ZlibCompressor(), +) + +# Group with spaces in the name +group_path = "my group with spaces" +group_meta2 = Dict("zarr_format" => 3, "node_type" => "group", "attributes" => Dict("description" => "A group with spaces in the name")) +b2 = IOBuffer() +JSON.print(b2, group_meta2) +store[group_path, "zarr.json"] = take!(b2) + +@info "Zarr v3 fixtures generated at: $path_v3" \ No newline at end of file From f36910632c469e130f245eb6e2850b07ef21202b Mon Sep 17 00:00:00 2001 From: Lazaro Alonso Date: Mon, 17 Nov 2025 07:37:25 +0100 Subject: [PATCH 35/39] claude's sharding version, debug, integrate now --- src/Codecs/V3/V3.jl | 402 +++++++++++++++++++++++++++++++++++++++++++- src/metadata3.jl | 4 +- 2 files changed, 401 insertions(+), 5 deletions(-) diff --git a/src/Codecs/V3/V3.jl b/src/Codecs/V3/V3.jl index 710946c3..89a0405f 100644 --- a/src/Codecs/V3/V3.jl +++ b/src/Codecs/V3/V3.jl @@ -2,6 +2,7 @@ module V3Codecs import ..Codecs: zencode, zdecode, zencode!, zdecode! 
using CRC32c: CRC32c +using JSON: JSON abstract type V3Codec{In,Out} end const codectypes = Dict{String, V3Codec}() @@ -87,14 +88,407 @@ function zdecode!(output::IOBuffer, input::IOBuffer, c::CRC32cCodec) return output end +""" + ShardingCodec{N} + +Sharding codec for Zarr v3. Sharding splits chunks into smaller "shards" and stores them +in a single file with an index mapping chunk coordinates to shard locations. + +# Fields +- `chunk_shape`: Shape of each shard (NTuple{N,Int}) +- `codecs`: Vector of codecs to apply to shard data (e.g., [BytesCodec(), GzipCodec()]) +- `index_codecs`: Vector of codecs to apply to the index (e.g., [BytesCodec()]) +- `index_location`: Location of index in shard file, either `:start` or `:end` + +# Implementation Notes +Sharding works by: +1. Taking a chunk of data and splitting it into shards based on `chunk_shape` +2. Encoding each shard using the `codecs` pipeline +3. Creating an index that maps (chunk_coords, shard_coords) -> (offset, size) in the shard file +4. Encoding the index using `index_codecs` +5. Writing the shard file with index at `index_location` (start or end) + +""" struct ShardingCodec{N} <: V3Codec{:array, :bytes} - chunk_shape::NTuple{N,Int} - codecs::Vector{V3Codec} - index_codecs::Vector{V3Codec} - index_location::Symbol + chunk_shape::NTuple{N,Int} # Shape of each shard + codecs::Vector{V3Codec} # Codecs to apply to shard data + index_codecs::Vector{V3Codec} # Codecs to apply to the index + index_location::Symbol # :start or :end end name(::ShardingCodec) = "sharding_indexed" +""" + JSON.lower(c::ShardingCodec) + +Serialize ShardingCodec to JSON format for Zarr v3 metadata. 
+""" +function JSON.lower(c::ShardingCodec) + return Dict( + "name" => "sharding_indexed", + "configuration" => Dict( + "chunk_shape" => collect(c.chunk_shape), + "codecs" => [JSON.lower(codec) for codec in c.codecs], + "index_codecs" => [JSON.lower(codec) for codec in c.index_codecs], + "index_location" => string(c.index_location) + ) + ) +end + +""" + getCodec(::Type{ShardingCodec}, d::Dict) + +Deserialize ShardingCodec from JSON configuration dict. +""" +function getCodec(::Type{ShardingCodec}, d::Dict) + config = d["configuration"] + N = length(config["chunk_shape"]) + chunk_shape = NTuple{N,Int}(config["chunk_shape"]) + codecs = [getCodec(codec_dict) for codec_dict in config["codecs"]] + index_codecs = [getCodec(codec_dict) for codec_dict in config["index_codecs"]] + index_location = Symbol(get(config, "index_location", "end")) + return ShardingCodec{N}(chunk_shape, codecs, index_codecs, index_location) +end + +const MAX_UINT64 = typemax(UInt64) + +""" + ShardIndex{N} + +Internal structure representing the shard index. +Shape: (chunks_per_shard..., 2) where last dimension is [offset, nbytes] +Empty chunks are marked with (MAX_UINT64, MAX_UINT64) +""" +struct ShardIndex{N} + offsets_and_lengths::Array{UInt64, N} # Shape: (chunks_per_shard..., 2) +end + +""" + ShardIndex(chunks_per_shard::NTuple{N,Int}) + +Create an empty shard index with all chunks marked as empty. +""" +function ShardIndex(chunks_per_shard::NTuple{N,Int}) where N + arr = fill(MAX_UINT64, (chunks_per_shard..., 2)) + return ShardIndex{N+1}(arr) +end + +""" + get_chunk_slice(idx::ShardIndex, chunk_coords::NTuple{N,Int}) + +Get the byte range (offset, offset+nbytes) for a chunk, or nothing if empty. 
+""" +function get_chunk_slice(idx::ShardIndex, chunk_coords::NTuple{N,Int}) where N + offset = idx.offsets_and_lengths[chunk_coords..., 1] + nbytes = idx.offsets_and_lengths[chunk_coords..., 2] + + if offset == MAX_UINT64 && nbytes == MAX_UINT64 + return nothing + end + + return (Int(offset), Int(offset + nbytes)) +end + +""" + set_chunk_slice!(idx::ShardIndex, chunk_coords::NTuple{N,Int}, offset::Int, nbytes::Int) + +Set the byte range for a chunk in the index. +""" +function set_chunk_slice!(idx::ShardIndex, chunk_coords::NTuple{N,Int}, offset::Int, nbytes::Int) where N + idx.offsets_and_lengths[chunk_coords..., 1] = UInt64(offset) + idx.offsets_and_lengths[chunk_coords..., 2] = UInt64(nbytes) +end + +""" + set_chunk_empty!(idx::ShardIndex, chunk_coords::NTuple{N,Int}) + +Mark a chunk as empty in the index. +""" +function set_chunk_empty!(idx::ShardIndex, chunk_coords::NTuple{N,Int}) where N + idx.offsets_and_lengths[chunk_coords..., 1] = MAX_UINT64 + idx.offsets_and_lengths[chunk_coords..., 2] = MAX_UINT64 +end + +""" + calculate_chunks_per_shard(shard_shape::NTuple{N,Int}, chunk_shape::NTuple{N,Int}) + +Calculate how many chunks fit in each shard dimension. +""" +function calculate_chunks_per_shard(shard_shape::NTuple{N,Int}, chunk_shape::NTuple{N,Int}) where N + return ntuple(i -> div(shard_shape[i], chunk_shape[i]), N) +end + +""" + get_chunk_slice_in_shard(chunk_coords::NTuple{N,Int}, chunk_shape::NTuple{N,Int}, + shard_shape::NTuple{N,Int}) + +Get the array slice ranges for a chunk within a shard. +chunk_coords are 1-based indices. +""" +function get_chunk_slice_in_shard(chunk_coords::NTuple{N,Int}, chunk_shape::NTuple{N,Int}, + shard_shape::NTuple{N,Int}) where N + return ntuple(N) do i + start_idx = (chunk_coords[i] - 1) * chunk_shape[i] + 1 + end_idx = min(chunk_coords[i] * chunk_shape[i], shard_shape[i]) + start_idx:end_idx + end +end + +""" + apply_codec_chain(data, codecs::Vector{V3Codec}) + +Apply codec pipeline in forward order (encoding). 
+""" +function apply_codec_chain(data, codecs::Vector{V3Codec}) + result = data + for codec in codecs + result = zencode(result, codec) + end + return result +end + +""" + reverse_codec_chain(data, codecs::Vector{V3Codec}) + +Apply codec pipeline in reverse order (decoding). +""" +function reverse_codec_chain(data, codecs::Vector{V3Codec}) + result = data + for codec in reverse(codecs) + result = zdecode(result, codec) + end + return result +end + +""" + encode_shard_index(index::ShardIndex, index_codecs::Vector{V3Codec}) + +Encode the shard index using the index codec pipeline. +Per spec: "The index is encoded into binary representations using the specified index codecs." +""" +function encode_shard_index(index::ShardIndex{N}, index_codecs::Vector{V3Codec}) where N + # Index array is stored in C order (row-major) + # Convert to bytes: the index is an array of UInt64 values + index_bytes = reinterpret(UInt8, vec(index.offsets_and_lengths)) + + # Apply index codecs + encoded = apply_codec_chain(index_bytes, index_codecs) + + return encoded +end + +""" + decode_shard_index(index_bytes::Vector{UInt8}, chunks_per_shard::NTuple{N,Int}, + index_codecs::Vector{V3Codec}) + +Decode the shard index from bytes. 
+""" +function decode_shard_index(index_bytes::Vector{UInt8}, chunks_per_shard::NTuple{N,Int}, + index_codecs::Vector{V3Codec}) where N + # Decode using index codecs (in reverse order) + decoded_bytes = reverse_codec_chain(index_bytes, index_codecs) + + # Expected size: 16 bytes (2 * UInt64) per chunk + n_chunks = prod(chunks_per_shard) + expected_length = n_chunks * 2 * sizeof(UInt64) + + if length(decoded_bytes) != expected_length + throw(DimensionMismatch("Index size mismatch: expected $expected_length, got $(length(decoded_bytes))")) + end + + # Reshape to index array: (chunks_per_shard..., 2) + index_array = reshape(reinterpret(UInt64, decoded_bytes), (chunks_per_shard..., 2)) + + return ShardIndex{N+1}(index_array) +end + +""" + compute_encoded_index_size(chunks_per_shard::NTuple{N,Int}, index_codecs::Vector{V3Codec}) + +Compute the byte size of the encoded shard index. +Per spec: "The size of the index can be determined by applying c.compute_encoded_size +for each index codec recursively. The initial size is the byte size of the index array, +i.e. 16 * chunks per shard." +""" +function compute_encoded_index_size(chunks_per_shard::NTuple{N,Int}, index_codecs::Vector{V3Codec}) where N + # Initial size: 16 bytes per chunk (2 * UInt64) + n_chunks = prod(chunks_per_shard) + size = n_chunks * 16 + + # Apply each codec's size transformation + # For most codecs, we need to actually encode to know the size + # For simplicity, we encode an empty index + index = ShardIndex(chunks_per_shard) + encoded = encode_shard_index(index, index_codecs) + + return length(encoded) +end + +""" + zencode!(encoded::Vector{UInt8}, data::AbstractArray, c::ShardingCodec) + +Encode array data using sharding codec following Zarr v3 spec. + +Per spec: "In the sharding_indexed binary format, inner chunks are written successively +in a shard, where unused space between them is allowed, followed by an index referencing them." 
+""" +function zencode!(encoded::Vector{UInt8}, data::AbstractArray, c::ShardingCodec{N}) where N + shard_shape = size(data) + chunks_per_shard = calculate_chunks_per_shard(shard_shape, c.chunk_shape) + + # Create empty index + index = ShardIndex(chunks_per_shard) + + # Buffers for encoded chunks + chunk_buffers = Vector{UInt8}[] + current_offset = 0 + + # Process chunks in C order (row-major) + # Per spec: "The actual order of the chunk content is not fixed" + for cart_idx in CartesianIndices(chunks_per_shard) + chunk_coords = Tuple(cart_idx) + + # Extract chunk data from shard + slice_ranges = get_chunk_slice_in_shard(chunk_coords, c.chunk_shape, shard_shape) + chunk_data = data[slice_ranges...] + + # Encode chunk using codec pipeline + encoded_chunk = apply_codec_chain(chunk_data, c.codecs) + + # Skip if chunk is empty (no bytes) + if isempty(encoded_chunk) + set_chunk_empty!(index, chunk_coords) + continue + end + + nbytes = length(encoded_chunk) + + # Record offset and length in index + set_chunk_slice!(index, chunk_coords, current_offset, nbytes) + + push!(chunk_buffers, encoded_chunk) + current_offset += nbytes + end + + # Encode the index + encoded_index = encode_shard_index(index, c.index_codecs) + index_size = length(encoded_index) + + # If index is at start, adjust all offsets to account for index size + if c.index_location == :start + # Add index_size to all non-empty chunk offsets + for cart_idx in CartesianIndices(chunks_per_shard) + chunk_coords = Tuple(cart_idx) + offset = index.offsets_and_lengths[chunk_coords..., 1] + if offset != MAX_UINT64 + index.offsets_and_lengths[chunk_coords..., 1] = offset + index_size + end + end + # Re-encode index with corrected offsets + encoded_index = encode_shard_index(index, c.index_codecs) + end + + # If all chunks are empty, return empty buffer (no shard) + if isempty(chunk_buffers) + resize!(encoded, 0) + return encoded + end + + # Assemble final shard: [index] + chunks or chunks + [index] + total_size = 
(c.index_location == :start ? index_size : 0) + + current_offset + + (c.index_location == :end ? index_size : 0) + + resize!(encoded, total_size) + output = IOBuffer(encoded, write=true) + + if c.index_location == :start + write(output, encoded_index) + for buf in chunk_buffers + write(output, buf) + end + else # :end + for buf in chunk_buffers + write(output, buf) + end + write(output, encoded_index) + end + + return encoded +end + +""" + zdecode!(data::AbstractArray, encoded::Vector{UInt8}, c::ShardingCodec) + +Decode sharded data back to array following Zarr v3 spec. + +Per spec: "A simple implementation to decode inner chunks in a shard would +(a) read the entire value from the store into a byte buffer, +(b) parse the shard index from the beginning or end of the buffer and +(c) cut out the relevant bytes that belong to the requested chunk." +""" +function zdecode!(data::AbstractArray, encoded::Vector{UInt8}, c::ShardingCodec{N}) where N + # Handle empty shard (no data) + if isempty(encoded) + fill!(data, zero(eltype(data))) # Fill with zeros (or should use fill_value from spec) + return data + end + + shard_shape = size(data) + chunks_per_shard = calculate_chunks_per_shard(shard_shape, c.chunk_shape) + + # Compute encoded index size + index_size = compute_encoded_index_size(chunks_per_shard, c.index_codecs) + + # Extract index bytes based on location + if c.index_location == :start + index_bytes = encoded[1:index_size] + chunk_data_offset = index_size + else # :end + index_bytes = encoded[end-index_size+1:end] + chunk_data_offset = 0 + end + + # Decode the index + index = decode_shard_index(index_bytes, chunks_per_shard, c.index_codecs) + + # Decode each chunk and place into output array + for cart_idx in CartesianIndices(chunks_per_shard) + chunk_coords = Tuple(cart_idx) + + # Get chunk byte range from index + chunk_slice = get_chunk_slice(index, chunk_coords) + + # Get array slice for this chunk + array_slice = get_chunk_slice_in_shard(chunk_coords, 
c.chunk_shape, shard_shape) + + if chunk_slice === nothing + # Empty chunk - fill with zeros (or fill_value) + # Per spec: "Empty inner chunks are interpreted as being filled with the fill value" + data[array_slice...] .= zero(eltype(data)) + continue + end + + # Extract chunk bytes + # Offsets in index are relative to start of chunk data + offset_start, offset_end = chunk_slice + + # Adjust for where chunk data begins in the shard + byte_start = chunk_data_offset + offset_start + 1 # Julia 1-based indexing + byte_end = chunk_data_offset + offset_end + + encoded_chunk = encoded[byte_start:byte_end] + + # Decode chunk using codec pipeline (in reverse) + decoded_chunk = reverse_codec_chain(encoded_chunk, c.codecs) + + # Place decoded chunk into output array + expected_shape = length.(array_slice) + data[array_slice...] = reshape(decoded_chunk, expected_shape) + end + + return data +end + struct TransposeCodec <: V3Codec{:array, :array} end name(::TransposeCodec) = "transpose" diff --git a/src/metadata3.jl b/src/metadata3.jl index 374cc5d5..5a45c69e 100644 --- a/src/metadata3.jl +++ b/src/metadata3.jl @@ -171,7 +171,9 @@ function Metadata3(d::AbstractDict, fill_as_missing) elseif codec_name == "sharding_indexed" # array -> bytes check_codec_data_type(codec_name, :array, :bytes) - throw(ArgumentError("Zarr.jl currently does not support the $(codec["name"]) codec")) + # TODO: Implement sharding codec support + # See implementation suggestions in src/Codecs/V3/V3.jl for ShardingCodec + throw(ArgumentError("Zarr.jl currently does not support the $(codec["name"]) codec. 
See src/Codecs/V3/V3.jl for implementation suggestions.")) elseif codec_name == "crc32c" # bytes -> bytes check_codec_data_type(codec_name, :bytes, :bytes) From c3ba31e8ff5c88d586e9aeec524b01ae8e83035a Mon Sep 17 00:00:00 2001 From: Lazaro Alonso Date: Mon, 17 Nov 2025 09:33:07 +0100 Subject: [PATCH 36/39] offset nbytes order --- src/Codecs/V3/V3.jl | 96 ++++++++++++++++++++++++++++++--------------- 1 file changed, 65 insertions(+), 31 deletions(-) diff --git a/src/Codecs/V3/V3.jl b/src/Codecs/V3/V3.jl index 89a0405f..2cf3c58a 100644 --- a/src/Codecs/V3/V3.jl +++ b/src/Codecs/V3/V3.jl @@ -151,15 +151,27 @@ end const MAX_UINT64 = typemax(UInt64) +""" + ChunkShardInfo + +Information about a chunk's location within a shard. +""" +struct ChunkShardInfo + offset::UInt64 # Byte offset within shard where chunk begins + nbytes::UInt64 # Number of bytes the chunk occupies +end + +ChunkShardInfo() = ChunkShardInfo(MAX_UINT64, MAX_UINT64) # Empty chunk marker + """ ShardIndex{N} Internal structure representing the shard index. -Shape: (chunks_per_shard..., 2) where last dimension is [offset, nbytes] -Empty chunks are marked with (MAX_UINT64, MAX_UINT64) +Stores chunk location info for an N-dimensional grid of chunks. +Empty chunks are marked with ChunkShardInfo(MAX_UINT64, MAX_UINT64) """ struct ShardIndex{N} - offsets_and_lengths::Array{UInt64, N} # Shape: (chunks_per_shard..., 2) + chunks::Array{ChunkShardInfo, N} # N-dimensional array of chunk info end """ @@ -168,8 +180,8 @@ end Create an empty shard index with all chunks marked as empty. """ function ShardIndex(chunks_per_shard::NTuple{N,Int}) where N - arr = fill(MAX_UINT64, (chunks_per_shard..., 2)) - return ShardIndex{N+1}(arr) + chunks = fill(ChunkShardInfo(), chunks_per_shard) + return ShardIndex{N}(chunks) end """ @@ -178,14 +190,13 @@ end Get the byte range (offset, offset+nbytes) for a chunk, or nothing if empty. 
""" function get_chunk_slice(idx::ShardIndex, chunk_coords::NTuple{N,Int}) where N - offset = idx.offsets_and_lengths[chunk_coords..., 1] - nbytes = idx.offsets_and_lengths[chunk_coords..., 2] + info = idx.chunks[chunk_coords...] - if offset == MAX_UINT64 && nbytes == MAX_UINT64 + if info.offset == MAX_UINT64 && info.nbytes == MAX_UINT64 return nothing end - return (Int(offset), Int(offset + nbytes)) + return (Int(info.offset), Int(info.offset + info.nbytes)) end """ @@ -194,8 +205,7 @@ end Set the byte range for a chunk in the index. """ function set_chunk_slice!(idx::ShardIndex, chunk_coords::NTuple{N,Int}, offset::Int, nbytes::Int) where N - idx.offsets_and_lengths[chunk_coords..., 1] = UInt64(offset) - idx.offsets_and_lengths[chunk_coords..., 2] = UInt64(nbytes) + idx.chunks[chunk_coords...] = ChunkShardInfo(UInt64(offset), UInt64(nbytes)) end """ @@ -204,8 +214,7 @@ end Mark a chunk as empty in the index. """ function set_chunk_empty!(idx::ShardIndex, chunk_coords::NTuple{N,Int}) where N - idx.offsets_and_lengths[chunk_coords..., 1] = MAX_UINT64 - idx.offsets_and_lengths[chunk_coords..., 2] = MAX_UINT64 + idx.chunks[chunk_coords...] = ChunkShardInfo() end """ @@ -218,14 +227,12 @@ function calculate_chunks_per_shard(shard_shape::NTuple{N,Int}, chunk_shape::NTu end """ - get_chunk_slice_in_shard(chunk_coords::NTuple{N,Int}, chunk_shape::NTuple{N,Int}, - shard_shape::NTuple{N,Int}) + get_chunk_slice_in_shard(chunk_coords::NTuple{N,Int}, chunk_shape::NTuple{N,Int}, shard_shape::NTuple{N,Int}) Get the array slice ranges for a chunk within a shard. chunk_coords are 1-based indices. 
""" -function get_chunk_slice_in_shard(chunk_coords::NTuple{N,Int}, chunk_shape::NTuple{N,Int}, - shard_shape::NTuple{N,Int}) where N +function get_chunk_slice_in_shard(chunk_coords::NTuple{N,Int}, chunk_shape::NTuple{N,Int}, shard_shape::NTuple{N,Int}) where N return ntuple(N) do i start_idx = (chunk_coords[i] - 1) * chunk_shape[i] + 1 end_idx = min(chunk_coords[i] * chunk_shape[i], shard_shape[i]) @@ -263,13 +270,28 @@ end encode_shard_index(index::ShardIndex, index_codecs::Vector{V3Codec}) Encode the shard index using the index codec pipeline. -Per spec: "The index is encoded into binary representations using the specified index codecs." + +Per Zarr v3 spec, the index is linearized in C-order (row-major) with alternating +offset/nbytes values: [chunk_0_offset, chunk_0_nbytes, chunk_1_offset, chunk_1_nbytes, ...] +``` """ function encode_shard_index(index::ShardIndex{N}, index_codecs::Vector{V3Codec}) where N - # Index array is stored in C order (row-major) - # Convert to bytes: the index is an array of UInt64 values - index_bytes = reinterpret(UInt8, vec(index.offsets_and_lengths)) + # Pre-allocate buffer for index data + n_chunks = length(index.chunks) + index_data = Vector{UInt64}(undef, 2 * n_chunks) + # Iterate in C-order (row-major) and interleave offset/nbytes + idx = 1 + for cart_idx in CartesianIndices(index.chunks) + info = index.chunks[cart_idx] + index_data[idx] = info.offset + index_data[idx + 1] = info.nbytes + idx += 2 + end + + # Convert to bytes + index_bytes = reinterpret(UInt8, index_data) + # Apply index codecs encoded = apply_codec_chain(index_bytes, index_codecs) @@ -277,13 +299,14 @@ function encode_shard_index(index::ShardIndex{N}, index_codecs::Vector{V3Codec}) end """ - decode_shard_index(index_bytes::Vector{UInt8}, chunks_per_shard::NTuple{N,Int}, - index_codecs::Vector{V3Codec}) + decode_shard_index(index_bytes::Vector{UInt8}, chunks_per_shard::NTuple{N,Int}, index_codecs::Vector{V3Codec}) Decode the shard index from bytes. 
+
+The bytes are in C-order with alternating offset/nbytes:
+[offset0, nbytes0, offset1, nbytes1, ...]
 """
-function decode_shard_index(index_bytes::Vector{UInt8}, chunks_per_shard::NTuple{N,Int},
-                            index_codecs::Vector{V3Codec}) where N
+function decode_shard_index(index_bytes::Vector{UInt8}, chunks_per_shard::NTuple{N,Int}, index_codecs::Vector{V3Codec}) where N
     # Decode using index codecs (in reverse order)
     decoded_bytes = reverse_codec_chain(index_bytes, index_codecs)
 
@@ -295,10 +318,21 @@ function decode_shard_index(index_bytes::Vector{UInt8}, chunks_per_shard::NTuple
         throw(DimensionMismatch("Index size mismatch: expected $expected_length, got $(length(decoded_bytes))"))
     end
 
-    # Reshape to index array: (chunks_per_shard..., 2)
-    index_array = reshape(reinterpret(UInt64, decoded_bytes), (chunks_per_shard..., 2))
+    # Reinterpret as UInt64 array: [offset1, nbytes1, offset2, nbytes2, ...]
+    index_data = reinterpret(UInt64, decoded_bytes)
+
+    # Reconstruct the N-dimensional array of ChunkShardInfo
+    chunks = Array{ChunkShardInfo, N}(undef, chunks_per_shard)
+
+    idx = 1
+    for cart_idx in CartesianIndices(chunks)
+        offset = index_data[idx]
+        nbytes = index_data[idx + 1]
+        chunks[cart_idx] = ChunkShardInfo(offset, nbytes)
+        idx += 2
+    end
 
-    return ShardIndex{N+1}(index_array)
+    return ShardIndex{N}(chunks)
 end
 
 """
@@ -378,9 +412,9 @@ function zencode!(encoded::Vector{UInt8}, data::AbstractArray, c::ShardingCodec{
         # Add index_size to all non-empty chunk offsets
         for cart_idx in CartesianIndices(chunks_per_shard)
             chunk_coords = Tuple(cart_idx)
-            offset = index.offsets_and_lengths[chunk_coords..., 1]
-            if offset != MAX_UINT64
-                index.offsets_and_lengths[chunk_coords..., 1] = offset + index_size
+            info = index.chunks[cart_idx]
+            if info.offset != MAX_UINT64
+                index.chunks[cart_idx] = ChunkShardInfo(info.offset + index_size, info.nbytes)
             end
         end
         # Re-encode index with corrected offsets

From f888a152ead3b3f4218f750303914467ef7fc2c9 Mon Sep 17 00:00:00 2001
From: Fabian Gans 
Date: Thu, 27 Nov 2025 16:58:58 +0100 Subject: [PATCH 37/39] Move chunk encoding logic away from storage but into metadata --- src/Storage/Storage.jl | 157 +++++++++++++++++++++++++--------- src/Storage/formattedstore.jl | 104 +++++++++++----------- src/ZArray.jl | 71 +++++++-------- src/ZGroup.jl | 95 ++++++++++---------- src/Zarr.jl | 8 ++ src/metadata.jl | 100 ++++++++-------------- src/metadata3.jl | 16 ++-- test/runtests.jl | 21 ++--- 8 files changed, 310 insertions(+), 262 deletions(-) diff --git a/src/Storage/Storage.jl b/src/Storage/Storage.jl index b5816b21..a816728c 100644 --- a/src/Storage/Storage.jl +++ b/src/Storage/Storage.jl @@ -78,31 +78,53 @@ Returns the keys of files in the given store. """ function subkeys end +# Default Zarr v2 separator +const DS2 = '.' +# Default Zarr v3 separator +const DS3 = '/' -""" - Base.delete!(d::AbstractStore, k::String) +default_sep(::ZarrFormat{2}) = DS2 +default_sep(::ZarrFormat{3}) = DS3 +default_prefix(::ZarrFormat{2}) = false +default_prefix(::ZarrFormat{3}) = true +const DS = default_sep(DV) + +ZarrFormat(s::AbstractStore, path) = is_zarr2(s, path) ? ZarrFormat(2) : + is_zarr3(s, path) ? ZarrFormat(3) : + throw(ArgumentError("Specified store $s in path $(path) is neither a ZArray nor a ZGroup in a recognized zarr format.")) -Deletes the given key from the store. -""" -citostring(i::CartesianIndex) = join(reverse((i - oneunit(i)).I), '.') -citostring(::CartesianIndex{0}) = "0" +@inline function citostring(e::ChunkEncoding, i::CartesianIndex) + if e.prefix + "c$(e.sep)" * join(reverse((i - oneunit(i)).I), e.sep) + else + join(reverse((i - oneunit(i)).I), e.sep) + end +end +@inline citostring(e::ChunkEncoding, ::CartesianIndex{0}) = e.prefix ? "c$(e.sep)0" : "0" + _concatpath(p,s) = isempty(p) ? 
s : rstrip(p,'/') * '/' * s -Base.getindex(s::AbstractStore, p, i::CartesianIndex) = s[p, citostring(i)] -Base.getindex(s::AbstractStore, p, i) = s[_concatpath(p,i)] -Base.delete!(s::AbstractStore, p, i::CartesianIndex) = delete!(s, p, citostring(i)) -Base.delete!(s::AbstractStore, p, i) = delete!(s, _concatpath(p,i)) -Base.haskey(s::AbstractStore, k) = isinitialized(s,k) -Base.setindex!(s::AbstractStore,v,p,i) = setindex!(s,v,_concatpath(p,i)) -Base.setindex!(s::AbstractStore,v,p,i::CartesianIndex) = s[p, citostring(i)]=v +# Function to read a chunk from store s +store_readchunk(s::AbstractStore, p, i::CartesianIndex, e::ChunkEncoding) = s[p, citostring(e, i)] +store_deletechunk(s::AbstractStore, p, i::CartesianIndex, e::ChunkEncoding) = delete!(s, p, citostring(e, i)) +store_writechunk(s::AbstractStore, v, p, i::CartesianIndex, e::ChunkEncoding) = s[p, citostring(e, i)] = v +store_isinitialized(s::AbstractStore, p, i::CartesianIndex, e::ChunkEncoding) = isinitialized(s, p, citostring(e, i)) + + +#Functions to concat path and key +Base.getindex(s::AbstractStore, p, i::AbstractString) = s[_concatpath(p, i)] +Base.delete!(s::AbstractStore, p, i::AbstractString) = delete!(s, _concatpath(p, i)) +Base.haskey(s::AbstractStore, k::AbstractString) = isinitialized(s, k) +Base.setindex!(s::AbstractStore, v, p, i::AbstractString) = setindex!(s, v, _concatpath(p, i)) + maybecopy(x) = copy(x) maybecopy(x::String) = x -function getattrs(s::AbstractStore, p) +function getattrs(::ZarrFormat{2}, s::AbstractStore, p) atts = s[p,".zattrs"] if atts === nothing Dict() @@ -110,7 +132,18 @@ function getattrs(s::AbstractStore, p) JSON.parse(replace(String(maybecopy(atts)),": NaN,"=>": \"NaN\","); dicttype = Dict{String,Any}) end end -function writeattrs(s::AbstractStore, p, att::Dict; indent_json::Bool= false) + +function getattrs(::ZarrFormat{3}, s::AbstractStore, p) + md = s[p, "zarr.json"] + if md === nothing + error("zarr.json not found") + else + md = 
JSON.parse(replace(String(maybecopy(md)), ": NaN," => ": \"NaN\",")) + return get(md, "attributes", Dict{String,Any}()) + end +end + +function writeattrs(::ZarrFormat{2}, s::AbstractStore, p, att::Dict; indent_json::Bool=false) b = IOBuffer() if indent_json @@ -123,19 +156,50 @@ function writeattrs(s::AbstractStore, p, att::Dict; indent_json::Bool= false) att end +function writeattrs(::ZarrFormat{3}, s::AbstractStore, p, att::Dict; indent_json::Bool=false) + # This is messy, we need to open zarr.json and replace the attributes section + md = s[p, "zarr.json"] + if md === nothing + error("zarr.json not found") + else + md = JSON.parse(replace(String(maybecopy(md)), ": NaN," => ": \"NaN\",")) + end + md = Dict(md) + md["attributes"] = att + + b = IOBuffer() + + if indent_json + JSON.print(b, md, 4) + else + JSON.print(b, md) + end + + s[p, "zarr.json"] = take!(b) + att +end + is_zarr3(s::AbstractStore, p) = isinitialized(s,_concatpath(p,"zarr.json")) -is_zarr2(s::AbstractStore, p) = is_z2array(s, p) || is_z2group(s,p) -is_zgroup(s::AbstractStore, p) = is_z2group(s,p) -is_zarray(s::AbstractStore, p) = is_z2array(s,p) -is_z2group(s::AbstractStore, p) = isinitialized(s,_concatpath(p,".zgroup")) -is_z2array(s::AbstractStore, p) = isinitialized(s,_concatpath(p,".zarray")) - -isinitialized(s::AbstractStore, p, i::CartesianIndex)=isinitialized(s,p,citostring(i)) -isinitialized(s::AbstractStore, p, i) = isinitialized(s,_concatpath(p,i)) -isinitialized(s::AbstractStore, i) = s[i] !== nothing - -getmetadata(s::AbstractStore, p,fill_as_missing) = Metadata(String(maybecopy(s[p,".zarray"])),fill_as_missing) -function writemetadata(s::AbstractStore, p, m::AbstractMetadata; indent_json::Bool= false) +is_zarr2(s::AbstractStore, p) = is_zarray(ZarrFormat(Val(2)), s, p) || is_zgroup(ZarrFormat((Val(2))), s, p) + +is_zgroup(::ZarrFormat{2}, s::AbstractStore, p) = isinitialized(s, _concatpath(p, ".zgroup")) +is_zarray(::ZarrFormat{2}, s::AbstractStore, p) = isinitialized(s, 
_concatpath(p, ".zarray")) +is_zgroup(::ZarrFormat{3}, s::AbstractStore, p, metadata=getmetadata(s, p, false)) = + isinitialized(s, _concatpath(p, "zarr.json")) && + metadata.node_type == "group" +is_zarray(::ZarrFormat{3}, s::AbstractStore, p, metadata=getmetadata(s, p, false)) = + isinitialized(s, _concatpath(p, "zarr.json")) && + metadata.node_type == "array" + + +isinitialized(s::AbstractStore, p, i::AbstractString) = isinitialized(s, _concatpath(p, i)) +isinitialized(s::AbstractStore, i::AbstractString) = s[i] !== nothing + +getmetadata(::ZarrFormat{2}, s::AbstractStore, p, fill_as_missing) = Metadata(String(maybecopy(s[p, ".zarray"])), fill_as_missing) + +getmetadata(::ZarrFormat{3}, s::AbstractStore, p, fill_as_missing) = Metadata(String(maybecopy(s[p, "zarr.json"])), fill_as_missing) + +function writemetadata(::ZarrFormat{2}, s::AbstractStore, p, m::AbstractMetadata; indent_json::Bool=false) met = IOBuffer() if indent_json @@ -147,6 +211,19 @@ function writemetadata(s::AbstractStore, p, m::AbstractMetadata; indent_json::Bo s[p,".zarray"] = take!(met) m end +function writemetadata(::ZarrFormat{3}, s::AbstractStore, p, m::AbstractMetadata; indent_json::Bool=false) + met = IOBuffer() + + if indent_json + JSON.print(met, m, 4) + else + JSON.print(met, m) + end + + s[p, "zarr.json"] = take!(met) + m +end + ## Handling sequential vs parallel IO @@ -160,50 +237,50 @@ channelsize(s) = channelsize(store_read_strategy(s)) channelsize(::SequentialRead) = 0 channelsize(c::ConcurrentRead) = c.ntasks -read_items!(s::AbstractStore,c::AbstractChannel, p, i) = read_items!(s,c,store_read_strategy(s),p,i) -function read_items!(s::AbstractStore,c::AbstractChannel, ::SequentialRead ,p,i) +read_items!(s::AbstractStore, c::AbstractChannel, e::ChunkEncoding, p, i) = read_items!(s, c, store_read_strategy(s), e, p, i) +function read_items!(s::AbstractStore, c::AbstractChannel, ::SequentialRead, e::ChunkEncoding, p, i) for ii in i - res = s[p,ii] + res = store_readchunk(s, p, ii, e) 
put!(c,(ii=>res))
     end
 end
-function read_items!(s::AbstractStore,c::AbstractChannel, r::ConcurrentRead ,p,i)
+function read_items!(s::AbstractStore, c::AbstractChannel, r::ConcurrentRead, e::ChunkEncoding, p, i)
     ntasks = r.ntasks
     #@show ntasks
     asyncmap(i,ntasks = ntasks) do ii
         #@show ii,objectid(current_task),p
-        res = s[p,ii]
+        res = store_readchunk(s, p, ii, e)
         #@show ii,length(res)
         put!(c,(ii=>res))
         nothing
     end
 end
 
-write_items!(s::AbstractStore,c::AbstractChannel, p, i) = write_items!(s,c,store_read_strategy(s),p,i)
-function write_items!(s::AbstractStore,c::AbstractChannel, ::SequentialRead ,p,i)
+write_items!(s::AbstractStore, c::AbstractChannel, e::ChunkEncoding, p, i) = write_items!(s, c, store_read_strategy(s), e, p, i)
+function write_items!(s::AbstractStore, c::AbstractChannel, ::SequentialRead, e::ChunkEncoding, p, i)
     for _ in 1:length(i)
         ii,data = take!(c)
         if data === nothing
             if isinitialized(s,p,ii)
-                delete!(s,p,ii)
+                store_deletechunk(s, p, ii, e)
             end
         else
-            s[p,ii] = data
+            store_writechunk(s, data, p, ii, e)
         end
     end
     close(c)
 end
-function write_items!(s::AbstractStore,c::AbstractChannel, r::ConcurrentRead ,p,i)
+function write_items!(s::AbstractStore, c::AbstractChannel, r::ConcurrentRead, e::ChunkEncoding, p, i)
     ntasks = r.ntasks
     asyncmap(i,ntasks = ntasks) do _
         ii,data = take!(c)
         if data === nothing
             if isinitialized(s,ii)
-                delete!(s,ii)
+                store_deletechunk(s, p, ii, e)
             end
         else
-            s[p,ii] = data
+            store_writechunk(s, data, p, ii, e)
         end
         nothing
     end
@@ -217,7 +294,7 @@ isemptysub(s::AbstractStore, p) = isempty(subkeys(s,p)) && isempty(subdirs(s,p))
 storageregexlist = Pair[]
 push!(storageregexlist, r"^s3://" => S3Store)
 
-include("formattedstore.jl")
+#include("formattedstore.jl")
 include("directorystore.jl")
 include("dictstore.jl")
 include("gcstore.jl")
diff --git a/src/Storage/formattedstore.jl b/src/Storage/formattedstore.jl
index 782c1806..d908b340 100644
--- a/src/Storage/formattedstore.jl
+++ b/src/Storage/formattedstore.jl
@@ -13,75 
+13,79 @@ default_sep(version) = version == 2 ? DS2 : error("Unknown version: $version") const DS = default_sep(DV) -# Chunk Key Encodings for Zarr v3 -# A Char is the separator for the default chunk key encoding -abstract type ChunkKeyEncoding end -struct V2ChunkKeyEncoding{SEP} <: ChunkKeyEncoding end -separator(c::Char) = c -separator(v2cke::V2ChunkKeyEncoding{SEP}) where SEP = SEP +# # Chunk Key Encodings for Zarr v3 +# # A Char is the separator for the default chunk key encoding +# abstract type ChunkKeyEncoding end +struct V2ChunkKeyEncoding <: ChunkKeyEncoding + sep::Char +end -""" - FormattedStore{V,CKE,STORE <: AbstractStore} <: AbstractStore +struct V3ChunkKeyEncoding <: ChunkKeyEncoding + sep::Char +end -FormattedStore wraps an AbstractStore to indicate a specific Zarr format. -The path of a chunk depends on the version and chunk key encoding. +# """ +# FormattedStore{V,CKE,STORE <: AbstractStore} <: AbstractStore -# Type Parameters +# FormattedStore wraps an AbstractStore to indicate a specific Zarr format. +# The path of a chunk depends on the version and chunk key encoding. -- V: Zarr format version -- CKE: Chunk key encoding or dimension separator. - CKE could be a `Char` or a subtype of `ChunkKeyEncoding`. -- STORE: Type of AbstractStore wrapped +# # Type Parameters -# Chunk Path Formats +# - V: Zarr format version +# - CKE: Chunk key encoding or dimension separator. +# CKE could be a `Char` or a subtype of `ChunkKeyEncoding`. +# - STORE: Type of AbstractStore wrapped -## Zarr version 2 +# # Chunk Path Formats -### '.' dimension separator (default) +# ## Zarr version 2 -Chunks are encoded as "1.2.3" +# ### '.' 
dimension separator (default) -### '/' dimension separator +# Chunks are encoded as "1.2.3" -Chunks are encoded as "1/2/3" +# ### '/' dimension separator -## Zarr version 3 +# Chunks are encoded as "1/2/3" -### '/' dimension separator (default) +# ## Zarr version 3 -Chunks are encoded as "c/1/2/3" +# ### '/' dimension separator (default) -### '.' dimension separator +# Chunks are encoded as "c/1/2/3" -Chunks are encoded as "c.1.2.3" +# ### '.' dimension separator -### V2ChunkKeyEncoding{SEP} +# Chunks are encoded as "c.1.2.3" -See Zarr version 2 -""" -struct FormattedStore{V,SEP,STORE <: AbstractStore} <: AbstractStore - parent::STORE -end -FormattedStore(args...) = FormattedStore{DV,DS}(args...) -FormattedStore(s::FormattedStore) = s -FormattedStore{V}(args...) where V = FormattedStore{V, default_sep(V)}(args...) -FormattedStore{V}(s::FormattedStore{<:Any,S}) where {V,S} = FormattedStore{V, S}(s) -FormattedStore{<: Any, S}(args...) where S = FormattedStore{DV, S}(args...) -FormattedStore{<: Any, S}(s::FormattedStore{V}) where {V,S} = FormattedStore{V, S}(s) -function FormattedStore{V,S}(store::AbstractStore) where {V,S} - return FormattedStore{V,S,typeof(store)}(store) -end -function FormattedStore{V,S}(store::FormattedStore) where {V,S} - p = parent(store) - return FormattedStore{V,S,typeof(p)}(p) -end +# ### V2ChunkKeyEncoding{SEP} + +# See Zarr version 2 +# """ +# struct FormattedStore{V,SEP,STORE <: AbstractStore} <: AbstractStore +# parent::STORE +# end +# FormattedStore(args...) = FormattedStore{DV,DS}(args...) +# FormattedStore(s::FormattedStore) = s +# FormattedStore{V}(args...) where V = FormattedStore{V, default_sep(V)}(args...) +# FormattedStore{V}(s::FormattedStore{<:Any,S}) where {V,S} = FormattedStore{V, S}(s) +# FormattedStore{<: Any, S}(args...) where S = FormattedStore{DV, S}(args...) 
+# FormattedStore{<: Any, S}(s::FormattedStore{V}) where {V,S} = FormattedStore{V, S}(s) +# function FormattedStore{V,S}(store::AbstractStore) where {V,S} +# return FormattedStore{V,S,typeof(store)}(store) +# end +# function FormattedStore{V,S}(store::FormattedStore) where {V,S} +# p = parent(store) +# return FormattedStore{V,S,typeof(p)}(p) +# end -Base.parent(store::FormattedStore) = store.parent +# Base.parent(store::FormattedStore) = store.parent -@inline citostring(i::CartesianIndex, version::Int, sep::Char=default_sep(version)) = (version == 3 ? "c$sep" : "" ) * join(reverse((i - oneunit(i)).I), sep) -@inline citostring(::CartesianIndex{0}, version::Int, sep::Char=default_sep(version)) = (version == 3 ? "c$(sep)0" : "0" ) -@inline citostring(i::CartesianIndex, ::Int, ::Type{V2ChunkKeyEncoding{S}}) where S = citostring(i, 2, S) -citostring(i::CartesianIndex, s::FormattedStore{V, S}) where {V,S} = citostring(i, V, S) +@inline citostring(i::CartesianIndex, cke::V3ChunkKeyEncoding) = "c$(cke.sep)" * join(reverse((i - oneunit(i)).I), cke.sep) +@inline citostring(::CartesianIndex{0}, cke::V3ChunkKeyEncoding) = "c$(cke.sep)0" +@inline citostring(i::CartesianIndex, cke::V2ChunkKeyEncoding) = join(reverse((i - oneunit(i)).I), cke.sep) +@inline citostring(::CartesianIndex{0}, cke::V2ChunkKeyEncoding) = "0" Base.getindex(s::FormattedStore, p, i::CartesianIndex) = s[p, citostring(i,s)] Base.delete!(s::FormattedStore, p, i::CartesianIndex) = delete!(s, p, citostring(i,s)) diff --git a/src/ZArray.jl b/src/ZArray.jl index aca7ac35..67817897 100644 --- a/src/ZArray.jl +++ b/src/ZArray.jl @@ -30,10 +30,8 @@ Base.IndexStyle(::Type{<:SenMissArray})=Base.IndexLinear() # Struct representing a Zarr Array in Julia, note that # chunks(chunk size) and size are always in Julia column-major order -# Currently this is not an AbstractArray, because indexing single elements is -# would be really slow, although most AbstractArray interface functions are implemented -struct ZArray{T, N, 
C<:Compressor, S<:AbstractStore} <: AbstractDiskArray{T,N} - metadata::AbstractMetadata{T, N, C} +struct ZArray{T,N,S<:AbstractStore,M<:AbstractMetadata{T,N}} <: AbstractDiskArray{T,N} + metadata::M storage::S path::String attrs::Dict @@ -117,15 +115,22 @@ function zinfo(io::IO,z::ZArray) end end -function ZArray(s::T, mode="r",path="";fill_as_missing=false) where T <: AbstractStore - metadata = getmetadata(s,path,fill_as_missing) - attrs = getattrs(s,path) +function ZArray(s::T, mode="r", path="", zarr_format=:auto; fill_as_missing=false) where T<:AbstractStore + zv = if zarr_format == :auto + ZarrFormat(s, path) + else + ZarrFormat(zarr_format) + end + metadata = getmetadata(zv, s, path, fill_as_missing) + attrs = getattrs(zv, s, path) writeable = mode == "w" startswith(path,"/") && error("Paths should never start with a leading '/'") - ZArray{eltype(metadata), length(metadata.shape[]), typeof(metadata.compressor), T}( - metadata, s, path, attrs, writeable) + ZArray(metadata, s, string(path), attrs, writeable) end +zarr_format(z::ZArray) = zarr_format(z.metadata) +dimension_separator(z::ZArray) = dimension_separator(z.metadata) + """ trans_ind(r, bs) @@ -174,7 +179,7 @@ function readblock!(aout::AbstractArray{<:Any,N}, z::ZArray{<:Any, N}, r::Cartes c = Channel{Pair{eltype(blockr),Union{Nothing,Vector{UInt8}}}}(channelsize(z.storage)) task = @async begin - read_items!($z.storage,c, $z.path, $blockr) + read_items!($(z.storage), c, $(z.metadata.chunk_encoding), $(z.path), $(blockr)) end bind(c,task) @@ -210,14 +215,14 @@ function writeblock!(ain::AbstractArray{<:Any,N}, z::ZArray{<:Any, N}, r::Cartes readchannel = Channel{Pair{eltype(blockr),Union{Nothing,Vector{UInt8}}}}(channelsize(z.storage)) readtask = @async begin - read_items!(z.storage,readchannel, z.path, blockr) + read_items!(z.storage, readchannel, z.metadata.chunk_encoding, z.path, blockr) end bind(readchannel,readtask) writechannel = 
Channel{Pair{eltype(blockr),Union{Nothing,Vector{UInt8}}}}(channelsize(z.storage)) writetask = @async begin - write_items!(z.storage,writechannel,z.path,blockr) + write_items!(z.storage, writechannel, z.metadata.chunk_encoding, z.path, blockr) end bind(writechannel,writetask) @@ -332,17 +337,12 @@ function zcreate(::Type{T}, dims::Integer...; kwargs... ) where T - if dimension_separator isa AbstractString - # Convert AbstractString to Char - dimension_separator = only(dimension_separator) - end - if path===nothing - store = FormattedStore{zarr_format, dimension_separator}(DictStore()) + store = DictStore() else - store = FormattedStore{zarr_format, dimension_separator}(DirectoryStore(joinpath(path,name))) + store = DirectoryStore(joinpath(path, name)) end - zcreate(T, store, dims...; zarr_format, kwargs...) + zcreate(T, store, dims...; zarr_format, dimension_separator, kwargs...) end function zcreate(::Type{T},storage::AbstractStore, @@ -360,12 +360,15 @@ function zcreate(::Type{T},storage::AbstractStore, dimension_separator=nothing ) where {T} + v = ZarrFormat(zarr_format) if isnothing(dimension_separator) - dimension_separator = Zarr.dimension_separator(storage) - elseif dimension_separator != Zarr.dimension_separator(storage) - error("The dimension separator keyword value, $dimension_separator, - must agree with the dimension separator type parameter, $(Zarr.dimension_separator(storage))") + dimension_separator = default_sep(v) + end + if dimension_separator isa AbstractString + # Convert AbstractString to Char + dimension_separator = only(dimension_separator) end + chunk_encoding = ChunkEncoding(dimension_separator, default_prefix(v)) length(dims) == length(chunks) || throw(DimensionMismatch("Dims must have the same length as chunks")) N = length(dims) @@ -374,13 +377,12 @@ function zcreate(::Type{T},storage::AbstractStore, # Create a dummy array to use with Metadata constructor # This allows us to leverage the multiple dispatch in Metadata constructors 
dummy_array = Array{T,N}(undef, dims...) - metadata = Metadata(dummy_array, chunks; - zarr_format=zarr_format, + metadata = Metadata(dummy_array, chunks, v; compressor=compressor, fill_value=fill_value, filters=filters, fill_as_missing=fill_as_missing, - dimension_separator=dimension_separator + chunk_encoding=chunk_encoding ) # Extract the element type from the metadata (handles T2 calculation) @@ -388,12 +390,11 @@ function zcreate(::Type{T},storage::AbstractStore, isemptysub(storage,path) || error("$storage $path is not empty") - writemetadata(storage, path, metadata, indent_json=indent_json) + writemetadata(v, storage, path, metadata, indent_json=indent_json) - writeattrs(storage, path, attrs, indent_json=indent_json) + writeattrs(v, storage, path, attrs, indent_json=indent_json) - ZArray{T2, N, typeof(compressor), typeof(storage)}( - metadata, storage, path, attrs, writeable) + ZArray(metadata, storage, path, attrs, writeable) end filterfromtype(::Type{<:Any}) = nothing @@ -436,7 +437,7 @@ function zzeros(T,dims...;kwargs...) data_encoded = compress_raw(as,z) p = z.path for i in chunkindices(z) - z.storage[p,i] = data_encoded + store_writechunk(z.storage, data_encoded, p, i, z.metadata.chunk_encoding) end z end @@ -454,9 +455,9 @@ function Base.resize!(z::ZArray{T,N}, newsize::NTuple{N}) where {T,N} z.metadata.shape[] = newsize #Check if array was shrunk if any(map(<,newsize, oldsize)) - prune_oob_chunks(z.storage,z.path,oldsize,newsize, z.metadata.chunks) + prune_oob_chunks(z.storage, z.path, oldsize, newsize, z.metadata.chunks, z.metadata.chunk_encoding) end - writemetadata(z.storage, z.path, z.metadata) + writemetadata(zarr_format(z), z.storage, z.path, z.metadata) nothing end Base.resize!(z::ZArray, newsize::Integer...) 
= resize!(z,newsize) @@ -504,7 +505,7 @@ function prune_oob_chunks(s::AbstractStore,path,oldsize, newsize, chunks) allchunkranges = map(i->1:fld1(oldsize[i],chunks[i]),1:length(oldsize)) r = (allchunkranges[1:idim-1]..., delrange, allchunkranges[idim+1:end]...) for cI in CartesianIndices(r) - delete!(s,path,cI) + store_deletechunk(s, path, cI, chunk_encoding) end end end diff --git a/src/ZGroup.jl b/src/ZGroup.jl index 031d33fb..f4244172 100644 --- a/src/ZGroup.jl +++ b/src/ZGroup.jl @@ -13,32 +13,29 @@ ZGroup(storage, path::AbstractString, arrays, groups, attrs, writeable) = zname(g::ZGroup) = zname(g.path) + + #Open an existing ZGroup -function ZGroup(s::T,mode="r",path="";fill_as_missing=false) where T <: AbstractStore +function ZGroup(s::T, mode="r", path="", zarr_format=:auto; fill_as_missing=false) where T<:AbstractStore arrays = Dict{String, ZArray}() groups = Dict{String, ZGroup}() - + zv = if zarr_format == :auto + ZarrFormat(s, path) + else + ZarrFormat(zarr_format) + end for d in subdirs(s,path) dshort = split(d,'/')[end] subpath = _concatpath(path,dshort) - if is_zarr2(s, subpath) - # check for zarr2 first - elseif is_zarr3(s, subpath) - s = set_zarr_format(s, 3) - end - if is_zarray(s, subpath) - meta = getmetadata(s, subpath, false) - if dimension_separator(s) != meta.dimension_separator - s = set_dimension_separator(s, meta.dimension_separator) - end - m = zopen_noerr(s,mode,path=_concatpath(path,dshort),fill_as_missing=fill_as_missing) + if is_zarray(zv, s, subpath) + m = zopen_noerr(s, mode, zv, path=_concatpath(path, dshort), fill_as_missing=fill_as_missing) arrays[dshort] = m elseif is_zgroup(s, subpath) - m = zopen_noerr(s,mode,path=_concatpath(path,dshort),fill_as_missing=fill_as_missing) + m = zopen_noerr(s, mode, zv, path=_concatpath(path, dshort), fill_as_missing=fill_as_missing) groups[dshort] = m end end - attrs = getattrs(s,path) + attrs = getattrs(zv, s, path) startswith(path,"/") && error("Paths should never start with a leading '/'") 
ZGroup(s, path, arrays, groups, attrs,mode=="w") end @@ -50,19 +47,20 @@ Works like `zopen` with the single difference that no error is thrown when the path or store does not point to a valid zarr array or group, but nothing is returned instead. """ -function zopen_noerr(s::AbstractStore, mode="r"; +function zopen_noerr(s::AbstractStore, mode, zv::ZarrFormat; consolidated = false, path="", lru = 0, - fill_as_missing) - consolidated && isinitialized(s,".zmetadata") && return zopen(ConsolidatedStore(s, path), mode, path=path,lru=lru,fill_as_missing=fill_as_missing) - if lru !== 0 - error("LRU caches are not supported anymore by the current Zarr version. Please use an earlier version of Zarr for now and open an issue at Zarr.jl if you need this functionality") - end - if is_zarray(s, path) - return ZArray(s,mode,path;fill_as_missing=fill_as_missing) - elseif is_zgroup(s,path) - return ZGroup(s,mode,path;fill_as_missing=fill_as_missing) + fill_as_missing=false) + + consolidated && isinitialized(s, ".zmetadata") && return zopen(ConsolidatedStore(s, path), mode, path=path, lru=lru, fill_as_missing=fill_as_missing) + if lru !== 0 + error("LRU caches are not supported anymore by the current Zarr version. Please use an earlier version of Zarr for now and open an issue at Zarr.jl if you need this functionality") + end + if is_zarray(zv, s, path) + return ZArray(s, mode, path, zv; fill_as_missing=fill_as_missing) + elseif is_zgroup(zv, s, path) + return ZGroup(s, mode, path, zv; fill_as_missing=fill_as_missing) else return nothing end @@ -87,6 +85,7 @@ function Base.getindex(g::ZGroup, k) end end + """ zopen(s::AbstractStore, mode="r"; consolidated = false, path = "", lru = 0) @@ -95,20 +94,29 @@ Zarr will search for a consolidated metadata field as created by the python zarr `consolidate_metadata` function. This can substantially speed up metadata parsing of large zarr groups. 
Setting `lru` to a value > 0 means that chunks that have been accessed before will be cached and consecutive reads will happen from the cache. -Here, `lru` denotes the number of chunks that remain in memory. +Here, `lru` denotes the number of chunks that remain in memory. The expected zarr version +can be supplied through `zarr_format` and defaults to `:auto` which tries to detect +if the zarr version is v2 or v3. """ function zopen(s::AbstractStore, mode="r"; + zarr_format=:auto, consolidated = false, path = "", lru = 0, fill_as_missing = false) - # add interfaces to Stores later - r = zopen_noerr(s,mode; consolidated=consolidated, path=path, lru=lru, fill_as_missing=fill_as_missing) - if r === nothing - throw(ArgumentError("Specified store $s in path $(path) is neither a ZArray nor a ZGroup")) - else - return r - end + + zarr_format = if zarr_format == :auto + ZarrFormat(s, path) + else + ZarrFormat(zarr_format) + end + # add interfaces to Stores later + r = zopen_noerr(s, mode, zarr_format; consolidated=consolidated, path=path, lru=lru, fill_as_missing=fill_as_missing) + if r === nothing + throw(ArgumentError("Specified store $s in path $(path) is neither a ZArray nor a ZGroup")) + else + return r + end end """ @@ -127,21 +135,8 @@ function storefromstring(s, create=true) return storefromstring(t,s,create) end end - if create - return FormattedStore(DirectoryStore(s)), "" - elseif isdir(s) - # parse metadata to determine store kind - temp_store = DirectoryStore(s) - if is_zarr3(temp_store, "") - temp_store = set_zarr_format(temp_store, 3) - end - if is_zarray(temp_store, "") - meta = getmetadata(temp_store, "", false) - store = FormattedStore{meta.zarr_format, meta.dimension_separator}(temp_store) - else - store = FormattedStore(temp_store) - end - return store, "" + if create || isdir(s) + return DirectoryStore(s), "" else throw(ArgumentError("Path $s is not a directory.")) end @@ -152,7 +147,7 @@ end Create a new zgroup in the store `s` """ -function 
zgroup(s::AbstractStore, path::String=""; attrs=Dict(), indent_json::Bool= false) +function zgroup(s::AbstractStore, path::String="", zarr_format=ZarrFormat(2); attrs=Dict(), indent_json::Bool=false) d = Dict("zarr_format"=>DV) isemptysub(s, path) || error("Store is not empty") b = IOBuffer() @@ -164,7 +159,7 @@ function zgroup(s::AbstractStore, path::String=""; attrs=Dict(), indent_json::Bo end s[path,".zgroup"]=take!(b) - writeattrs(s,path,attrs, indent_json=indent_json) + writeattrs(DV, s, path, attrs, indent_json=indent_json) ZGroup(s, path, Dict{String,ZArray}(), Dict{String,ZGroup}(), attrs,true) end diff --git a/src/Zarr.jl b/src/Zarr.jl index 1783bdf9..7e575266 100644 --- a/src/Zarr.jl +++ b/src/Zarr.jl @@ -3,6 +3,14 @@ module Zarr import JSON import Blosc +struct ZarrFormat{V} + version::Val{V} +end +@inline ZarrFormat(v::Int) = ZarrFormat(Val(v)) +ZarrFormat(v::ZarrFormat) = v +#Default Zarr Version +const DV = ZarrFormat(Val(2)) + include("metadata.jl") include("metadata3.jl") include("Compressors/Compressors.jl") diff --git a/src/metadata.jl b/src/metadata.jl index ae01ae56..276e9e25 100644 --- a/src/metadata.jl +++ b/src/metadata.jl @@ -86,6 +86,11 @@ function typestr(s::AbstractString, filterlist=nothing) end end +struct ChunkEncoding + sep::Char + prefix::Bool +end + """Metadata configuration of the stored array Each array requires essential configuration metadata to be stored, enabling correct @@ -97,16 +102,15 @@ value of the ".zarray" key within an array store. 
* N - dimensionality of the array * C - compressor * F - filters -* S - dimension separator # See Also https://zarr.readthedocs.io/en/stable/spec/v2.html#metadata """ -abstract type AbstractMetadata{T, N, C, F, S} end +abstract type AbstractMetadata{T,N,C,F} end """Metadata for Zarr version 2 arrays""" -struct MetadataV2{T, N, C, F, S} <: AbstractMetadata{T, N, C, F, S} +struct MetadataV2{T,N,C,F} <: AbstractMetadata{T,N,C,F} zarr_format::Int node_type::String shape::Base.RefValue{NTuple{N, Int}} @@ -116,42 +120,19 @@ struct MetadataV2{T, N, C, F, S} <: AbstractMetadata{T, N, C, F, S} fill_value::Union{T, Nothing} order::Char filters::F # not yet supported - function MetadataV2{T2, N, C, F, S}(zarr_format, node_type, shape, chunks, dtype, compressor, fill_value, order, filters) where {T2,N,C,F,S} + chunk_encoding::ChunkEncoding + function MetadataV2{T2,N,C,F}(zarr_format, node_type, shape, chunks, dtype, compressor, fill_value, order, filters, chunk_encoding) where {T2,N,C,F} zarr_format == 2 || throw(ArgumentError("MetadataV2 only functions if zarr_format == 2")) #Do some sanity checks to make sure we have a sane array any(<(0), shape) && throw(ArgumentError("Size must be positive")) any(<(1), chunks) && throw(ArgumentError("Chunk size must be >= 1 along each dimension")) order === 'C' || throw(ArgumentError("Currently only 'C' storage order is supported")) - new{T2, N, C, F, S}(zarr_format, node_type, Base.RefValue{NTuple{N,Int}}(shape), chunks, dtype, compressor,fill_value, order, filters) - end - function MetadataV2{T2, N, C, F}( - zarr_format, - node_type, - shape, - chunks, - dtype, - compressor, - fill_value, - order, - filters, - dimension_separator::Char = '.' 
- ) where {T2,N,C,F} - return MetadataV2{T2, N, C, F, dimension_separator}( - zarr_format, - node_type, - shape, - chunks, - dtype, - compressor, - fill_value, - order, - filters - ) + new{T2,N,C,F}(zarr_format, node_type, Base.RefValue{NTuple{N,Int}}(shape), chunks, dtype, compressor, fill_value, order, filters, chunk_encoding) end end """Metadata for Zarr version 3 arrays""" -struct MetadataV3{T, N, C, F, S} <: AbstractMetadata{T, N, C, F, S} +struct MetadataV3{T,N,C,F} <: AbstractMetadata{T,N,C,F} zarr_format::Int node_type::String shape::Base.RefValue{NTuple{N, Int}} @@ -161,29 +142,20 @@ struct MetadataV3{T, N, C, F, S} <: AbstractMetadata{T, N, C, F, S} fill_value::Union{T, Nothing} order::Char filters::F # not yet supported - function MetadataV3{T2, N, C, F, S}(zarr_format, node_type, shape, chunks, dtype, compressor, fill_value, order, filters) where {T2,N,C,F,S} + chunk_encoding::ChunkEncoding + function MetadataV3{T2,N,C,F}(zarr_format, node_type, shape, chunks, dtype, compressor, fill_value, order, filters, chunk_encoding) where {T2,N,C,F} zarr_format == 3 || throw(ArgumentError("MetadataV3 only functions if zarr_format == 3")) #Do some sanity checks to make sure we have a sane array any(<(0), shape) && throw(ArgumentError("Size must be positive")) any(<(1), chunks) && throw(ArgumentError("Chunk size must be >= 1 along each dimension")) order === 'C' || throw(ArgumentError("Currently only 'C' storage order is supported")) - new{T2, N, C, F, S}(zarr_format, node_type, Base.RefValue{NTuple{N,Int}}(shape), chunks, dtype, compressor,fill_value, order, filters) + new{T2,N,C,F}(zarr_format, node_type, Base.RefValue{NTuple{N,Int}}(shape), chunks, dtype, compressor, fill_value, order, filters, chunk_encoding) end end # Type alias for backward compatibility const Metadata = AbstractMetadata -const DimensionSeparatedMetadata{S} = AbstractMetadata{<: Any, <: Any, <: Any, <: Any, S} - -function Base.getproperty(m::DimensionSeparatedMetadata{S}, name::Symbol) where S 
- if name == :dimension_separator - return S - end - return getfield(m, name) -end -Base.propertynames(m::AbstractMetadata) = (fieldnames(typeof(m))..., :dimension_separator) - #To make unit tests pass with ref shape import Base.== function ==(m1::AbstractMetadata, m2::AbstractMetadata) @@ -196,13 +168,12 @@ function ==(m1::AbstractMetadata, m2::AbstractMetadata) m1.fill_value == m2.fill_value && m1.order == m2.order && m1.filters == m2.filters && - m1.dimension_separator == m2.dimension_separator + m1.chunk_encoding == m2.chunk_encoding end "Construct Metadata based on your data" -function Metadata(A::AbstractArray{T, N}, chunks::NTuple{N, Int}; - zarr_format::Integer=2, +function Metadata(A::AbstractArray{T,N}, chunks::NTuple{N,Int}, zarr_format=DV; node_type::String="array", compressor::C=BloscCompressor(), fill_value::Union{T, Nothing}=nothing, @@ -211,29 +182,29 @@ function Metadata(A::AbstractArray{T, N}, chunks::NTuple{N, Int}; fill_as_missing = false, dimension_separator::Char = '.' ) where {T, N, C} - return Metadata(A, chunks, Val(zarr_format); + return Metadata(A, chunks, ZarrFormat(zarr_format); node_type=node_type, compressor=compressor, fill_value=fill_value, order=order, filters=filters, fill_as_missing=fill_as_missing, - dimension_separator=dimension_separator + chunk_encoding=ChunkEncoding(dimension_separator, default_prefix(ZarrFormat(zarr_format))) ) end # V2 constructor -function Metadata(A::AbstractArray{T, N}, chunks::NTuple{N, Int}, ::Val{2}; +function Metadata(A::AbstractArray{T,N}, chunks::NTuple{N,Int}, ::ZarrFormat{2}; node_type::String="array", compressor::C=BloscCompressor(), fill_value::Union{T, Nothing}=nothing, order::Char='C', filters::F=nothing, fill_as_missing = false, - dimension_separator::Char = '.' + chunk_encoding=ChunkEncoding('.', false) ) where {T, N, C, F} T2 = (fill_value === nothing || !fill_as_missing) ? 
T : Union{T,Missing} - MetadataV2{T2, N, C, typeof(filters), dimension_separator}( + MetadataV2{T2,N,C,typeof(filters)}( 2, node_type, size(A), @@ -242,18 +213,19 @@ function Metadata(A::AbstractArray{T, N}, chunks::NTuple{N, Int}, ::Val{2}; compressor, fill_value, order, - filters + filters, + chunk_encoding, ) end -function Metadata(A::AbstractArray{T, N}, chunks::NTuple{N, Int}, ::Val{3}; +function Metadata(A::AbstractArray{T,N}, chunks::NTuple{N,Int}, ::ZarrFormat{3}; node_type::String="array", compressor::C=BloscCompressor(), fill_value::Union{T, Nothing}=nothing, order::Char='C', filters::F=nothing, fill_as_missing = false, - dimension_separator::Char = '.' + chunk_encoding::ChunkEncoding=ChunkEncoding('/', true) ) where {T, N, C, F} return Metadata3(A, chunks; node_type=node_type, @@ -262,7 +234,7 @@ function Metadata(A::AbstractArray{T, N}, chunks::NTuple{N, Int}, ::Val{3}; order=order, filters=filters, fill_as_missing=fill_as_missing, - dimension_separator=dimension_separator + chunk_encoding=chunk_encoding ) end @@ -271,17 +243,12 @@ Metadata(s::Union{AbstractString, IO}, fill_as_missing) = Metadata(JSON.parse(s; "Construct Metadata from Dict" function Metadata(d::AbstractDict, fill_as_missing) zarr_format = d["zarr_format"]::Int - if zarr_format == 2 - return Metadata(d, fill_as_missing, Val(2)) - elseif zarr_format == 3 - return Metadata(d, fill_as_missing, Val(3)) - else - throw(ArgumentError("Zarr.jl currently only supports v2 or v3 of the specification")) - end + zarr_format ∉ (2, 3) && throw(ArgumentError("Zarr.jl currently only supports v2 or v3 of the specification")) + return Metadata(d, fill_as_missing, ZarrFormat(zarr_format)) end # V2 constructor from Dict -function Metadata(d::AbstractDict, fill_as_missing, ::Val{2}) +function Metadata(d::AbstractDict, fill_as_missing, ::ZarrFormat{2}) # Zarr v2 metadata is only for arrays node_type = "array" @@ -305,9 +272,9 @@ function Metadata(d::AbstractDict, fill_as_missing, ::Val{2}) TU = (fv === 
nothing || !fill_as_missing) ? T : Union{T,Missing} - S = only(get(d, "dimension_separator", '.')) + dim_sep = only(get(d, "dimension_separator", '.')) - MetadataV2{TU, N, C, F, S}( + MetadataV2{TU,N,C,F}( d["zarr_format"], node_type, NTuple{N, Int}(d["shape"]) |> reverse, @@ -317,11 +284,12 @@ function Metadata(d::AbstractDict, fill_as_missing, ::Val{2}) fv, first(d["order"]), filters, + ChunkEncoding(dim_sep, false), ) end # V3 constructor from Dict - delegate to metadata3.jl -function Metadata(d::AbstractDict, fill_as_missing, ::Val{3}) +function Metadata(d::AbstractDict, fill_as_missing, ::ZarrFormat{3}) return Metadata3(d, fill_as_missing) end @@ -337,7 +305,7 @@ function JSON.lower(md::MetadataV2) "fill_value" => fill_value_encoding(md.fill_value), "order" => md.order, "filters" => md.filters, - "dimension_separator" => md.dimension_separator + "dimension_separator" => md.chunk_encoding.sep ) end diff --git a/src/metadata3.jl b/src/metadata3.jl index 5a45c69e..2765db55 100644 --- a/src/metadata3.jl +++ b/src/metadata3.jl @@ -64,7 +64,7 @@ function Metadata3(d::AbstractDict, fill_as_missing) end end - return MetadataV3{Int,0,Nothing,Nothing,'/'}(zarr_format, node_type, (), (), "", nothing, 0, 'C', nothing) + return MetadataV3{Int,0,Nothing,Nothing}(zarr_format, node_type, (), (), "", nothing, 0, 'C', nothing, ChunkEncoding('/', true)) end # Array keys @@ -110,9 +110,7 @@ function Metadata3(d::AbstractDict, fill_as_missing) # Chunk Key Encoding chunk_key_encoding = d["chunk_key_encoding"] - if chunk_key_encoding["name"] == "default" - elseif chunk_key_encoding["name"] == "v2" - else + if chunk_key_encoding["name"] ∉ ("default", "v2") throw(ArgumentError("Unknown chunk_key_encoding of name, $(chunk_key_encoding["name"])")) end @@ -213,9 +211,9 @@ function Metadata3(d::AbstractDict, fill_as_missing) # V2 uses '.' 
while default CKE uses '/' by default if chunk_key_encoding["name"] == "v2" separator = only(get(cke_configuration, "separator", '.')) - S = V2ChunkKeyEncoding{separator}() + chunk_encoding = ChunkEncoding(separator, false) elseif chunk_key_encoding["name"] == "default" - S = only(get(cke_configuration, "separator", '/')) + chunk_encoding = ChunkEncoding(only(get(cke_configuration, "separator", '/')), true) end MetadataV3{TU, N, C, F, S}( @@ -228,6 +226,7 @@ function Metadata3(d::AbstractDict, fill_as_missing) fv, order, filters, + chunk_encoding, ) end @@ -246,7 +245,7 @@ function Metadata3(A::AbstractArray{T, N}, chunks::NTuple{N, Int}; if fill_value === nothing fill_value = zero(T) end - MetadataV3{T2, N, C, typeof(filters), dimension_separator}( + MetadataV3{T2,N,C,typeof(filters)}( 3, node_type, size(A), @@ -255,7 +254,8 @@ function Metadata3(A::AbstractArray{T, N}, chunks::NTuple{N, Int}; compressor, fill_value, order, - filters + filters, + ChunkEncoding(dimension_separator, true) ) end diff --git a/test/runtests.jl b/test/runtests.jl index b705c8cd..f5342c5c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -12,9 +12,7 @@ using Dates @testset "ZArray" begin @testset "fields" begin z = zzeros(Int64, 2, 3) - @test z isa ZArray{Int64, 2, Zarr.BloscCompressor, - Zarr.FormattedStore{2, '.', Zarr.DictStore}} - + @test z isa ZArray{Int64,2,Zarr.DictStore,Zarr.MetadataV2{Int64,2,Zarr.BloscCompressor,Nothing}} @test :a ∈ propertynames(z.storage) @test length(z.storage.a) === 3 @test length(z.storage.a["0.0"]) === 64 @@ -32,8 +30,7 @@ using Dates @test z.metadata.compressor.shuffle === 1 @test z.attrs == Dict{Any, Any}() @test z.writeable === true - @test z.metadata.dimension_separator === Zarr.DS - @test :dimension_separator ∈ propertynames(z.metadata) + @test z.metadata.chunk_encoding === Zarr.ChunkEncoding(Zarr.default_sep(Zarr.DV), Zarr.default_prefix(Zarr.DV)) @test_throws ArgumentError zzeros(Int64,2,3, chunks = (0,1)) @test_throws ArgumentError 
zzeros(Int64,0,-1) @test_throws ArgumentError Zarr.Metadata(zeros(2,2), (2,2), order = 'F') @@ -41,9 +38,7 @@ using Dates @testset "methods" begin z = zzeros(Int64, 2, 3) - @test z isa ZArray{Int64, 2, Zarr.BloscCompressor, - Zarr.FormattedStore{2, '.', Zarr.DictStore}} - + @test z isa ZArray{Int64,2,Zarr.DictStore,Zarr.MetadataV2{Int64,2,Zarr.BloscCompressor,Nothing}} @test eltype(z) === Int64 @test ndims(z) === 2 @test size(z) === (2, 3) @@ -64,7 +59,7 @@ using Dates compressor=Zarr.NoCompressor()) @test z.metadata.compressor === Zarr.NoCompressor() - @test z.storage === Zarr.FormattedStore{2 ,'.'}(Zarr.DirectoryStore("$dir/$name")) + @test z.storage === Zarr.DirectoryStore("$dir/$name") @test isdir("$dir/$name") @test ispath("$dir/$name/.zarray") @test ispath("$dir/$name/.zattrs") @@ -94,8 +89,8 @@ end store = DirectoryStore(tempname()) g = zgroup(store,"mygroup") g2 = zgroup(g,"asubgroup",attrs = Dict("a1"=>5)) - @test Zarr.is_zgroup(store,"mygroup") - @test Zarr.is_zgroup(store,"mygroup/asubgroup") + @test Zarr.is_zgroup(Zarr.DV, store, "mygroup") + @test Zarr.is_zgroup(Zarr.DV, store, "mygroup/asubgroup") @test g2.attrs["a1"]==5 @test isdir(joinpath(store.folder,"mygroup")) @test isdir(joinpath(store.folder,"mygroup","asubgroup")) @@ -183,7 +178,7 @@ end @test all(ismissing,amiss[:,2]) @test all(i->isequal(i...),zip(amiss[1:3,4],[1,missing,3])) # Test that chunk containing only missings is not initialized - @test !Zarr.isinitialized(amiss.storage,Zarr.citostring(CartesianIndex((1,5)))) + @test !Zarr.isinitialized(amiss.storage, Zarr.citostring(Zarr.ChunkEncoding('/', false), CartesianIndex((1, 5)))) # amiss = zcreate(Int64, 10,10,chunks=(5,2), fill_value=-1, fill_as_missing=false) amiss[:,1] = 1:10 @@ -195,7 +190,7 @@ end @test all(==(-1),amiss[:,2]) @test all(i->isequal(i...),zip(amiss[1:3,4],[1,-1,3])) # Test that chunk containing only fill values is not initialized - @test !Zarr.isinitialized(amiss.storage,Zarr.citostring(CartesianIndex((1,5)))) + @test 
!Zarr.isinitialized(amiss.storage, Zarr.citostring(Zarr.ChunkEncoding('/', false), CartesianIndex((1, 5)))) end @testset "resize" begin From 7f8b536aaf9841e4e071f076b2d636a0e4018020 Mon Sep 17 00:00:00 2001 From: Fabian Gans Date: Fri, 28 Nov 2025 11:34:12 +0100 Subject: [PATCH 38/39] fix v2 tests --- src/Storage/Storage.jl | 30 +------------- src/ZArray.jl | 4 +- src/ZGroup.jl | 8 +++- src/Zarr.jl | 2 + src/chunkencoding.jl | 27 ++++++++++++ src/metadata.jl | 10 ++--- src/metadata3.jl | 2 +- test/storage.jl | 94 +++++++++++++++++++++--------------------- 8 files changed, 93 insertions(+), 84 deletions(-) create mode 100644 src/chunkencoding.jl diff --git a/src/Storage/Storage.jl b/src/Storage/Storage.jl index a816728c..0543bb2e 100644 --- a/src/Storage/Storage.jl +++ b/src/Storage/Storage.jl @@ -78,34 +78,7 @@ Returns the keys of files in the given store. """ function subkeys end -# Default Zarr v2 separator -const DS2 = '.' -# Default Zarr v3 separator -const DS3 = '/' - -default_sep(::ZarrFormat{2}) = DS2 -default_sep(::ZarrFormat{3}) = DS3 -default_prefix(::ZarrFormat{2}) = false -default_prefix(::ZarrFormat{3}) = true -const DS = default_sep(DV) - -ZarrFormat(s::AbstractStore, path) = is_zarr2(s, path) ? ZarrFormat(2) : - is_zarr3(s, path) ? ZarrFormat(3) : - throw(ArgumentError("Specified store $s in path $(path) is neither a ZArray nor a ZGroup in a recognized zarr format.")) - - -@inline function citostring(e::ChunkEncoding, i::CartesianIndex) - if e.prefix - "c$(e.sep)" * join(reverse((i - oneunit(i)).I), e.sep) - else - join(reverse((i - oneunit(i)).I), e.sep) - end -end -@inline citostring(e::ChunkEncoding, ::CartesianIndex{0}) = e.prefix ? "c$(e.sep)0" : "0" - -_concatpath(p,s) = isempty(p) ? 
s : rstrip(p,'/') * '/' * s - -# Function to read a chunk from store s +# Function to construct the full path to a chunk given the base path, Cartesian Index i, and the chunk ecoding store_readchunk(s::AbstractStore, p, i::CartesianIndex, e::ChunkEncoding) = s[p, citostring(e, i)] store_deletechunk(s::AbstractStore, p, i::CartesianIndex, e::ChunkEncoding) = delete!(s, p, citostring(e, i)) store_writechunk(s::AbstractStore, v, p, i::CartesianIndex, e::ChunkEncoding) = s[p, citostring(e, i)] = v @@ -119,7 +92,6 @@ Base.haskey(s::AbstractStore, k::AbstractString) = isinitialized(s, k) Base.setindex!(s::AbstractStore, v, p, i::AbstractString) = setindex!(s, v, _concatpath(p, i)) - maybecopy(x) = copy(x) maybecopy(x::String) = x diff --git a/src/ZArray.jl b/src/ZArray.jl index 67817897..ca687f25 100644 --- a/src/ZArray.jl +++ b/src/ZArray.jl @@ -93,7 +93,7 @@ nobytes(z::ZArray{<:String}) = "unknown" zinfo(z::ZArray) = zinfo(stdout,z) function zinfo(io::IO,z::ZArray) ninit = sum(chunkindices(z)) do i - isinitialized(z.storage,z.path,i) + store_isinitialized(z.storage, z.path, i, z.metadata.chunk_encoding) end allinfos = [ "Type" => "ZArray", @@ -498,7 +498,7 @@ function Base.append!(z::ZArray{<:Any, N},a;dims = N) where N nothing end -function prune_oob_chunks(s::AbstractStore,path,oldsize, newsize, chunks) +function prune_oob_chunks(s::AbstractStore, path, oldsize, newsize, chunks, chunk_encoding) dimstoshorten = findall(map(<,newsize, oldsize)) for idim in dimstoshorten delrange = (fld1(newsize[idim],chunks[idim])+1):(fld1(oldsize[idim],chunks[idim])) diff --git a/src/ZGroup.jl b/src/ZGroup.jl index f4244172..a85bc3c8 100644 --- a/src/ZGroup.jl +++ b/src/ZGroup.jl @@ -40,6 +40,12 @@ function ZGroup(s::T, mode="r", path="", zarr_format=:auto; fill_as_missing=fals ZGroup(s, path, arrays, groups, attrs,mode=="w") end +#Function to guess a Zarr format from a store and a path, useful for guessing format when trying to open a group/array +ZarrFormat(s::AbstractStore, path) = 
is_zarr2(s, path) ? ZarrFormat(2) : + is_zarr3(s, path) ? ZarrFormat(3) : + throw(ArgumentError("Specified store $s in path $(path) is neither a ZArray nor a ZGroup in a recognized zarr format.")) + + """ zopen_noerr(AbstractStore, mode = "r"; consolidated = false) @@ -148,7 +154,7 @@ end Create a new zgroup in the store `s` """ function zgroup(s::AbstractStore, path::String="", zarr_format=ZarrFormat(2); attrs=Dict(), indent_json::Bool=false) - d = Dict("zarr_format"=>DV) + d = Dict("zarr_format" => Int(DV)) isemptysub(s, path) || error("Store is not empty") b = IOBuffer() diff --git a/src/Zarr.jl b/src/Zarr.jl index 7e575266..40196c96 100644 --- a/src/Zarr.jl +++ b/src/Zarr.jl @@ -6,11 +6,13 @@ import Blosc struct ZarrFormat{V} version::Val{V} end +Base.Int(v::ZarrFormat{V}) where V = V @inline ZarrFormat(v::Int) = ZarrFormat(Val(v)) ZarrFormat(v::ZarrFormat) = v #Default Zarr Version const DV = ZarrFormat(Val(2)) +include("chunkencoding.jl") include("metadata.jl") include("metadata3.jl") include("Compressors/Compressors.jl") diff --git a/src/chunkencoding.jl b/src/chunkencoding.jl new file mode 100644 index 00000000..911cd36b --- /dev/null +++ b/src/chunkencoding.jl @@ -0,0 +1,27 @@ + +struct ChunkEncoding + sep::Char + prefix::Bool +end + +# Default Zarr v2 separator +const DS2 = '.' +# Default Zarr v3 separator +const DS3 = '/' + +default_sep(::ZarrFormat{2}) = DS2 +default_sep(::ZarrFormat{3}) = DS3 +default_prefix(::ZarrFormat{2}) = false +default_prefix(::ZarrFormat{3}) = true +const DS = default_sep(DV) + +@inline function citostring(e::ChunkEncoding, i::CartesianIndex) + if e.prefix + "c$(e.sep)" * join(reverse((i - oneunit(i)).I), e.sep) + else + join(reverse((i - oneunit(i)).I), e.sep) + end +end +@inline citostring(e::ChunkEncoding, ::CartesianIndex{0}) = e.prefix ? "c$(e.sep)0" : "0" + +_concatpath(p,s) = isempty(p) ? 
s : rstrip(p,'/') * '/' * s diff --git a/src/metadata.jl b/src/metadata.jl index 276e9e25..da94b046 100644 --- a/src/metadata.jl +++ b/src/metadata.jl @@ -86,10 +86,6 @@ function typestr(s::AbstractString, filterlist=nothing) end end -struct ChunkEncoding - sep::Char - prefix::Bool -end """Metadata configuration of the stored array @@ -108,6 +104,8 @@ value of the ".zarray" key within an array store. https://zarr.readthedocs.io/en/stable/spec/v2.html#metadata """ abstract type AbstractMetadata{T,N,C,F} end +Base.ndims(::AbstractMetadata{<:Any,N}) where N = N + """Metadata for Zarr version 2 arrays""" struct MetadataV2{T,N,C,F} <: AbstractMetadata{T,N,C,F} @@ -130,6 +128,7 @@ struct MetadataV2{T,N,C,F} <: AbstractMetadata{T,N,C,F} new{T2,N,C,F}(zarr_format, node_type, Base.RefValue{NTuple{N,Int}}(shape), chunks, dtype, compressor, fill_value, order, filters, chunk_encoding) end end +zarr_format(::MetadataV2) = ZarrFormat(Val(2)) """Metadata for Zarr version 3 arrays""" struct MetadataV3{T,N,C,F} <: AbstractMetadata{T,N,C,F} @@ -152,6 +151,7 @@ struct MetadataV3{T,N,C,F} <: AbstractMetadata{T,N,C,F} new{T2,N,C,F}(zarr_format, node_type, Base.RefValue{NTuple{N,Int}}(shape), chunks, dtype, compressor, fill_value, order, filters, chunk_encoding) end end +zarr_format(::MetadataV3) = ZarrFormat(Val(3)) # Type alias for backward compatibility const Metadata = AbstractMetadata @@ -296,7 +296,7 @@ end "Describes how to lower Metadata to JSON, used in json(::Metadata)" function JSON.lower(md::MetadataV2) Dict{String, Any}( - "zarr_format" => md.zarr_format, + "zarr_format" => Int(md.zarr_format), "node_type" => md.node_type, "shape" => md.shape[] |> reverse, "chunks" => md.chunks |> reverse, diff --git a/src/metadata3.jl b/src/metadata3.jl index 2765db55..c7b3caf3 100644 --- a/src/metadata3.jl +++ b/src/metadata3.jl @@ -325,7 +325,7 @@ function lower3(md::MetadataV3{T}) where T end Dict{String, Any}( - "zarr_format" => md.zarr_format, + "zarr_format" => Int(md.zarr_format), 
"node_type" => md.node_type, "shape" => md.shape[] |> reverse, "data_type" => typestr3(T), diff --git a/test/storage.jl b/test/storage.jl index 1a06d405..ad1ee394 100644 --- a/test/storage.jl +++ b/test/storage.jl @@ -15,31 +15,27 @@ using AWSS3 end @testset "Version and Dimension Separator" begin - v2cke_period = Zarr.V2ChunkKeyEncoding{'.'} - v2cke_slash = Zarr.V2ChunkKeyEncoding{'/'} + dot_noprefix = Zarr.ChunkEncoding('.', false) + dot_prefix = Zarr.ChunkEncoding('.', true) + slash_noprefix = Zarr.ChunkEncoding('/', false) + slash_prefix = Zarr.ChunkEncoding('/', true) let ci = CartesianIndex() - @test Zarr.citostring(ci, 2, '.') == "0" - @test Zarr.citostring(ci, 2, '/') == "0" - @test Zarr.citostring(ci, 3, v2cke_period) == "0" - @test Zarr.citostring(ci, 3, v2cke_slash) == "0" - @test Zarr.citostring(ci, 3, '.') == "c.0" - @test Zarr.citostring(ci, 3, '/') == "c/0" + @test Zarr.citostring(dot_noprefix, ci) == "0" + @test Zarr.citostring(dot_prefix, ci) == "c.0" + @test Zarr.citostring(slash_noprefix, ci) == "0" + @test Zarr.citostring(slash_prefix, ci) == "c/0" end let ci = CartesianIndex(1,1,1) - @test Zarr.citostring(ci, 2, '.') == "0.0.0" - @test Zarr.citostring(ci, 2, '/') == "0/0/0" - @test Zarr.citostring(ci, 3, v2cke_period) == "0.0.0" - @test Zarr.citostring(ci, 3, v2cke_slash) == "0/0/0" - @test Zarr.citostring(ci, 3, '.') == "c.0.0.0" - @test Zarr.citostring(ci, 3, '/') == "c/0/0/0" + @test Zarr.citostring(dot_noprefix, ci) == "0.0.0" + @test Zarr.citostring(dot_prefix, ci) == "c.0.0.0" + @test Zarr.citostring(slash_noprefix, ci) == "0/0/0" + @test Zarr.citostring(slash_prefix, ci) == "c/0/0/0" end let ci = CartesianIndex(1,3,5) - @test Zarr.citostring(ci, 2, '.') == "4.2.0" - @test Zarr.citostring(ci, 2, '/') == "4/2/0" - @test Zarr.citostring(ci, 3, v2cke_period) == "4.2.0" - @test Zarr.citostring(ci, 3, v2cke_slash) == "4/2/0" - @test Zarr.citostring(ci, 3, '.') == "c.4.2.0" - @test Zarr.citostring(ci, 3, '/') == "c/4/2/0" + @test 
Zarr.citostring(dot_noprefix, ci) == "4.2.0" + @test Zarr.citostring(dot_prefix, ci) == "c.4.2.0" + @test Zarr.citostring(slash_noprefix, ci) == "4/2/0" + @test Zarr.citostring(slash_prefix, ci) == "c/4/2/0" end end @@ -47,42 +43,44 @@ end Function to test the interface of AbstractStore. Every complete implementation should pass this test. """ function test_store_common(ds::Zarr.AbstractStore) - @test !Zarr.is_zgroup(ds,"") + V = Zarr.DV + enc = Zarr.ChunkEncoding(Zarr.default_sep(V), Zarr.default_prefix(V)) + + @test !Zarr.is_zgroup(V, ds, "") ds[".zgroup"]=rand(UInt8,50) @test haskey(ds,".zgroup") - @test Zarr.is_zgroup(ds,"") - @test !Zarr.is_zarray(ds,"") + @test Zarr.is_zgroup(V, ds, "") + @test !Zarr.is_zarray(V, ds, "") @test isempty(Zarr.subdirs(ds,"")) @test sort(collect(Zarr.subkeys(ds,"")))==[".zgroup"] #Create a subgroup - @test !Zarr.is_zarray(ds,"bar") + @test !Zarr.is_zarray(V, ds, "bar") ds["bar/.zarray"] = rand(UInt8,50) - @test Zarr.is_zarray(ds,"bar") + @test Zarr.is_zarray(V, ds, "bar") @test Zarr.subdirs(ds,"") == ["bar"] @test Zarr.subdirs(ds,"bar") == String[] #Test getindex and setindex data = rand(UInt8,50) - V = Zarr.zarr_format(ds) - S = Zarr.dimension_separator(ds) - first_ci_str = Zarr.citostring(CartesianIndex(1,1,1), V, S) - second_ci_str = Zarr.citostring(CartesianIndex(2,1,1), V, S) + + first_ci_str = Zarr.citostring(enc, CartesianIndex(1, 1, 1)) + second_ci_str = Zarr.citostring(enc, CartesianIndex(2, 1, 1)) ds["bar/" * first_ci_str] = data @test ds["bar/0.0.0"]==data @test Zarr.storagesize(ds,"bar")==50 @test Zarr.isinitialized(ds,"bar/" * first_ci_str) @test !Zarr.isinitialized(ds,"bar/" * second_ci_str) - Zarr.writeattrs(ds,"bar",Dict("a"=>"b")) - @test Zarr.getattrs(ds,"bar")==Dict("a"=>"b") + Zarr.writeattrs(V, ds, "bar", Dict("a" => "b")) + @test Zarr.getattrs(V, ds, "bar") == Dict("a" => "b") delete!(ds,"bar/" * first_ci_str) - @test !Zarr.isinitialized(ds,"bar",CartesianIndex((1,1,1))) + @test !Zarr.store_isinitialized(ds, 
"bar", CartesianIndex((1, 1, 1)), enc) @test !Zarr.isinitialized(ds,"bar/" * first_ci_str) ds["bar/" * first_ci_str] = data - @test !Zarr.isinitialized(ds, "bar", CartesianIndex(0,0,0)) - @test Zarr.isinitialized(ds, "bar", CartesianIndex(1,1,1)) + @test !Zarr.store_isinitialized(ds, "bar", CartesianIndex(0, 0, 0), enc) + @test Zarr.store_isinitialized(ds, "bar", CartesianIndex(1, 1, 1), enc) #Add tests for empty storage @test Zarr.isemptysub(ds,"ba") @test Zarr.isemptysub(ds,"ba/") @@ -98,9 +96,11 @@ Function to test the interface of a read only AbstractStore. Every complete impl `closer` is a function that gets called to close the read only store. """ function test_read_only_store_common(converter, closer=Returns(nothing)) + V = Zarr.DV + enc = Zarr.ChunkEncoding(Zarr.default_sep(V), Zarr.default_prefix(V)) ds = Zarr.DictStore() rs = converter(ds) - @test !Zarr.is_zgroup(rs,"") + @test !Zarr.is_zgroup(V, rs, "") closer(rs) ds[".zgroup"]=rand(UInt8,50) @@ -108,20 +108,20 @@ function test_read_only_store_common(converter, closer=Returns(nothing)) @test haskey(rs,".zgroup") - @test Zarr.is_zgroup(rs,"") - @test !Zarr.is_zarray(rs,"") + @test Zarr.is_zgroup(V, rs, "") + @test !Zarr.is_zarray(V, rs, "") @test isempty(Zarr.subdirs(rs,"")) @test sort(collect(Zarr.subkeys(rs,"")))==[".zgroup"] #Create a subgroup - @test !Zarr.is_zarray(rs,"bar") + @test !Zarr.is_zarray(V, rs, "bar") closer(rs) ds["bar/.zarray"] = rand(UInt8,50) rs = converter(ds) - @test Zarr.is_zarray(rs,"bar") + @test Zarr.is_zarray(V, rs, "bar") @test Zarr.subdirs(rs,"") == ["bar"] @test Zarr.subdirs(rs,"bar") == String[] #Test getindex and setindex @@ -137,16 +137,16 @@ function test_read_only_store_common(converter, closer=Returns(nothing)) @test !Zarr.isinitialized(rs,"bar/0.0.1") closer(rs) - Zarr.writeattrs(ds,"bar",Dict("a"=>"b")) + Zarr.writeattrs(V, ds, "bar", Dict("a" => "b")) rs = converter(ds) - @test Zarr.getattrs(rs,"bar")==Dict("a"=>"b") + @test Zarr.getattrs(V, rs, "bar") == Dict("a" => 
"b") closer(rs) delete!(ds,"bar/0.0.0") rs = converter(ds) - @test !Zarr.isinitialized(rs,"bar",CartesianIndex((0,0,0))) + @test !Zarr.store_isinitialized(rs, "bar", CartesianIndex((0, 0, 0)), enc) @test !Zarr.isinitialized(rs,"bar/0.0.0") closer(rs) @@ -213,10 +213,11 @@ end end @testset "AWS S3 Storage" begin + V = Zarr.DV @info "Testing AWS S3 storage" AWSS3.AWS.global_aws_config(AWSS3.AWS.AWSConfig(creds=nothing, region="us-west-2")) S3, p = Zarr.storefromstring("s3://mur-sst/zarr-v1") - @test Zarr.is_zgroup(S3, p) + @test Zarr.is_zgroup(V, S3, p) @test storagesize(S3, p) == 10551 S3group = zopen(S3,path=p) S3Array = S3group["time"] @@ -276,9 +277,10 @@ end g = zgroup(s, attrs = Dict("groupatt"=>5)) a = zcreate(Int,g,"a",10,20,chunks=(5,5),attrs=Dict("arratt"=>2.5),fill_value = -1) @async HTTP.serve(Zarr.zarr_req_handler(s,g.path,403),ip,port,server=server) - g3 = zopen("http://$ip:$port") - @test_throws "Received error code 403" g3["a"][:,:] - Zarr.missing_chunk_return_code!(g3.storage,403) + httpstore = Zarr.ConsolidatedStore(Zarr.HTTPStore("http://$ip:$port"), "") + @test_throws "Received error code 403" zopen(httpstore) + Zarr.missing_chunk_return_code!(httpstore, 403) + g3 = zopen(httpstore) @test all(==(-1),g3["a"][:,:]) close(server) end From a44fc981c6ab038bd3859ecb60b7555a9d841565 Mon Sep 17 00:00:00 2001 From: Fabian Gans Date: Fri, 28 Nov 2025 11:34:36 +0100 Subject: [PATCH 39/39] remove FormattedStore --- src/Storage/formattedstore.jl | 234 ---------------------------------- 1 file changed, 234 deletions(-) delete mode 100644 src/Storage/formattedstore.jl diff --git a/src/Storage/formattedstore.jl b/src/Storage/formattedstore.jl deleted file mode 100644 index d908b340..00000000 --- a/src/Storage/formattedstore.jl +++ /dev/null @@ -1,234 +0,0 @@ -# Default Zarr version -const DV = 2 - -# Default Zarr separator - -# Default Zarr v2 separator -const DS2 = '.' -# Default Zarr v3 separator -const DS3 = '/' - -default_sep(version) = version == 2 ? 
DS2 : - version == 3 ? DS3 : - error("Unknown version: $version") -const DS = default_sep(DV) - -# # Chunk Key Encodings for Zarr v3 -# # A Char is the separator for the default chunk key encoding -# abstract type ChunkKeyEncoding end -struct V2ChunkKeyEncoding <: ChunkKeyEncoding - sep::Char -end - -struct V3ChunkKeyEncoding <: ChunkKeyEncoding - sep::Char -end - -# """ -# FormattedStore{V,CKE,STORE <: AbstractStore} <: AbstractStore - -# FormattedStore wraps an AbstractStore to indicate a specific Zarr format. -# The path of a chunk depends on the version and chunk key encoding. - -# # Type Parameters - -# - V: Zarr format version -# - CKE: Chunk key encoding or dimension separator. -# CKE could be a `Char` or a subtype of `ChunkKeyEncoding`. -# - STORE: Type of AbstractStore wrapped - -# # Chunk Path Formats - -# ## Zarr version 2 - -# ### '.' dimension separator (default) - -# Chunks are encoded as "1.2.3" - -# ### '/' dimension separator - -# Chunks are encoded as "1/2/3" - -# ## Zarr version 3 - -# ### '/' dimension separator (default) - -# Chunks are encoded as "c/1/2/3" - -# ### '.' dimension separator - -# Chunks are encoded as "c.1.2.3" - -# ### V2ChunkKeyEncoding{SEP} - -# See Zarr version 2 -# """ -# struct FormattedStore{V,SEP,STORE <: AbstractStore} <: AbstractStore -# parent::STORE -# end -# FormattedStore(args...) = FormattedStore{DV,DS}(args...) -# FormattedStore(s::FormattedStore) = s -# FormattedStore{V}(args...) where V = FormattedStore{V, default_sep(V)}(args...) -# FormattedStore{V}(s::FormattedStore{<:Any,S}) where {V,S} = FormattedStore{V, S}(s) -# FormattedStore{<: Any, S}(args...) where S = FormattedStore{DV, S}(args...) 
-# FormattedStore{<: Any, S}(s::FormattedStore{V}) where {V,S} = FormattedStore{V, S}(s) -# function FormattedStore{V,S}(store::AbstractStore) where {V,S} -# return FormattedStore{V,S,typeof(store)}(store) -# end -# function FormattedStore{V,S}(store::FormattedStore) where {V,S} -# p = parent(store) -# return FormattedStore{V,S,typeof(p)}(p) -# end - -# Base.parent(store::FormattedStore) = store.parent - -@inline citostring(i::CartesianIndex, cke::V3ChunkKeyEncoding) = "c$(cke.sep)" * join(reverse((i - oneunit(i)).I), cke.sep) -@inline citostring(::CartesianIndex{0}, cke::V3ChunkKeyEncoding) = "c$(cke.sep)0" -@inline citostring(i::CartesianIndex, cke::V2ChunkKeyEncoding) = join(reverse((i - oneunit(i)).I), cke.sep) -@inline citostring(::CartesianIndex{0}, cke::V2ChunkKeyEncoding) = "0" - -Base.getindex(s::FormattedStore, p, i::CartesianIndex) = s[p, citostring(i,s)] -Base.delete!(s::FormattedStore, p, i::CartesianIndex) = delete!(s, p, citostring(i,s)) -Base.setindex!(s::FormattedStore, v, p, i::CartesianIndex) = s[p, citostring(i,s)]=v - -isinitialized(s::FormattedStore, p, i::CartesianIndex) = isinitialized(s,p,citostring(i, s)) - -""" -- [`storagesize(d::AbstractStore, p::AbstractString)`](@ref storagesize) -- [`subdirs(d::AbstractStore, p::AbstractString)`](@ref subdirs) -- [`subkeys(d::AbstractStore, p::AbstractString)`](@ref subkeys) -- [`isinitialized(d::AbstractStore, p::AbstractString)`](@ref isinitialized) -- [`storefromstring(::Type{<: AbstractStore}, s, _)`](@ref storefromstring) -- `Base.getindex(d::AbstractStore, i::AbstractString)`: return the data stored in key `i` as a Vector{UInt8} -- `Base.setindex!(d::AbstractStore, v, i::AbstractString)`: write the values in `v` to the key `i` of the given store `d` -""" - -storagesize(d::FormattedStore, p::AbstractString) = storagesize(parent(d), p) -subdirs(d::FormattedStore, p::AbstractString) = subdirs(parent(d), p) -subkeys(d::FormattedStore, p::AbstractString) = subkeys(parent(d), p) 
-isinitialized(d::FormattedStore, p::AbstractString) = isinitialized(parent(d), p) -storefromstring(::Type{FormattedStore{<: Any, <: Any, STORE}}, s, _) where STORE = FormattedStore{DV,DS}(storefromstring(STORE, s)) -storefromstring(::Type{FormattedStore{V,S}}, s, _) where {V,S} = FormattedStore{DV,DS}(storefromstring(s)) -storefromstring(::Type{FormattedStore{V,S,STORE}}, s, _) where {V,S,STORE} = FormattedStore{V,S,STORE}(storefromstring(STORE, s)) -Base.getindex(d::FormattedStore, i::AbstractString) = getindex(parent(d), i) -Base.setindex!(d::FormattedStore, v, i::AbstractString) = setindex!(parent(d), v, i) -Base.delete!(d::FormattedStore, i::AbstractString) = delete!(parent(d), i) - - -function Base.getproperty(store::FormattedStore{V,S}, sym::Symbol) where {V,S} - if sym == :dimension_separator - return S - elseif sym == :zarr_format - return V - elseif sym ∈ propertynames(getfield(store, :parent)) - # Support forwarding of properties to parent - return getproperty(store.parent, sym) - else - getfield(store, sym) - end -end -function Base.propertynames(store::FormattedStore) - return (:dimension_separator, :zarr_format, fieldnames(typeof(store))..., propertynames(store.parent)...) -end - - -""" - Zarr.set_dimension_separator(store::FormattedStore{V}, sep::Char)::FormattedStore{V,sep} - -Returns a FormattedStore of the same type with the same `zarr_format` parameter, `V`, -but with a dimension separator of `sep`. Note that this does not mutate the original store. 
- -# Examples - -``` -julia> Zarr.set_dimension_separator(Zarr.FormattedStore{2, '.'}(Zarr.DictStore(), '/')) |> typeof -Zarr.FormattedStore{2, '/',Zarr.DictStore} -``` - -""" -function set_dimension_separator(store::FormattedStore{V}, sep::Char) where V - return FormattedStore{V,sep}(store) -end -function set_dimension_separator(store::AbstractStore, sep::Char) - return FormattedStore{<: Any,sep}(store) -end - -""" - set_zarr_format(::FormattedStore{<: Any, S}, zarr_format::Int)::FormattedStore{zarr_format,S} - -Returns a FormattedStore of the same type with the same `dimension_separator` parameter, `S`, -but with the specified `zarr_format` parameter. Note that this does not mutate the original store. - -# Examples - -``` -julia> Zarr.set_zarr_format(Zarr.FormattedStore{2, '.'}(Zarr.DictStore(), 3)) |> typeof -Zarr.FormattedStore{3, '.', DictStore} -``` - -""" -function set_zarr_format(store::FormattedStore{<: Any, S}, zarr_format::Int) where S - return FormattedStore{zarr_format,S}(store) -end -function set_zarr_format(store::AbstractStore, zarr_format::Int) - return FormattedStore{zarr_format}(store) -end - -dimension_separator(::AbstractStore) = DS -dimension_separator(::FormattedStore{<: Any,S}) where S = S -zarr_format(::AbstractStore) = DV -zarr_format(::FormattedStore{V}) where V = V - -is_zgroup(s::FormattedStore{3}, p, metadata=getmetadata(s, p, false)) = - isinitialized(s,_concatpath(p,"zarr.json")) && - metadata.node_type == "group" -is_zarray(s::FormattedStore{3}, p, metadata=getmetadata(s, p, false)) = - isinitialized(s,_concatpath(p,"zarr.json")) && - metadata.node_type == "array" - -getmetadata(s::FormattedStore{3}, p,fill_as_missing) = Metadata(String(maybecopy(s[p,"zarr.json"])),fill_as_missing) -function writemetadata(s::FormattedStore{3}, p, m::AbstractMetadata; indent_json::Bool= false) - met = IOBuffer() - - if indent_json - JSON.print(met,m,4) - else - JSON.print(met,m) - end - - s[p,"zarr.json"] = take!(met) - m -end - -function 
getattrs(s::FormattedStore{3}) - md = s[p,"zarr.json"] - if md === nothing - error("zarr.json not found") - else - md = JSON.parse(replace(String(maybecopy(md)),": NaN,"=>": \"NaN\",")) - return get(md, "attributes", Dict{String, Any}()) - end -end - -function writeattrs(s::FormattedStore{3}, p, att::Dict; indent_json::Bool= false) - # This is messy, we need to open zarr.json and replace the attributes section - md = s[p,"zarr.json"] - if md === nothing - error("zarr.json not found") - else - md = JSON.parse(replace(String(maybecopy(md)),": NaN,"=>": \"NaN\",")) - end - md = Dict(md) - md["attributes"] = att - - b = IOBuffer() - - if indent_json - JSON.print(b,md,4) - else - JSON.print(b,md) - end - - s[p,"zarr.json"] = take!(b) - att -end