fill value and write support

Alexander-Barth · Alexander-Barth · commit f8a6e0dfe502 · 2024-02-18T22:34:23.000+01:00
diff --git a/Project.toml b/Project.toml
@@ -7,6 +7,7 @@ version = "0.1.0"
 CommonDataModel = "1fbeeb36-5f17-413c-809b-666fb144f157"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 DiskArrays = "3c3547ce-8d99-4f5e-a174-61eb10b00ae3"
+JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
 Zarr = "0a941bbe-ad1d-11e8-39d9-ab76183a1d99"
 
 [compat]
diff --git a/docs/src/index.md b/docs/src/index.md
@@ -8,6 +8,7 @@ Modules = [ZarrDatasets]
 
 ### Differences between Zarr and NetCDF files
 
-* All metadata is stored in JSON files for Zarr with the following implications:
+* All metadata (in particular attributes) is stored in JSON files for the Zarr format with the following implications:
    * JSON does not distinguish between integers and real numbers. They are all considered as generic numbers. Whole numbers are loaded as `Int64` and decimal numbers `Float64`. It is not possible to store the number `1.0` as a real number.
    * The order of keys in a JSON document is undefined. It is therefore not possible to have a consistent ordering of the attributes or variables.
+   * The JSON standard does not allow the special floating-point values `NaN`, `+Inf` and `-Inf` (see https://github.com/capnproto/capnproto/issues/261).
diff --git a/src/ZarrDatasets.jl b/src/ZarrDatasets.jl
@@ -13,6 +13,9 @@ import CommonDataModel:
     attrib,
     attribnames,
     dataset,
+    defAttrib,
+    defVar,
+    defDim,
     dim,
     dimnames,
     iswritable,
@@ -29,6 +32,7 @@ import DiskArrays:
 import CommonDataModel as CDM
 using DataStructures
 using Zarr
+import JSON
 
 include("types.jl")
 include("dataset.jl")
diff --git a/src/dataset.jl b/src/dataset.jl
@@ -8,13 +8,7 @@ function CDM.variable(ds::ZarrDataset,varname::SymbolOrString)
     ZarrVariable{eltype(zarray),ndims(zarray),typeof(zarray),typeof(ds)}(zarray,ds)
 end
 
-CDM.dimnames(ds::ZarrDataset) = Tuple(
-    sort(
-        unique(
-            reduce(vcat,
-                   (collect(dimnames(variable(ds,vn))) for vn in keys(ds)),
-                   init = String[]
-                   ))))
+# Dimension names of the dataset as a tuple of strings, in the iteration
+# order of `ds.dimensions`.
+CDM.dimnames(ds::ZarrDataset) = Tuple(String(dn) for dn in keys(ds.dimensions))
 
 # function CDM.unlimited(ds::ZarrDataset)
 #     ul = ds.unlimited
@@ -33,31 +27,35 @@ CDM.dimnames(ds::ZarrDataset) = Tuple(
 #     return nothing
 # end
 
-function CDM.dim(ds::ZarrDataset,dimname::SymbolOrString)
+# Length of the dimension `dimname`, looked up in `ds.dimensions`
+# (a `KeyError` is raised by the dictionary if the dimension is unknown).
+function CDM.dim(ds::ZarrDataset,dimname::SymbolOrString)
+    key = Symbol(dimname)
+    return ds.dimensions[key]
+end
 
-    for vn in keys(ds)
-        v = variable(ds,vn)
-        dn = dimnames(v)
-        i = findfirst(==(dimname),dn)
-        if !isnothing(i)
-            return size(v,i)
-        end
-    end
-    error("dimension $dimname not found")
+# Register the dimension `dimname` with length `dimlen` in the dataset.
+# The dimension is only recorded in `ds.dimensions`; nothing is written to
+# the store by this function.
+# Throws an `ArgumentError` if a dimension with this name already exists.
+# Returns `dimlen`.
+function CDM.defDim(ds::ZarrDataset,dimname::SymbolOrString,dimlen)
+    dn = Symbol(dimname)
+    # explicit check instead of @assert: assertions may be disabled and
+    # must not be relied upon for validating user input
+    if haskey(ds.dimensions,dn)
+        throw(ArgumentError("dimension $dimname is already defined"))
+    end
+    ds.dimensions[dn] = dimlen
 end
 
 CDM.varnames(ds::ZarrDataset) = keys(ds.zgroup.arrays)
 
 CDM.attribnames(ds::ZarrDataset) = keys(ds.zgroup.attrs)
 CDM.attrib(ds::ZarrDataset,name::SymbolOrString) = ds.zgroup.attrs[String(name)]
 
+# Set the global attribute `name` of the dataset to `value` and write all
+# group attributes to the `.zattrs` entry of the group in the store.
+# Raises an error if the dataset is not writable.
+function CDM.defAttrib(ds::ZarrDataset,name::SymbolOrString,value)
+    # explicit check instead of @assert, which may be disabled
+    iswritable(ds) || error("dataset is not writable")
+
+    ds.zgroup.attrs[String(name)] = value
+
+    # serialize the complete attribute dictionary as JSON
+    storage = ds.zgroup.storage
+    io = IOBuffer()
+    JSON.print(io, ds.zgroup.attrs)
+    storage[ds.zgroup.path,".zattrs"] = take!(io)
+end
 
 CDM.groupnames(ds::ZarrDataset) = keys(ds.zgroup.groups)
 CDM.group(ds::ZarrDataset,name::SymbolOrString) = ZarrDataset(ds.zgroup.groups,String(name),ds)
 
 
 CDM.parentdataset(ds::ZarrDataset) = ds.parentdataset
-CDM.iswritable(ds::ZarrDataset) = false
+# Whether the dataset was opened in a mode that allows writing
+function CDM.iswritable(ds::ZarrDataset)
+    return ds.iswritable
+end
 CDM.maskingvalue(ds::ZarrDataset) = ds.maskingvalue
 
 
@@ -108,12 +106,38 @@ end # implicit call to close(ds)
 function ZarrDataset(url::AbstractString,mode = "r";
                      parentdataset = nothing,
                      _omitcode = 404,
-                     maskingvalue = missing)
-    ds = Zarr.zopen(url,mode)
-    if ds.storage isa Zarr.HTTPStore
-        Zarr.missing_chunk_return_code!(ds.storage,_omitcode)
+                     maskingvalue = missing,
+                     attrib = Dict(),
+                     )
+
+    # dimension name => length, filled from the arrays when reading or
+    # later via defDim when creating
+    dimensions = OrderedDict{Symbol,Int}()
+    iswritable = false
+
+    if mode == "r"
+        zg = Zarr.zopen(url,mode)
+        if (zg.storage isa Zarr.HTTPStore) ||
+            (zg.storage isa Zarr.ConsolidatedStore{Zarr.HTTPStore})
+            @debug "omit chunks on HTTP error" _omitcode
+            Zarr.missing_chunk_return_code!(zg.storage,_omitcode)
+        end
+
+        for (varname,zarray) in zg.arrays
+            # the names in _ARRAY_DIMENSIONS are reversed so that they line
+            # up with size(zarray)
+            for (dimname,dimlen) in zip(reverse(zarray.attrs["_ARRAY_DIMENSIONS"]),size(zarray))
+
+                dn = Symbol(dimname)
+                # record the length on first use; every later use of the
+                # same dimension name must have the same length
+                known_len = get!(dimensions,dn,dimlen)
+                if known_len != dimlen
+                    error("dimension $dimname has inconsistent lengths: $known_len and $dimlen (variable $varname)")
+                end
+            end
+        end
+    elseif mode == "c"
+        store = Zarr.DirectoryStore(url)
+        zg = zgroup(store, "",attrs = Dict(attrib))
+        iswritable = true
+    else
+        # without this branch `zg` would be undefined below
+        throw(ArgumentError("unsupported mode \"$mode\"; use \"r\" (read) or \"c\" (create)"))
     end
-    ZarrDataset(ds,parentdataset,maskingvalue)
+    ZarrDataset(zg,parentdataset,dimensions,iswritable,maskingvalue)
 end
 
 
diff --git a/src/types.jl b/src/types.jl
@@ -7,5 +7,7 @@ end
 struct ZarrDataset{TZ,TP,Tmaskingvalue} <: CDM.AbstractDataset
     zgroup::TZ
     parentdataset::TP
+    # dimension name => dimension length
+    dimensions::OrderedDict{Symbol,Int}
+    # value returned by CDM.iswritable for this dataset
+    iswritable::Bool
     maskingvalue::Tmaskingvalue
 end
diff --git a/src/variable.jl b/src/variable.jl
@@ -10,12 +10,61 @@ CDM.name(v::ZarrVariable) = Zarr.zname(v.zarray)
 CDM.dimnames(v::ZarrVariable) = Tuple(reverse(v.zarray.attrs["_ARRAY_DIMENSIONS"]))
 CDM.dataset(v::ZarrVariable) = v.parentdataset
 
-CDM.attribnames(v::ZarrVariable) = filter(!=("_ARRAY_DIMENSIONS"),keys(v.zarray.attrs))
-CDM.attrib(v::ZarrVariable,name::SymbolOrString) = v.zarray.attrs[String(name)]
+# Attribute names of the variable: all keys of the array attributes except
+# `_ARRAY_DIMENSIONS`, plus `_FillValue` when the array metadata defines a
+# fill value.
+function CDM.attribnames(v::ZarrVariable)
+    attnames = filter(keys(v.zarray.attrs)) do key
+        key != "_ARRAY_DIMENSIONS"
+    end
+
+    fill_value = v.zarray.metadata.fill_value
+    isnothing(fill_value) || push!(attnames,"_FillValue")
+
+    return attnames
+end
+
+# Value of the attribute `name` of the variable. `_FillValue` is taken from
+# the array metadata when a fill value is defined there; every other lookup
+# goes to the array attributes.
+function CDM.attrib(v::ZarrVariable,name::SymbolOrString)
+    attname = String(name)
+
+    if attname == "_FillValue"
+        fill_value = v.zarray.metadata.fill_value
+        isnothing(fill_value) || return fill_value
+    end
+
+    return v.zarray.attrs[attname]
+end
+
+# Set the attribute `name` of the variable `v` to `value` and write all
+# attributes of the array to its `.zattrs` entry in the store.
+# Raises an error if the parent dataset is not writable, and an
+# `ArgumentError` for `_FillValue`, which cannot be set through this
+# function.
+function CDM.defAttrib(v::ZarrVariable,name::SymbolOrString,value)
+    # explicit checks instead of @assert, which may be disabled
+    iswritable(dataset(v)) || error("dataset is not writable")
+
+    attname = String(name)
+    if attname == "_FillValue"
+        throw(ArgumentError("_FillValue cannot be set with defAttrib"))
+    end
+
+    v.zarray.attrs[attname] = value
+
+    # serialize the complete attribute dictionary as JSON
+    storage = v.zarray.storage
+    io = IOBuffer()
+    JSON.print(io, v.zarray.attrs)
+    storage[v.zarray.path,".zattrs"] = take!(io)
+end
 
 
 # DiskArray methods
 eachchunk(v::ZarrVariable) = eachchunk(v.zarray)
 haschunks(v::ZarrVariable) = haschunks(v.zarray)
 eachchunk(v::CFVariable{T,N,<:ZarrVariable}) where {T,N} = eachchunk(v.var)
 haschunks(v::CFVariable{T,N,<:ZarrVariable}) where {T,N} = haschunks(v.var)
+
+
+# Create the variable `name` with element type `vtype` in the dataset `ds`.
+#
+# `dimensionnames` lists the dimensions of the variable; each of them must
+# already be present in `ds.dimensions` (see `defDim`), otherwise the lookup
+# raises a `KeyError`.
+# `chunksizes` defaults to the full size of the variable (a single chunk).
+# `attrib` holds the attributes of the variable (a dictionary or any
+# iterable of key => value pairs); it is copied and not modified.
+# All remaining keyword arguments are forwarded to `zcreate`.
+#
+# Raises an error if the dataset is not writable.
+# Returns the new `ZarrVariable`.
+function CDM.defVar(ds::ZarrDataset,name::SymbolOrString,vtype::DataType,dimensionnames; chunksizes=nothing, attrib = Dict(), kwargs...)
+    # explicit check instead of @assert, which may be disabled
+    iswritable(ds) || error("dataset is not writable")
+
+    # copy into a Dict{String,Any}: a narrowly typed dictionary such as
+    # Dict{String,String} could not hold the list of dimension names
+    _attrib = Dict{String,Any}(String(k) => v for (k,v) in attrib)
+    # dimension names are stored in reverse order (CDM.dimnames reverses
+    # them again when reading)
+    _attrib["_ARRAY_DIMENSIONS"] =
+        String[String(dn) for dn in reverse(collect(dimensionnames))]
+
+    _size = ntuple(length(dimensionnames)) do i
+        ds.dimensions[Symbol(dimensionnames[i])]
+    end
+
+    if isnothing(chunksizes)
+        chunksizes = _size
+    end
+    # String(name) so that a Symbol is accepted as variable name too
+    zarray = zcreate(
+        vtype, ds.zgroup, String(name), _size...;
+        chunks = chunksizes,
+        attrs = _attrib,
+        kwargs...
+    )
+
+    # same construction as in CDM.variable: take the element type from the
+    # array that was actually created
+    return ZarrVariable{eltype(zarray),ndims(zarray),typeof(zarray),typeof(ds)}(
+        zarray,ds)
+end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -4,4 +4,5 @@ using ZarrDatasets
 @testset "ZarrDatasets.jl" begin
     include("test_cdm.jl")
     include("test_multifile.jl")
+    include("test_write.jl")
 end
diff --git a/test/test_cdm.jl b/test/test_cdm.jl
@@ -1,4 +1,9 @@
-using CommonDataModel: iswritable, attribnames, parentdataset, load!, dataset
+using CommonDataModel:
+    attribnames,
+    dataset,
+    iswritable,
+    load!,
+    parentdataset
 using Dates
 using DiskArrays
 using NCDatasets
diff --git a/test/test_write.jl b/test/test_write.jl
@@ -0,0 +1,49 @@
+using ZarrDatasets
+using ZarrDatasets:
+    defDim,
+    defVar,
+    defAttrib
+using Zarr
+using DataStructures
+
+# Round trip: create a Zarr dataset on disk, define dimensions, a variable
+# and attributes, then reopen it and check what was stored.
+data = rand(Int32,3,5)
+
+# mktempdir creates the directory and removes it again when the process
+# exits, so the test does not leave files behind
+fname = mktempdir()
+gattrib = Dict{String,Any}("title" => "this is the title")
+ds = ZarrDataset(fname,"c",attrib = gattrib)
+
+defDim(ds,"lon",3)
+defDim(ds,"lat",5)
+
+attrib = Dict{String,Any}(
+    "units" => "m/s",
+    "long_name" => "test",
+)
+
+
+varname = "var2"
+dimensionnames = ("lon","lat")
+vtype = Int32
+
+zv = defVar(ds,varname,vtype,dimensionnames, attrib = attrib)
+zv[:,:] = data
+# attributes defined after the variable has been created
+zv.attrib["lala"] = 12
+zv.attrib["standard_name"] = "test"
+ds.attrib["history"] = "test"
+close(ds)
+
+# reopen (default mode "r") and verify attributes and data
+ds = ZarrDataset(fname)
+
+zv = ds[varname]
+
+@test zv.attrib["lala"] == 12
+@test zv.attrib["standard_name"] == "test"
+@test ds.attrib["history"] == "test"
+
+@test zv[:,:] == data
+
+# the textual representation includes the global attributes section
+io = IOBuffer()
+show(io,ds)
+str = String(take!(io))
+@test occursin("Global",str)