diff --git a/.github/workflows/IntegrationTest.yml b/.github/workflows/IntegrationTest.yml index b1a4b4e..631f300 100644 --- a/.github/workflows/IntegrationTest.yml +++ b/.github/workflows/IntegrationTest.yml @@ -23,6 +23,7 @@ jobs: - {user: JuliaGeo, repo: GRIBDatasets.jl} - {user: Alexander-Barth, repo: TIFFDatasets.jl} - {user: JuliaGeo, repo: ZarrDatasets.jl} + - {user: eumetsat, repo: MetopDatasets.jl} steps: - uses: actions/checkout@v4 diff --git a/Project.toml b/Project.toml index bf8777e..01f8e85 100644 --- a/Project.toml +++ b/Project.toml @@ -4,12 +4,13 @@ keywords = ["netcdf", "GRIB", "climate and forecast conventions", "oceanography" license = "MIT" desc = "CommonDataModel is a module that defines types common to NetCDF and GRIB data" authors = ["Alexander Barth "] -version = "0.3.9" +version = "0.4.0" [deps] CFTime = "179af706-886a-5703-950a-314cd64e0468" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" +DiskArrays = "3c3547ce-8d99-4f5e-a174-61eb10b00ae3" Preferences = "21216c6a-2e73-6563-6e65-726566657250" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" @@ -18,6 +19,7 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" CFTime = "0.1.1, 0.2" DataStructures = "0.17, 0.18" Dates = "1" +DiskArrays = "0.4.15" Preferences = "1.3" Printf = "1" Statistics = "1" diff --git a/README.md b/README.md index 0356358..481c51e 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ This package contains abstracts type definition for loading and manipulating GRI Features include: -* query and edit metadata of arrays and datasets +* query and edit metadata of arrays and datasets * virtually concatenating multiple files along a given dimension and merging virtually different datasets * create a virtual subset (`view`) by indices or by values of coordinate variables (`CommonDataModel.select`, `CommonDataModel.@select`) * group, map and reduce a variable (`CommonDataModel.groupby`, `CommonDataModel.@groupby`) and rolling reductions like running means `CommonDataModel.rolling`) diff --git a/docs/Project.toml b/docs/Project.toml index 891c5c6..0e4b913 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -4,12 +4,17 @@ CommonDataModel = "1fbeeb36-5f17-413c-809b-666fb144f157" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6" -GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a" IntervalSets = "8197267c-284f-5f27-9208-e0e47529a953" Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306" NCDatasets = "85f8d34a-cbdd-5861-8df4-14fed0d494ab" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [compat] +CairoMakie = "0.10" +Dates = "1" +Documenter = "1" +Downloads = "1" +IntervalSets = "0.7" +Literate = "2" NCDatasets = "0.14" -GLMakie = "0.8" +Statistics = "1" diff --git a/docs/make.jl b/docs/make.jl index 19114a3..ad10f4b 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,3 +1,6 @@ +using Pkg +Pkg.activate(@__DIR__) + using Documenter: Documenter, makedocs, deploydocs using CommonDataModel using Literate @@ -6,7 +9,7 @@ Literate.markdown( "docs/src/tutorial1.jl","docs/src", execute = true, documenter = true, - # We add the credit to Literate.jl the footer + # We add the credit to Literate.jl in the footer credit = false, ) @@ -32,6 +35,6 @@ makedocs(; ], ) -deploydocs(; +deploydocs( repo="github.com/JuliaGeo/CommonDataModel.jl", ) diff --git a/src/CommonDataModel.jl b/src/CommonDataModel.jl index 61a01ad..1c59baf 100644 --- a/src/CommonDataModel.jl +++ b/src/CommonDataModel.jl @@ -7,6 +7,20 @@ using Dates using Printf using Preferences using DataStructures +import DiskArrays: + AbstractDiskArray, + AbstractSubDiskArray, + subarray, + writeblock!, + readblock!, + ChunkStyle, + haschunks, + eachchunk, + Unchunked, + Chunked, + GridChunks + +import DiskArrays import Base: LogicalIndex, checkbounds, diff --git a/src/attribute.jl b/src/attribute.jl index b7d025e..456edbc 100644 --- a/src/attribute.jl +++ b/src/attribute.jl @@ -35,7 +35,7 @@ function delAttrib(ds::Union{AbstractDataset,AbstractVariable},name::SymbolOrStr end -attribs(ds::Union{AbstractDataset,AbstractVariable}) = +attribs(ds::Union{AbstractDataset,AbstractVariable, SubVariable}) = OrderedDict((dn,attrib(ds,dn)) for dn in attribnames(ds)) diff --git a/src/cfconventions.jl b/src/cfconventions.jl index 7423af6..7c24dfc 100644 --- a/src/cfconventions.jl +++ b/src/cfconventions.jl @@ -17,7 +17,7 @@ variable `ncv` with the standard name modifier `modifier`. It can be used for example to access related variable like status flags. """ -function ancillaryvariables(ncv::CFVariable,modifier) +function ancillaryvariables(ncv::Union{CFVariable, SubVariable{<:Any,<:Any,<:CFVariable}},modifier) ds = dataset(ncv) varname = name(ncv) @@ -44,7 +44,7 @@ allowmissing(x::AbstractArray{T}) where {T} = convert(AbstractArray{Union{T, Mis -function _filter(ncv::AbstractVariable, indices...; accepted_status_flags = nothing) +function _filter(ncv::Union{AbstractVariable,SubVariable}, indices...; accepted_status_flags = nothing) data = allowmissing(ncv[indices...]) if (accepted_status_flags != nothing) @@ -99,13 +99,13 @@ good_data = NCDatasets.filter(ds["data"],:,:, accepted_status_flags = ["good_dat ``` """ -filter(ncv::AbstractVariable, indices::TIndices...; kwargs...) = +filter(ncv::Union{AbstractVariable,SubVariable}, indices::TIndices...; kwargs...) = _filter(ncv, indices...; kwargs...) -filter(ncv::AbstractVariable, indices::Union{Vector{<:Integer}, Array{<:CartesianIndex}}...; kwargs...) = +filter(ncv::Union{AbstractVariable,SubVariable}, indices::Union{Vector{<:Integer}, Array{<:CartesianIndex}}...; kwargs...) = _filter(ncv, indices...; kwargs...) -filter(ncv::AbstractVariable, indices::BitVector; kwargs...) = +filter(ncv::Union{AbstractVariable,SubVariable}, indices::BitVector; kwargs...) = _filter(ncv, indices...; kwargs...) """ @@ -127,7 +127,7 @@ v = ncv[:] close(ds) ``` """ -function coord(v::AbstractVariable,standard_name) +function coord(v::Union{AbstractVariable,SubVariable},standard_name) matches = Dict( "time" => [r".*since.*"], # It is great to have choice! diff --git a/src/cfvariable.jl b/src/cfvariable.jl index 7e30d1b..86c55ca 100644 --- a/src/cfvariable.jl +++ b/src/cfvariable.jl @@ -84,7 +84,7 @@ ds = NCDataset("foo.nc"); close(ds) ``` """ -function cfvariable(ds, +function cfvariable(ds, varname; _v = variable(ds,varname), attrib = _v.attrib, @@ -441,6 +441,19 @@ end return CFinvtransform(data,fv,inv_scale_factor,minus_offset,time_origin,inv_time_factor,maskingvalue,DT) end +## Define for DiskArrays +@inline function CFinvtransformdata(data::AbstractDiskArray{T,N},fv,scale_factor,add_offset,time_origin,time_factor,maskingvalue,DT) where {T,N} + data_materialized = Array(data) + return CFinvtransformdata(data_materialized,fv,scale_factor,add_offset,time_origin,time_factor,maskingvalue,DT) +end + +@inline function CFinvtransformdata( + data::AbstractDiskArray{T,N},fv::Tuple{},scale_factor::Nothing, + add_offset::Nothing,time_origin::Nothing,time_factor::Nothing,maskingvalue,::Type{T}) where {T,N} + # no transformation necessary (avoid allocation) + return data +end + # this function is necessary to avoid "iterating" over a single character in Julia 1.0 (fixed Julia 1.3) @@ -448,49 +461,63 @@ end #@inline CFtransformdata(data::Char,fv,scale_factor,add_offset,time_origin,time_factor,DTcast) = CFtransform_missing(data,fv) #@inline CFinvtransformdata(data::Char,fv,scale_factor,add_offset,time_origin,time_factor,DT) = CFtransform_replace_missing(data,fv) -function Base.getindex(v::CFVariable, indexes::TIndices...) - data = parent(v)[indexes...] - return CFtransformdata(data,fill_and_missing_values(v),scale_factor(v),add_offset(v), - time_origin(v),time_factor(v),maskingvalue(v),eltype(v)) +function DiskArrays.readblock!(v::CFVariable{T, N}, + aout, + indexes::Vararg{OrdinalRange, N}) where {T, N} + + parent_var = parent(v) + data = similar(aout, eltype(parent_var)) + DiskArrays.readblock!(parent_var, data, indexes...) + + aout .= CFtransformdata(data,fill_and_missing_values(v),scale_factor(v),add_offset(v), + time_origin(v),time_factor(v),maskingvalue(v),eltype(v)) + + + return nothing end -function Base.setindex!(v::CFVariable,data::Array{Missing,N},indexes::TIndices...) where N - parent(v)[indexes...] = fill(fillvalue(v),size(data)) + +function DiskArrays.writeblock!(v::CFVariable{T, N}, data::Array{Missing,N}, indexes::Vararg{OrdinalRange, N}) where {T, N} + parent(v)[indexes...] .= fillvalue(v) end -function Base.setindex!(v::CFVariable,data::Missing,indexes::TIndices...) - parent(v)[indexes...] = fillvalue(v) +function DiskArrays.writeblock!(v::CFVariable{T, N}, data::Missing, indexes::Vararg{OrdinalRange, N}) where {T, N} + parent(v)[indexes...] .= fillvalue(v) end -function Base.setindex!(v::CFVariable,data::Union{T,Array{T}},indexes::TIndices...) where T <: Union{AbstractCFDateTime,DateTime,Missing} +function DiskArrays.writeblock!(v::CFVariable{T, N}, data::Union{DT,Array{DT}}, indexes::Vararg{OrdinalRange, N}) where {T, N, DT <: Union{AbstractCFDateTime,DateTime,Missing}} if calendar(v) !== nothing # can throw an convertion error if calendar attribute already exists and # is incompatible with the provided data - parent(v)[indexes...] = CFinvtransformdata( + data_transformed = CFinvtransformdata( data,fill_and_missing_values(v),scale_factor(v),add_offset(v), time_origin(v),time_factor(v), maskingvalue(v), eltype(parent(v))) + + DiskArrays.writeblock!(parent(v), data_transformed, indexes...) + return data end @error "Time units and calendar must be defined during defVar and cannot change" end +function DiskArrays.writeblock!(v::CFVariable{T,N}, data, indexes::Vararg{OrdinalRange, N}) where {T, N} -function Base.setindex!(v::CFVariable,data,indexes::TIndices...) - parent(v)[indexes...] = CFinvtransformdata( - data,fill_and_missing_values(v), - scale_factor(v),add_offset(v), - time_origin(v),time_factor(v), - maskingvalue(v), - eltype(parent(v))) + data_transformed = CFinvtransformdata( + data,fill_and_missing_values(v), + scale_factor(v),add_offset(v), + time_origin(v),time_factor(v), + maskingvalue(v), + eltype(parent(v))) - return data + DiskArrays.writeblock!(parent(v), data_transformed, indexes...) end + # can be implemented overridden for faster implementation function boundsParentVar(ds,varname) for vn in varnames(ds) @@ -526,7 +553,7 @@ function _getattrib(ds,v,parentname,attribname,default) end end -function _isrelated(v1::AbstractVariable,v2::AbstractVariable) +function _isrelated(v1::Union{AbstractVariable,SubVariable},v2::Union{AbstractVariable,SubVariable}) dimnames(v1) ⊆ dimnames(v2) end diff --git a/src/dataset.jl b/src/dataset.jl index 973484f..ee4fe9e 100644 --- a/src/dataset.jl +++ b/src/dataset.jl @@ -171,7 +171,7 @@ function Base.getindex(ds::AbstractDataset,varname::SymbolOrString) end -function Base.setindex!(ds::AbstractDataset,data::AbstractVariable,varname::SymbolOrString) +function Base.setindex!(ds::AbstractDataset,data::Union{AbstractVariable, SubVariable},varname::SymbolOrString) return defVar(ds, varname, data) end @@ -202,7 +202,7 @@ julia> data = varbyattrib(ds, standard_name = "longitude")[1][:] ``` """ -function varbyattrib(ds::Union{AbstractDataset,AbstractVariable}; kwargs...) +function varbyattrib(ds::Union{AbstractDataset,AbstractVariable, SubVariable}; kwargs...) # Start with an empty list of variables varlist = [] @@ -385,6 +385,15 @@ end end end +@inline function Base.propertynames(ds::Union{AbstractDataset,AbstractVariable},private::Bool=false) + names = fieldnames(typeof(ds)) + + if ds isa AbstractDataset + return (names...,:attrib,:dim,:group) + else + return (names...,:attrib,:dim) + end +end for (item_color,default) in ( (:section_color, :red), diff --git a/src/defer.jl b/src/defer.jl index 9f42462..3f3cbed 100644 --- a/src/defer.jl +++ b/src/defer.jl @@ -111,9 +111,9 @@ variable(dds::DeferDataset,varname::Symbol) = variable(dds,string(varname)) dataset(dv::DeferVariable{T,N,TDS}) where {T,N,TDS} = DeferDataset(TDS,dv.r.filename,dv.r.mode; dv.r.args...) -function Base.getindex(dv::DeferVariable,indexes::Union{Int,Colon,AbstractRange{<:Integer}}...) +function DiskArrays.readblock!(dv::DeferVariable{T, N}, aout,indexes::Vararg{OrdinalRange, N}) where {T, N} Variable(dv) do v - return v[indexes...] + aout .= v[indexes...] end end diff --git a/src/groupby.jl b/src/groupby.jl index d34688b..92dab8d 100644 --- a/src/groupby.jl +++ b/src/groupby.jl @@ -279,7 +279,7 @@ function Base.broadcasted(::GroupedVariableStyle,f::Function,A::GroupedVariable{ A.v,A.coordname,A.group_fun,A.groupmap,A.dim,map_fun) end -function GroupedVariable(v::TV,coordname,group_fun::TF,groupmap,dim,map_fun::TM) where TV <: AbstractVariable where {TF,TM} +function GroupedVariable(v::TV,coordname,group_fun::TF,groupmap,dim,map_fun::TM) where TV <: Union{AbstractVariable,SubVariable} where {TF,TM} TGM = typeof(groupmap) #TG = Base.return_types(selectdim,(TV,Int,Int,))[1] @@ -361,7 +361,7 @@ close(ds) ``` """ -function groupby(v::AbstractVariable,(coordname,group_fun)::Pair{<:SymbolOrString,TF}) where TF +function groupby(v::Union{AbstractVariable,SubVariable},(coordname,group_fun)::Pair{<:SymbolOrString,TF}) where TF # for NCDatasets 0.12 c = v[String(coordname)][:] class = group_fun.(c) @@ -462,6 +462,9 @@ Base.BroadcastStyle(::Type{<:ReducedGroupedVariable}) = ReducedGroupedVariableSt Base.BroadcastStyle(::DefaultArrayStyle,::ReducedGroupedVariableStyle) = ReducedGroupedVariableStyle() Base.BroadcastStyle(::ReducedGroupedVariableStyle,::DefaultArrayStyle) = ReducedGroupedVariableStyle() +Base.BroadcastStyle(::DiskArrays.ChunkStyle,::ReducedGroupedVariableStyle) = ReducedGroupedVariableStyle() +Base.BroadcastStyle(::ReducedGroupedVariableStyle,::DiskArrays.ChunkStyle) = ReducedGroupedVariableStyle() + function Base.similar(bc::Broadcasted{ReducedGroupedVariableStyle}, ::Type{ElType}) where ElType # Scan the inputs for the ReducedGroupedVariable: A = find_gv(ReducedGroupedVariable,bc) @@ -587,3 +590,16 @@ function dataset(gr::ReducedGroupedVariable) gr.reduce_fun, ) end + +# ReducedGroupedVariable is a Variable and therefore an AbstractDiskArray. +# getindex is overloaded for ReducedGroupedVariable and therefore +# ReducedGroupedVariable is defined to use getindex. +# An alternative solution is to remove getindex definitions +# and then implement the read logic in readblock! +function DiskArrays.readblock!(gr::ReducedGroupedVariable{T,N}, + aout, + indexes::Vararg{OrdinalRange, N}) where {T,N} + + aout .= Base.getindex(gr,indexes...) + return aout +end diff --git a/src/memory_dataset.jl b/src/memory_dataset.jl index 65c39c2..fe3640d 100644 --- a/src/memory_dataset.jl +++ b/src/memory_dataset.jl @@ -70,10 +70,18 @@ end Base.parent(v::MemoryVariable) = v.data Base.size(v::MemoryVariable) = size(parent(v)) -Base.getindex(v::MemoryVariable,ij::TIndices...) = parent(v)[ij...] -function Base.setindex!(v::MemoryVariable,data,ij...) + +function DiskArrays.readblock!(v::MemoryVariable{T, N}, + aout, + indexes::Vararg{OrdinalRange, N}) where {T, N} + + aout .= parent(v)[indexes...] +end + + +function DiskArrays.writeblock!(v::MemoryVariable{T, N}, data, indexes::Vararg{OrdinalRange, N}) where {T, N} sz = size(v) - parent(v)[ij...] = data + parent(v)[indexes...] = data root = _root(v) for idim = findall(size(v) .> sz) @@ -83,6 +91,19 @@ function Base.setindex!(v::MemoryVariable,data,ij...) return data end +function DiskArrays.writeblock!(v::MemoryVariable{T, 0}, data) where {T} + sz = size(v) + parent(v)[] = data[] + + root = _root(v) + for idim = findall(size(v) .> sz) + dname = v.dimnames[idim] + grow_unlimited_dimension(v.parent_dataset,dname,size(v,idim)) + end + return data +end + + CDM.load!(v::MemoryVariable,buffer,ij...) = buffer .= view(parent(v),ij...) CDM.name(v::Union{MemoryVariable,MemoryDataset}) = v.name CDM.dimnames(v::MemoryVariable) = v.dimnames diff --git a/src/multifile.jl b/src/multifile.jl index 66b3f64..87ebb45 100644 --- a/src/multifile.jl +++ b/src/multifile.jl @@ -181,12 +181,33 @@ maskingvalue(mfds::MFDataset) = maskingvalue(mfds.ds[1]) Base.parent(v::MFVariable) = v.var Base.parent(v::MFCFVariable) = v.var Base.Array(v::MFVariable) = Array(parent(v)) -Base.getindex(v::MFVariable,indexes::TIndices...) = getindex(parent(v),indexes...) -Base.setindex!(v::MFVariable,data,indexes::TIndices...) = setindex!(parent(v),data,indexes...) + +function DiskArrays.readblock!(v::MFVariable{T, N}, aout,indexes::Vararg{OrdinalRange, N}) where {T, N} + aout .= parent(v)[indexes...] +end +function DiskArrays.readblock!(v::MFCFVariable{T, N}, aout, indexes::Vararg{OrdinalRange, N}) where {T, N} + aout .= v.cfvar[indexes...] +end + +function DiskArrays.writeblock!(v::MFVariable{T,N}, data, indexes::Vararg{OrdinalRange, N}) where {T, N} + parent(v)[indexes...] = data +end + +function DiskArrays.writeblock!(v::MFVariable{T,0}, data) where {T} + parent(v)[] = data[] +end + +function DiskArrays.writeblock!(v::MFCFVariable{T,N}, data, indexes::Vararg{OrdinalRange, N}) where {T, N} + v.cfvar[indexes...] = data +end + +function DiskArrays.writeblock!(v::MFCFVariable{T,0}, data) where {T} + v.cfvar[] = data[] +end + Base.size(v::MFVariable) = size(parent(v)) Base.size(v::MFCFVariable) = size(parent(v)) -Base.getindex(v::MFCFVariable,ind::TIndices...) = v.cfvar[ind...] -Base.setindex!(v::MFCFVariable,data,ind::TIndices...) = v.cfvar[ind...] = data + function Base.cat(vs::AbstractVariable...; dims::Integer) CatArrays.CatArray(dims,vs...) end @@ -288,10 +309,19 @@ function chunking(v::MFVariable) storage,chunksizes = chunking(v1) if storage == :contiguous - return (:chunked, size(v1)) - else - return storage,chunksizes + storage = :chunked + chunksizes = size(v1) + end + + if ndims(v) == (length(chunksizes)+1) + cat_dim = v.var.dim + chunksizes = (chunksizes[1:(cat_dim-1)]...,1,chunksizes[cat_dim:end]...) + end + + if ndims(v) != length(chunksizes) + return DimensionMismatch("Chunking length not matching MFVariable dimension") end + return storage, chunksizes end deflate(v::MFVariable) = deflate(v.ds.ds[1][name(v)]) diff --git a/src/select.jl b/src/select.jl index 0abe497..b95e935 100644 --- a/src/select.jl +++ b/src/select.jl @@ -359,7 +359,7 @@ function select(v,conditions...) end end -function coordinate_value(v::AbstractVariable,name_coord::Symbol) +function coordinate_value(v::Union{AbstractVariable,SubVariable},name_coord::Symbol) ncv = dataset(v)[name_coord] @assert ndims(ncv) == 1 dimension_name = dimnames(ncv)[1] @@ -373,7 +373,7 @@ function coordinate_value(v::AbstractVariable,name_coord::Symbol) end -function coordinate_names(v::AbstractVariable) +function coordinate_names(v::Union{AbstractVariable,SubVariable}) ds = dataset(v) dimension_names = dimnames(v) diff --git a/src/subvariable.jl b/src/subvariable.jl index 73f9461..d1ffa3f 100644 --- a/src/subvariable.jl +++ b/src/subvariable.jl @@ -1,11 +1,39 @@ -Base.parent(v::SubVariable) = v.parent -Base.parentindices(v::SubVariable) = v.indices -Base.size(v::SubVariable) = _shape_after_slice(size(parent(v)),v.indices...) +function Base.show(io::IO, v::SubVariable) + level = get(io, :level, 0) + indent = " " ^ get(io, :level, 0) + delim = " × " + try + indices = parentindices(v) + print(io,indent,"View: ",join(indices,delim),"\n") + show(IOContext(io,:level=>level+1),parent(v)) + catch err + @warn "error in show" err + print(io,"SubVariable (dataset closed)") + end +end + +Base.show(io::IO,::MIME"text/plain",v::SubVariable) = show(io,v) + +DiskArrays.subarray(v::SubVariable) = v.v + +function Base.getproperty(sub_var::SubVariable, name::Symbol) + if !hasfield(typeof(sub_var),name) + parent_var = parent(sub_var) + if name == :var + # if var also return a view + return view(parent_var.var, sub_indices(sub_var)...) + else + return Base.getproperty(parent_var, name) + end + else + return getfield(sub_var,name) # get field from sub_var + end +end function dimnames(v::SubVariable) dimension_names = dimnames(parent(v)) - return dimension_names[map(i -> !(i isa Integer),collect(v.indices))] + return dimension_names[map(i -> !(i isa Integer),collect(parentindices(v)))] end name(v::SubVariable) = name(parent(v)) @@ -13,61 +41,28 @@ name(v::SubVariable) = name(parent(v)) attribnames(v::SubVariable) = attribnames(parent(v)) attrib(v::SubVariable,name::SymbolOrString) = attrib(parent(v),name) defAttrib(v::SubVariable,name::SymbolOrString,data) = defAttrib(parent(v),name,data) - -function SubVariable(A::AbstractVariable,indices...) - var = nothing - if hasproperty(A,:var) - if hasmethod(SubVariable,Tuple{typeof(A.var),typeof.(indices)...}) - var = SubVariable(A.var,indices...) - end - end - - T = eltype(A) - N = length(size_getindex(A,indices...)) - return SubVariable{T,N,typeof(A),typeof(indices),typeof(A.attrib),typeof(var)}( - A,indices,A.attrib,var) +materialize(v::SubVariable) = parent(v)[sub_indices(v)] +sub_indices(v::SubVariable) = DiskArrays.subarray(v).indices + +function map_indices(parent_var::AbstractVariable, selected_var::AbstractVariable, + indices_subvariable) + + dims_selected = dimnames(selected_var) + dims_var= dimnames(parent_var) + dim_mapping = [findfirst( x-> x==d, dims_var) for d in dims_selected] + + indices_selected = indices_subvariable[dim_mapping] + return indices_selected end -SubVariable(A::AbstractVariable{T,N}) where T where N = SubVariable(A,ntuple(i -> :,N)...) - -# recursive calls so that the compiler can infer the types via inline-ing -# and constant propagation -_subsub(indices,i,l) = indices -_subsub(indices,i,l,ip,rest...) = _subsub((indices...,ip[i[l]]),i,l+1,rest...) -_subsub(indices,i,l,ip::Number,rest...) = _subsub((indices...,ip),i,l,rest...) -_subsub(indices,i,l,ip::Colon,rest...) = _subsub((indices...,i[l]),i,l+1,rest...) -#= - j = subsub(parentindices,indices) +## getting the related var also returns a SubVariable +function Base.getindex(sub_var::SubVariable,n::Union{CFStdName,SymbolOrString}) + parent_var = parent(sub_var) + selected_var = parent_var[n] -Computed the tuple of indices `j` so that -`A[parentindices...][indices...] = A[j...]` for any array `A` and any tuple of -valid indices `parentindices` and `indices` -=# -subsub(parentindices,indices) = _subsub((),indices,1,parentindices...) - -materialize(v::SubVariable) = parent(v)[v.indices...] - -""" -collect always returns an array. -Even if the result of the indexing is a scalar, it is wrapped -into a zero-dimensional array. -""" -function collect(v::SubVariable{T,N}) where T where N - if N == 0 - A = Array{T,0}(undef,()) - A[] = parent(v)[v.indices...] - return A - else - return parent(v)[v.indices...] - end -end - -Base.Array(v::SubVariable) = collect(v) - -function Base.view(v::SubVariable,indices::Union{<:Integer,Colon,AbstractVector{<:Integer}}...) - sub_indices = subsub(v.indices,indices) - SubVariable(parent(v),sub_indices...) + indices_selected = map_indices(parent_var, selected_var, sub_indices(sub_var)) + return view(selected_var, indices_selected...) end """ @@ -96,27 +91,17 @@ close(ds) ``` """ -Base.view(v::AbstractVariable,indices::Union{<:Integer,Colon,AbstractVector{<:Integer}}...) = SubVariable(v,indices...) -Base.view(v::SubVariable,indices::CartesianIndex) = view(v,indices.I...) -Base.view(v::SubVariable,indices::CartesianIndices) = view(v,indices.indices...) - -Base.getindex(v::SubVariable,indices::Union{Int,Colon,AbstractRange{<:Integer}}...) = materialize(view(v,indices...)) - -Base.getindex(v::SubVariable,indices::CartesianIndex) = getindex(v,indices.I...) -Base.getindex(v::SubVariable,indices::CartesianIndices) = - getindex(v,indices.indices...) - -function Base.setindex!(v::SubVariable,data,indices...) - sub_indices = subsub(v.indices,indices) - parent(v)[sub_indices...] = data +function Base.view(a::AbstractVariable,i...) + i2 = DiskArrays._replace_colon.(size(a), i) # TODO improve + return SubVariable(SubArray(a, i2)) end -Base.setindex!(v::SubVariable,data,indices::CartesianIndex) = - setindex!(v,data,indices.I...) -Base.setindex!(v::SubVariable,data,indices::CartesianIndices) = - setindex!(v,data,indices.indices...) - +# copied from https://github.com/JuliaIO/DiskArrays.jl/blob/6522ba32759f81945396890ebba5525d33342244/src/subarray.jl#L28 +# this is done to not depend on DiskArray Internals. +_replace_colon(s, ::Colon) = Base.OneTo(s) +_replace_colon(s, r) = r +Base.view(a::AbstractVariable, i::CartesianIndices) = view(a, i.indices...) dimnames(ds::SubDataset) = dimnames(ds.ds) defDim(ds::SubDataset,name::SymbolOrString,len) = defDim(ds.ds,name,len) @@ -176,7 +161,7 @@ groupname(ds::SubDataset) = groupname(ds.ds) function dataset(v::SubVariable) - indices = (;((Symbol(d),i) for (d,i) in zip(dimnames(parent(v)),v.indices))...) + indices = (;((Symbol(d),i) for (d,i) in zip(dimnames(parent(v)), sub_indices(v)))...) return SubDataset(dataset(parent(v)),indices) end diff --git a/src/types.jl b/src/types.jl index 7211d54..a381891 100644 --- a/src/types.jl +++ b/src/types.jl @@ -10,7 +10,7 @@ NetCDF or GRIB file) A data set `ds` of a type derived from `AbstractDataset` should implemented at minimum: -* `Base.key(ds)`: return a list of variable names as strings +* `varnames(ds)`: return a list of variable names as strings * `variable(ds,varname::String)`: return an array-like data structure (derived from `AbstractVariable`) of the variables corresponding to `varname`. This array-like data structure should follow the CF semantics. * `dimnames(ds)`: should be an iterable with all dimension names in the data set `ds` * `dim(ds,name)`: dimension value corresponding to name @@ -33,13 +33,14 @@ end """ -`AbstractVariable{T,N}` is a subclass of `AbstractArray{T, N}`. A variable `v` of a type derived from `AbstractVariable` should implement: +`AbstractVariable{T,N}` is a subclass of `DiskArrays.AbstractDiskArray{T,N}` which is a subclass of `AbstractArray{T, N}`. +A variable `v` of a type derived from `AbstractVariable` should implement: * `name(v)`: should be the name of variable within the data set * `dimnames(v)`: should be a iterable data structure with all dimension names of the variable `v` * `dataset(v)`: the parent dataset containing `v` * `Base.size(v)`: the size of the variable -* `Base.getindex(v,indices...)`: get the data of `v` at the provided indices +* `DiskArrays.readblock!(v, aout, indices...)`: get the data of `v` at the provided indices. See [DiskArrays.jl readme](https://github.com/JuliaIO/DiskArrays.jl) for more details. Optionally a variable can have attributes: @@ -48,10 +49,10 @@ Optionally a variable can have attributes: For a writable dataset, one should also implement: * `defAttrib`: define a attribute -* `Base.setindex!(v,data,indices...)`: set the data in `v` at the provided indices +* `DiskArrays.writeblock!(v,ain,indices...)`: set the `ain` in `v` at the provided indices. See [DiskArrays.jl readme](https://github.com/JuliaIO/DiskArrays.jl) for more details. """ -abstract type AbstractVariable{T,N} <: AbstractArray{T, N} +abstract type AbstractVariable{T,N} <: AbstractDiskArray{T, N} end @@ -155,12 +156,8 @@ end # view of subsets -struct SubVariable{T,N,TA,TI,TAttrib,TV} <: AbstractVariable{T,N} - parent::TA - indices::TI - attrib::TAttrib - # unpacked variable - var::TV +struct SubVariable{T,N,P,I,L} <: AbstractSubDiskArray{T,N,P,I,L} + v::SubArray{T,N,P,I,L} end struct SubDataset{TD,TI,TA,TG} <: AbstractDataset diff --git a/src/variable.jl b/src/variable.jl index 14b348f..2c0732f 100644 --- a/src/variable.jl +++ b/src/variable.jl @@ -203,7 +203,7 @@ Defines and return the variable in the data set `ds` copied from the variable `src`. The dimension name, attributes and data are copied from `src` as well as the variable name (unless provide by `name`). """ -function defVar(dest::AbstractDataset,varname::SymbolOrString,srcvar::AbstractVariable; kwargs...) +function defVar(dest::AbstractDataset,varname::SymbolOrString,srcvar::Union{AbstractVariable, SubVariable}; kwargs...) _ignore_checksum = false if haskey(kwargs,:checksum) _ignore_checksum = kwargs[:checksum] === nothing @@ -262,7 +262,7 @@ function defVar(dest::AbstractDataset,varname::SymbolOrString,srcvar::AbstractVa end -function defVar(dest::AbstractDataset,srcvar::AbstractVariable; kwargs...) +function defVar(dest::AbstractDataset,srcvar::Union{AbstractVariable,SubVariable}; kwargs...) defVar(dest,name(srcvar),srcvar; kwargs...) end @@ -310,6 +310,20 @@ function Base.show(io::IO,v::AbstractVariable) end +function DiskArrays.haschunks(v::AbstractVariable) + storage, chunksizes = chunking(v) + if storage == :contiguous + return DiskArrays.Unchunked() + else + return DiskArrays.Chunked() + end +end + +function DiskArrays.eachchunk(v::AbstractVariable) + storage,chunksizes = chunking(v) + + return DiskArrays.GridChunks(v, replace(chunksizes,0=>1)) +end chunking(v::AbstractVariable) = (:contiguous,size(v)) chunking(v::AbstractVariable,storage,chunksizes) = nothing diff --git a/test/Project.toml b/test/Project.toml index 8c42314..542a8ae 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -3,9 +3,8 @@ Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" CFTime = "179af706-886a-5703-950a-314cd64e0468" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" -GRIBDatasets = "82be9cdb-ee19-4151-bdb3-b400788d9abc" +DiskArrays = "3c3547ce-8d99-4f5e-a174-61eb10b00ae3" IntervalSets = "8197267c-284f-5f27-9208-e0e47529a953" -NCDatasets = "85f8d34a-cbdd-5861-8df4-14fed0d494ab" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/runtests.jl b/test/runtests.jl index 60d65bb..6148122 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -11,6 +11,10 @@ using Test include("test_copy.jl") end +@testset "DiskArray interface" begin + include("test_disk_array.jl") +end + @testset "CF conventions" begin include("test_cfconventions.jl") include("test_coord.jl") @@ -30,7 +34,7 @@ end include("test_multifile_select.jl") end -@testset "groupby" begin +@testset "groupby" begin include("test_groupby.jl") include("test_rolling.jl") end diff --git a/test/test_cfconventions.jl b/test/test_cfconventions.jl index 379e51b..95ef453 100644 --- a/test/test_cfconventions.jl +++ b/test/test_cfconventions.jl @@ -58,6 +58,12 @@ ds = TDS(fname,"r") @test_throws ErrorException filter(ds["DEPTH"],:,accepted_status_flags = ["good_data","probably_good_data"]) +# test also for views +lat_view = view(ds["LAT"],2:3) +@test name(ancillaryvariables(lat_view, "status_flag")) == "QC_LAT" +@test isequal(filter(lat_view,:,accepted_status_flags = ["good_data","probably_good_data"]), + [2.,missing]) + close(ds) # query by CF Standard Name @@ -122,11 +128,14 @@ height = ds[CF"height"] @test height[CF"longitude"][:] == 1:10 @test height[CF"latitude"][:] == 1:11 +height_sub = @view ds[CF"height"][2:3,2:3] +@test height_sub[:,:] == data[2:3,2:3] +@test height_sub[CF"longitude"][:] == 2:3 +@test height_sub[CF"latitude"][:] == 2:3 -height = @view ds[CF"height"][2:3,2:3] -@test height[:,:] == data[2:3,2:3] -@test height[CF"longitude"][:] == 2:3 -@test height[CF"latitude"][:] == 2:3 +@test size(height_sub) == (2,2) +@test size(height_sub[CF"longitude"]) == (2,) +@test size(height_sub[CF"latitude"]) == (2,) @test_throws KeyError ds[CF"temperature"] diff --git a/test/test_coord.jl b/test/test_coord.jl index 8c1b51d..0551a6c 100644 --- a/test/test_coord.jl +++ b/test/test_coord.jl @@ -6,7 +6,7 @@ fname = tempname() TDS = MemoryDataset for define_standard_name in [false,true] - + TDS(fname,"c") do ds # Dimensions @@ -72,5 +72,7 @@ for define_standard_name in [false,true] @test name(coord(ds["zeta"],"longitude")) == "lon_rho" @test name(coord(ds["ubar"],"longitude")) == "lon_u" @test coord(ds["ubar"],"foobar") == nothing + var_view = view(ds["zeta"],1:2,:,:) + @test name(coord(var_view,"longitude")) == "lon_rho" close(ds) end diff --git a/test/test_disk_array.jl b/test/test_disk_array.jl new file mode 100644 index 0000000..e131a48 --- /dev/null +++ b/test/test_disk_array.jl @@ -0,0 +1,78 @@ +using Test +import DiskArrays +using DataStructures +using CommonDataModel +using CommonDataModel: + MemoryDataset, defVar + + +@test CommonDataModel.AbstractVariable <: AbstractArray +@test CommonDataModel.CFVariable <: CommonDataModel.AbstractVariable +@test CommonDataModel.AbstractVariable <: DiskArrays.AbstractDiskArray +@test CommonDataModel.SubVariable <: DiskArrays.AbstractDiskArray + +# create test data +TDS = MemoryDataset +fname = tempname() + +ds = TDS(fname,"c", attrib = OrderedDict( + "title" => "title", +)); + +# Dimensions +ds.dim["lon"] = 10 +ds.dim["lat"] = 11 + +# Declare variables + +nclon = defVar(ds,"lon", Float64, ("lon",), attrib = OrderedDict( + "long_name" => "Longitude", + "standard_name" => "longitude", + "units" => "degrees_east", +)) + +nclat = defVar(ds,"lat", Float64, ("lat",), attrib = OrderedDict( + "long_name" => "Latitude", + "standard_name" => "latitude", + "units" => "degrees_north", +)) + +ncvar = defVar(ds,"bat", Float32, ("lon", "lat"), attrib = OrderedDict( + "long_name" => "elevation above sea level", + "standard_name" => "height", + "units" => "meters", + "_FillValue" => Float32(9.96921e36), +)) + +ncscalar = defVar(ds,"scalar", 12, ()) + +# Define variables + +data = rand(Float32,10,11) + +nclon[:] = 1:10 +nclat[:] = 1:11 +ncvar[:,:] = data + +# test broadcast on variable +@test nclon isa DiskArrays.AbstractDiskArray +in_lon = nclon .< 5 +@test in_lon isa DiskArrays.BroadcastDiskArray +@test ncvar[in_lon,:] == data[in_lon,:] + +# test view of variable +sub_var = view(ncvar,3:7,1:2:11) +@test sub_var isa DiskArrays.AbstractSubDiskArray +@test (sub_var .+ 2) isa DiskArrays.BroadcastDiskArray + +# test getting related variable of view +@test sub_var["lat"] isa DiskArrays.AbstractSubDiskArray +@test sub_var["lat"][:] == nclat[1:2:11] +@test sub_var["lon"][:] == nclat[3:7] + +# test write to view. +sub_var["lon"][1:2] .= 17 +@test all(nclon[3:4] .== 17) + +# test attrib of view +@test sub_var.attrib == ncvar.attrib diff --git a/test/test_scaling.jl b/test/test_scaling.jl index a19a899..c01499e 100644 --- a/test/test_scaling.jl +++ b/test/test_scaling.jl @@ -214,15 +214,18 @@ close(ds) add_offset = -1.0 scale_factor = 0.1 -attributes = Dict("add_offset" => add_offset,"scale_factor" => scale_factor) - - for Tbase in (UInt8, Int8, Float32, Float64, Int64, Char, String) for _maskingvalue in (NaN,NaN32,missing,42,nothing,mysingleton) - local fname, data, fv, ds, ncv, T, varattrib + local fname, data, fv, ds, ncv, T, varattrib, attributes fname = tempname() T = promote_type(Tbase,typeof(_maskingvalue)) + if T == Float32 + attributes = Dict("add_offset" => T(add_offset),"scale_factor" => T(scale_factor)) + else + attributes = Dict("add_offset" => add_offset,"scale_factor" => scale_factor) + end + data = Matrix{T}(undef,(3,4)) if Tbase == String @@ -238,7 +241,11 @@ for Tbase in (UInt8, Int8, Float32, Float64, Int64, Char, String) varattrib = if Tbase <: Number - attributes + if _maskingvalue isa Float32 + Dict("add_offset" => Float32(add_offset),"scale_factor" => Float32(scale_factor)) + else + Dict("add_offset" => Float64(add_offset),"scale_factor" => Float64(scale_factor)) + end else [] end @@ -246,10 +253,12 @@ for Tbase in (UInt8, Int8, Float32, Float64, Int64, Char, String) ncv = defVar(ds,"data",data,("lon","lat"),fillvalue = fv, attrib = varattrib) @test CDM.maskingvalue(ds) === _maskingvalue @test ncv.var[2,2] == fv - @test ncv[2,2] === _maskingvalue + converted_masking_value = + eltype(ncv) <: Number ? eltype(ncv)(_maskingvalue) : _maskingvalue + @test ncv[2,2] === converted_masking_value if Tbase <: Number @test ncv.var[1,1] * scale_factor + add_offset ≈ data[1,1] - @test ncv[1,1] ≈ data[1,1] + @test isapprox(ncv[1,1], data[1,1], rtol=10E-8, atol=10E-6) else @test ncv[1,1] == data[1,1] end diff --git a/test/test_subvariable.jl b/test/test_subvariable.jl index 601db15..8b48d5d 100644 --- a/test/test_subvariable.jl +++ b/test/test_subvariable.jl @@ -1,30 +1,14 @@ #import NCDatasets -using CommonDataModel: subsub, SubDataset, SubVariable, chunking, deflate, path, @select, varnames -using DataStructures +using CommonDataModel: SubDataset, SubVariable, chunking, deflate, path, @select, varnames +using CommonDataModel: MemoryDataset, defVar, groupby, select +using DataStructures, Dates using Test #TDS = NCDatasets.NCDataset TDS = MemoryDataset -@test subsub((1:10,),(2:10,)) == (2:10,) -@test subsub((2:10,),(2:9,)) == (3:10,) -@test subsub((2:2:10,),(2:3,)) == (4:2:6,) -@test subsub((:,),(2:4,)) == (2:4,) -@test subsub((2:2:10,),(3,)) == (6,) -@test subsub((2:2:10,:),(2:3,2:4)) == (4:2:6,2:4) -@test subsub((2:2:10,:),(2:3,2)) == (4:2:6,2) -@test subsub((1,:),(2:3,)) == (1,2:3) -@test subsub((1,:),(1,)) == (1,1) - -A = rand(10,10) -ip = (2:2:10,:) -i = (2:3,2:4) -j = subsub(ip,i) -A[ip...][i...] == A[j...] - - fname = tempname() @@ -73,9 +57,13 @@ ncvar_view = view(ncvar,1:3,1:4) ncvar_view.attrib["foo"] = "bar" @test ncvar_view.attrib["foo"] == "bar" @test ncvar.attrib["foo"] == "bar" -@test SubVariable(ncvar)[:,:] == data +@test Array(view(ncvar,:,:)) == data @test ncscalar[] == 12 +coords_names = CommonDataModel.coordinate_names(ncvar_view) +@test :lat in coords_names +@test :lon in coords_names + @test collect(view(ds,lon=1:3)["scalar"])[1] == 12 @test Array(view(ncvar,1:3,1:4)) == Array(view(data,1:3,1:4)) @@ -219,3 +207,48 @@ ss = ncsst3["time"] @test ss[1] == DateTime(2000,1,1) @test ndims(view(ncsst,:,1,1)) == 1 close(ds) + + +## test groupby +fname = tempname() +ds = TDS(fname,"c") + +nclon = defVar(ds,"lon", 1:7, ("lon",)) +nclat = defVar(ds,"lat", 1:10, ("lat",)) +times_array = collect(DateTime(2002,1,1):Day(1):DateTime(2002,3,20)) + +nctime = defVar(ds,"time", times_array, ("time",)) +ncsst = defVar(ds,"sst", ones(7,10,length(times_array)), ("lon", "lat", "time")) + +sst_view = view(ds["sst"],1:5,:1:4,:) +gv = groupby(sst_view,:time => Dates.Month) +sum_per_month = sum(gv) + +@test size(sum_per_month) == (5,4,3) +@test all(sum_per_month[:,:,1] .≈ 31.0) +@test all(sum_per_month[:,:,2] .≈ 28.0) +@test all(sum_per_month[:,:,3] .≈ 20.0) + +sst_view_time = view(ds["sst"],:,:,11:(length(times_array)-5)) +gv = groupby(sst_view_time,:time => Dates.Month) +sum_per_month = sum(gv) +@test all(sum_per_month[:,:,1] .≈ 21.0) +@test all(sum_per_month[:,:,2] .≈ 28.0) +@test all(sum_per_month[:,:,3] .≈ 15.0) + + +## test select +sst2_data = ones(7,10,length(times_array)) # setup test var +sst2_data[:,:,1] .*= collect(1:7) +sst2_data[:,:,2] .*= collect(1:10)' +ncsst2 = defVar(ds,"sst2", sst2_data, ("lon", "lat", "time")) + +sst_view = view(ds["sst2"],2:7,1:2:10,1:8) + +v = select(sst_view, + :lon => lon -> 1 <= lon <= 5, + :lat => lat -> 2 <= lat <= 7) + +@test size(v) == (4,3,8) +@test all(v[:,1,1] .≈ 2:5) +@test all(v[1,:,2] .≈ [3,5,7])