Skip to content

Commit d7fd921

Browse files
meggart and lazarusA authored
Export open_mfdataset (#470)
* upgrade and export open_mfdataset
* use newest YAXArrayBase
* Fix unrelated test
* add dependabot
* Apply suggestions from code review
* fix locally

---------

Co-authored-by: Lazaro Alonso <[email protected]>
1 parent f4253c2 commit d7fd921

File tree

4 files changed

+106
-39
lines changed

4 files changed

+106
-39
lines changed

.github/dependabot.yml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
2+
version: 2
3+
updates:
4+
- package-ecosystem: "github-actions"
5+
directory: "/" # Location of package manifests
6+
schedule:
7+
interval: "weekly"

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,5 +54,5 @@ Statistics = "1"
5454
StatsBase = "0.32, 0.33, 0.34"
5555
Tables = "0.2, 1.0"
5656
WeightedOnlineStats = "0.3, 0.4, 0.5, 0.6"
57-
YAXArrayBase = "0.6, 0.7"
57+
YAXArrayBase = "0.7.5"
5858
julia = "1.9"

src/DatasetAPI/Datasets.jl

Lines changed: 97 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ using DiskArrays: DiskArrays, GridChunks
1313
using Glob: glob
1414
using DimensionalData: DimensionalData as DD
1515

16-
export Dataset, Cube, open_dataset, to_dataset, savecube, savedataset
16+
export Dataset, Cube, open_dataset, to_dataset, savecube, savedataset, open_mfdataset
1717

1818
"""
1919
Dataset object which stores an `OrderedDict` of YAXArrays with Symbol keys.
@@ -253,7 +253,7 @@ function collectdims(g)
253253
varnames = get_varnames(g)
254254
foreach(varnames) do k
255255
d = get_var_dims(g, k)
256-
v = get_var_handle(g, k)
256+
v = get_var_handle(g, k, persist=false)
257257
for (len, dname) in zip(size(v), d)
258258
if !occursin("bnd", dname) && !occursin("bounds", dname)
259259
datts = if dname in varnames
@@ -277,7 +277,7 @@ function toaxis(dimname, g, offs, len)
277277
if !haskey(g, dimname)
278278
return DD.rebuild(DD.name2dim(axname), 1:len)
279279
end
280-
ar = get_var_handle(g, dimname)
280+
ar = get_var_handle(g, dimname, persist=false)
281281
aratts = get_var_attrs(g, dimname)
282282
if match(r"^(days)|(hours)|(seconds)|(months) since",lowercase(get(aratts,"units",""))) !== nothing
283283
tsteps = try
@@ -337,6 +337,63 @@ open_mfdataset(g::AbstractString; kwargs...) = open_mfdataset(_glob(g); kwargs..
337337
open_mfdataset(g::Vector{<:AbstractString}; kwargs...) =
338338
merge_datasets(map(i -> open_dataset(i; kwargs...), g))
339339

340+
function merge_new_axis(alldatasets, firstcube,var,mergedim)
341+
newdim = DD.rebuild(mergedim,1:length(alldatasets))
342+
alldiskarrays = map(ds->ds.cubes[var].data,alldatasets).data
343+
newda = diskstack(alldiskarrays)
344+
newdims = (DD.dim(firstcube)...,newdim)
345+
YAXArray(newdims,newda,deepcopy(firstcube.properties))
346+
end
347+
function merge_existing_axis(alldatasets,firstcube,var,mergedim)
348+
allaxvals = map(ds->DD.dims(ds.cubes[var],mergedim).val,alldatasets)
349+
newaxvals = reduce(vcat,allaxvals)
350+
newdim = DD.rebuild(mergedim,newaxvals)
351+
alldiskarrays = map(ds->ds.cubes[var].data,alldatasets)
352+
istack = DD.dimnum(firstcube,mergedim)
353+
newshape = ntuple(i->i!=istack ? 1 : length(alldiskarrays),ndims(firstcube))
354+
newda = DiskArrays.ConcatDiskArray(reshape(alldiskarrays,newshape))
355+
newdims = Base.setindex(firstcube.axes,newdim,istack)
356+
YAXArray(newdims,newda,deepcopy(firstcube.properties))
357+
end
358+
359+
"""
360+
open_mfdataset(files::DD.DimVector{<:AbstractString}; kwargs...)
361+
362+
Opens and concatenates a list of dataset paths along the dimension specified in `files`.
363+
This method can be used when the generic glob-based version of open_mfdataset fails
364+
or is too slow.
365+
For example, to concatenate a list of annual NetCDF files along the `Ti` dimension,
366+
one can use:
367+
368+
````julia
369+
files = ["1990.nc","1991.nc","1992.nc"]
370+
open_mfdataset(DD.DimArray(files,DD.Ti()))
371+
````
372+
373+
Alternatively, if the dimension to concatenate along does not exist yet, the
374+
dimension provided in the input arg is used:
375+
376+
````julia
377+
files = ["a.nc","b.nc","c.nc"]
378+
open_mfdataset(DD.DimArray(files,DD.Dim{:NewDim}(["a","b","c"])))
379+
````
380+
"""
381+
function open_mfdataset(vec::DD.DimVector{<:AbstractString};kwargs...)
382+
alldatasets = open_dataset.(vec;kwargs...);
383+
fi = first(alldatasets)
384+
mergedim = DD.dims(alldatasets) |> only
385+
ars = map(collect(keys(fi.cubes))) do var
386+
cfi = fi.cubes[var]
387+
mergedar = if DD.dims(cfi,mergedim) !== nothing
388+
merge_existing_axis(alldatasets,cfi,var,mergedim)
389+
else
390+
merge_new_axis(alldatasets,cfi,var,mergedim)
391+
end
392+
var => mergedar
393+
end
394+
Dataset(;ars...)
395+
end
396+
340397

341398
"""
342399
open_dataset(g; driver=:all)
@@ -345,44 +402,46 @@ Open the dataset at `g` with the given `driver`.
345402
The default driver will search for available drivers and tries to detect the useable driver from the filename extension.
346403
"""
347404
function open_dataset(g; driver = :all)
348-
g = YAXArrayBase.to_dataset(g, driver = driver)
349-
isempty(get_varnames(g)) && throw(ArgumentError("Group does not contain datasets."))
350-
dimlist = collectdims(g)
351-
dnames = string.(keys(dimlist))
352-
varlist = filter(get_varnames(g)) do vn
353-
upname = uppercase(vn)
354-
!occursin("BNDS", upname) &&
355-
!occursin("BOUNDS", upname) &&
356-
!any(i -> isequal(upname, uppercase(i)), dnames)
357-
end
358-
allcubes = OrderedDict{Symbol,YAXArray}()
359-
for vname in varlist
360-
vardims = get_var_dims(g, vname)
361-
iax = tuple(collect(dimlist[vd].ax for vd in vardims)...)
362-
offs = [dimlist[vd].offs for vd in vardims]
363-
subs = if all(iszero, offs)
364-
nothing
365-
else
366-
ntuple(i -> (offs[i]+1):(offs[i]+length(iax[i])), length(offs))
367-
end
368-
ar = get_var_handle(g, vname)
369-
att = get_var_attrs(g, vname)
370-
if subs !== nothing
371-
ar = view(ar, subs...)
405+
dsopen = YAXArrayBase.to_dataset(g, driver = driver)
406+
YAXArrayBase.open_dataset_handle(dsopen) do g
407+
isempty(get_varnames(g)) && throw(ArgumentError("Group does not contain datasets."))
408+
dimlist = collectdims(g)
409+
dnames = string.(keys(dimlist))
410+
varlist = filter(get_varnames(g)) do vn
411+
upname = uppercase(vn)
412+
!occursin("BNDS", upname) &&
413+
!occursin("BOUNDS", upname) &&
414+
!any(i -> isequal(upname, uppercase(i)), dnames)
372415
end
373-
if !haskey(att, "name")
374-
att["name"] = vname
375-
end
376-
atts = propfromattr(att)
377-
if any(in(keys(atts)), ["missing_value", "scale_factor", "add_offset"])
378-
ar = CFDiskArray(ar, atts)
416+
allcubes = OrderedDict{Symbol,YAXArray}()
417+
for vname in varlist
418+
vardims = get_var_dims(g, vname)
419+
iax = tuple(collect(dimlist[vd].ax for vd in vardims)...)
420+
offs = [dimlist[vd].offs for vd in vardims]
421+
subs = if all(iszero, offs)
422+
nothing
423+
else
424+
ntuple(i -> (offs[i]+1):(offs[i]+length(iax[i])), length(offs))
425+
end
426+
ar = get_var_handle(g, vname,persist=true)
427+
att = get_var_attrs(g, vname)
428+
if subs !== nothing
429+
ar = view(ar, subs...)
430+
end
431+
if !haskey(att, "name")
432+
att["name"] = vname
433+
end
434+
atts = propfromattr(att)
435+
if any(in(keys(atts)), ["missing_value", "scale_factor", "add_offset"])
436+
ar = CFDiskArray(ar, atts)
437+
end
438+
allcubes[Symbol(vname)] = YAXArray(iax, ar, atts, cleaner = CleanMe[])
379439
end
380-
allcubes[Symbol(vname)] = YAXArray(iax, ar, atts, cleaner = CleanMe[])
440+
gatts = YAXArrayBase.get_global_attrs(g)
441+
gatts = Dict{String,Any}(string(k)=>v for (k,v) in gatts)
442+
sdimlist = Dict(DD.name(v.ax) => v.ax for (k, v) in dimlist)
443+
Dataset(allcubes, sdimlist,gatts)
381444
end
382-
gatts = YAXArrayBase.get_global_attrs(g)
383-
gatts = Dict{String,Any}(string(k)=>v for (k,v) in gatts)
384-
sdimlist = Dict(DD.name(v.ax) => v.ax for (k, v) in dimlist)
385-
Dataset(allcubes, sdimlist,gatts)
386445
end
387446
#Base.getindex(x::Dataset; kwargs...) = subsetcube(x; kwargs...)
388447
YAXDataset(; kwargs...) = Dataset(YAXArrays.YAXDefaults.cubedir[]; kwargs...)

test/DAT/mapcube.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@
6262
end
6363

6464
@testset "Error shown in parallel" begin
65+
import Zarr
6566
x,y,z = X(1:4), Y(1:5), Z(1:6)
6667
a1 = YAXArray((x,y,z), rand(4,5,6))
6768
indims = InDims("x")

0 commit comments

Comments
 (0)