Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 17 additions & 15 deletions src/cat.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

"""
ConcatDiskArray <: AbstractDiskArray

Expand All @@ -15,7 +14,7 @@ Returned from `cat` on disk arrays.

It is also useful on its own as it can easily concatenate an array of disk arrays.
"""
struct ConcatDiskArray{T,N,P,C,HC,ID} <: AbstractDiskArray{T,N}
struct ConcatDiskArray{T,N,P,C,HC, ID} <: AbstractDiskArray{T,N}
parents::P
startinds::NTuple{N,Vector{Int}}
size::NTuple{N,Int}
Expand All @@ -24,23 +23,25 @@ struct ConcatDiskArray{T,N,P,C,HC,ID} <: AbstractDiskArray{T,N}
innerdims::Val{ID}
end

function ConcatDiskArray(arrays::AbstractArray{Union{<:AbstractArray,Missing}})
function ConcatDiskArray(arrays::AbstractArray{Union{<:AbstractArray,Missing}}; fill=missing)
et = Base.nonmissingtype(eltype(arrays))
T = Union{Missing,eltype(et)}
T = promotetype(typeof(fill), eltype(et))
Copy link
Collaborator

@rafaqz rafaqz Apr 12, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought the passed in array would still contain either arrays or missing values, but we would just return fill for those sections, so fill would need to be a struct field.

So what we have here is kinda half way. Probably we should either drop the fill keyword (and just rely on the array values) or keep it as a struct field and use missing in the array.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I prefer to drop the fill keyword. I actually thought I had already done so.
With my approach we can have different fill values in different parts of the concatenated array.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But is that actually useful? I can't imagine a real use case for multiple fill values. And putting missing in the array seems clearer for representing missing arrays.

The missing value used in the array of arrays doesn't have to be coupled to the fill value.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree that it might be a bit dangerous to allow arbitrary values to represent missings. OTOH, one never knows when this might become useful for some reason. I am OK with both ways and would currently slightly prefer @felixcremer's solution because there is already an implementation. If we want to be a bit more on the safe side, we could introduce a small wrapper type like

struct MissingTile{F}
    fillvalue::F
end

that explicitly marks missing entries to be concatenated and how they should be filled. The main reason why the current solution might be problematic is for DiskArrays with arrays as eltype, but maybe this is overthinking things a bit too much.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
T = promotetype(typeof(fill), eltype(et))
T = Union{Missing,eltype(et)}

Copy link
Collaborator

@rafaqz rafaqz Apr 14, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't you have to read all the fill values in the array and promote them? Missing is not necessarily in the type?

N = ndims(arrays)
M = ndims(et)
_ConcatDiskArray(arrays, T, Val(N), Val(M))
end

"""
    infer_eltypes(arrays)

Fold over the entries of `arrays` and return a tuple `(M, T)`.

`M` is the common `ndims` of all entries that are `AbstractArray`s, or `-1` if no
entry is an array. `T` is the `promote_type` of the element types of the array
entries together with the types of all non-array entries (for example `missing`
or a scalar fill value), starting from `Union{}`.

# Throws
- `ArgumentError` if two array entries differ in `ndims`.
"""
function infer_eltypes(arrays)
    return foldl(arrays; init=(-1, Union{})) do (M, T), a
        if a isa AbstractArray
            # The first array fixes the dimensionality; later ones must agree.
            M == -1 || ndims(a) == M ||
                throw(ArgumentError("All arrays to concatenate must have equal ndims"))
            (ndims(a), promote_type(eltype(a), T))
        else
            # A non-array entry stands in for a whole tile, so its own type
            # (e.g. `Missing`, `Int`) has to be representable in the result.
            (M, promote_type(typeof(a), T))
        end
    end
end

function ConcatDiskArray(arrays::AbstractArray{<:AbstractArray})
N = ndims(arrays)
T = eltype(eltype(arrays))
Expand Down Expand Up @@ -90,7 +91,7 @@ function arraysize_and_startinds(arrays1)
sizes = map(i -> zeros(Int, i), size(arrays1))
for i in CartesianIndices(arrays1)
ai = arrays1[i]
ismissing(ai) && continue
!isa(ai, AbstractArray) && continue
sizecur = extenddims(size(ai), size(arrays1), 1)
foreach(sizecur, i.I, sizes) do si, ind, sizeall
if sizeall[ind] == 0
Expand Down Expand Up @@ -123,10 +124,11 @@ function readblock!(a::ConcatDiskArray, aout, inds::AbstractUnitRange...)
# Find affected blocks and indices in blocks
_concat_diskarray_block_io(a, inds...) do outer_range, array_range, I
vout = view(aout, outer_range...)
if ismissing(I)
vout .= missing
else
#@show size(vout)
if I isa CartesianIndex
readblock!(a.parents[I], vout, array_range...)
else
vout .= I
end
end
end
Expand Down Expand Up @@ -170,10 +172,10 @@ function _concat_diskarray_block_io(f, a::ConcatDiskArray, inds...)
#Shorten array range to shape of actual array
array_range = ntuple(j -> array_range[j], ID)
outer_range = fix_outerrangeshape(outer_range, array_range)
if ismissing(myar)
f(outer_range, array_range, missing)
else
if myar isa AbstractArray
f(outer_range, array_range, cI)
else
f(outer_range, array_range, myar)
end
end
end
Expand All @@ -189,13 +191,13 @@ function concat_chunksize(parents)
newchunks = map(s -> Vector{Union{RegularChunks,IrregularChunks}}(undef, s), size(parents))
for i in CartesianIndices(parents)
array = parents[i]
ismissing(array) && continue
!isa(array,AbstractArray) && continue
chunks = eachchunk(array)
foreach(chunks.chunks, i.I, newchunks) do c, ind, newc
if !isassigned(newc, ind)
newc[ind] = c
elseif c != newc[ind]
throw(ArgumentError("Chunk sizes don't forma grid"))
throw(ArgumentError("Chunk sizes don't form a grid"))
end
end
end
Expand Down
29 changes: 29 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -492,6 +492,34 @@ end
@test slic == Float64[1, 2, 3, 4, 1, 2, 3, 4]
end

@testset "Concat DiskArray with fill zero tiles" begin
    # 2x2 grid of tiles in which the lower-right tile is replaced by the scalar 0.
    topleft = zeros(Int, 3, 4)
    bottomleft = ones(Int, 2, 4)
    topright = fill(2, 3, 5)
    bottomright = fill(0, 2, 5)
    tilegrid = reshape([topleft, bottomleft, topright, 0], 2, 2)
    aconc = DiskArrays.ConcatDiskArray(tilegrid)
    # Reference result built with ordinary in-memory concatenation.
    expected = [topleft topright; bottomleft bottomright]
    @test all(isequal.(aconc[:, :], expected))
    @test all(isequal.(aconc[3:4, 4:6], expected[3:4, 4:6]))
    gridchunks = DiskArrays.eachchunk(aconc)
    @test gridchunks.chunks[1] == [1:3, 4:5]
    @test gridchunks.chunks[2] == [1:4, 5:9]

    # Stack 100 tiles along the third dimension; every fifth tile and the very
    # first one are replaced by the scalar 0.
    tile = ones(100, 50)
    stack = [rem(idx.I[3], 5) == 0 ? 0 : tile for idx in CartesianIndices((1, 1, 100))]
    stack[1] = 0
    a_conc = DiskArrays.ConcatDiskArray(stack)
    stackchunks = eachchunk(a_conc)
    @test stackchunks.chunks[1] == [1:100]
    @test stackchunks.chunks[2] == [1:50]
    @test stackchunks.chunks[3] === DiskArrays.RegularChunks(1, 0, 100)

    @test all(isequal.(a_conc[2, 2, 1:5], [0, 1.0, 1.0, 1.0, 0]))
    @test all(isequal.(a_conc[end, end, 95:100], [0, 1.0, 1.0, 1.0, 1.0, 0]))

end


@testset "Concat DiskArray with missing tiles" begin
a = zeros(Int, 3, 4)
b = ones(Int, 2, 4)
Expand All @@ -518,6 +546,7 @@ end
@test all(isequal.(a_conc[end, end, 95:100], [missing, 1.0, 1.0, 1.0, 1.0, missing]))

end

end

@testset "Broadcast with length 1 and 0 final dim" begin
Expand Down