Add ChunkTiledDiskArray abstract type (#179)

meggart · web-flow · commit 5da88b3a8af7 · 2024-07-26T13:18:52.000+02:00
* expose base type for tiled diskarrays

* change supertype of CachedDiskArray to ChunkTiledDiskArray

* bump version
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "DiskArrays"
 uuid = "3c3547ce-8d99-4f5e-a174-61eb10b00ae3"
 authors = ["Fabian Gans <fgans@bgc-jena.mpg.de>"]
-version = "0.4.4"
+version = "0.4.5"
 
 [deps]
 LRUCache = "8ac3fa9e-de4c-5943-b1dc-09c6b5f20637"
diff --git a/src/cached.jl b/src/cached.jl
@@ -12,7 +12,7 @@ Wrap some disk array `A` with a caching mechanism that will
 keep chunks up to a total of `maxsize` megabytes, dropping
 the least used chunks when `maxsize` is exceeded.
 """
-struct CachedDiskArray{T,N,A<:AbstractArray{T,N},C} <: AbstractDiskArray{T,N}
+struct CachedDiskArray{T,N,A<:AbstractArray{T,N},C} <: ChunkTiledDiskArray{T,N}
     parent::A
     cache::C
 end
@@ -23,33 +23,21 @@ end
 
 Base.parent(A::CachedDiskArray) = A.parent
 Base.size(A::CachedDiskArray) = size(parent(A))
-# These could be more efficient with memory in some cases, but this is simple
-readblock!(A::CachedDiskArray, data, I...) = _readblock_cached!(A, data, I...)
-readblock!(A::CachedDiskArray, data, I::AbstractVector...) = _readblock_cached!(A, data, I...)
 # TODO we need to invalidate caches when we write
 # writeblock!(A::CachedDiskArray, data, I...) = writeblock!(parent(A), data, I...)
 
 haschunks(A::CachedDiskArray) = haschunks(parent(A))
 eachchunk(A::CachedDiskArray) = eachchunk(parent(A))
-
-function _readblock_cached!(A::CachedDiskArray{T,N}, data, I...) where {T,N}
-    chunks = eachchunk(A)
-    chunk_inds = findchunk.(chunks.chunks, I)
-    data_offset = OffsetArray(data,map(i->first(i)-1,I)...)
-    foreach(CartesianIndices(chunk_inds)) do ci
-        chunkindex = ChunkIndex(ci,offset=true)
-        chunk = get!(A.cache, chunkindex) do
-            res = parent(A)[chunkindex]
-            res
-        end
-        inner_indices = map(axes(chunk),axes(data_offset)) do ax1, ax2
-            max(first(ax1),first(ax2)):min(last(ax1),last(ax2))
-        end
-        for ii in CartesianIndices(inner_indices)
-            data_offset[ii] = chunk[ii]
-        end
+function getchunk(A::CachedDiskArray, i::ChunkIndex)
+    # Per the `get!` contract, the do-block runs only when `i` is not cached yet.
+    return get!(A.cache, i) do
+        ranges = eachchunk(A)[i.I]
+        wrapchunk(parent(A)[ranges...], ranges)
     end
 end
+# Offset indices return the chunk as `getchunk` yields it; one-based return its `parent`.
+Base.getindex(A::CachedDiskArray, i::ChunkIndex{N,OffsetChunks}) where {N} = getchunk(A, i)
+Base.getindex(A::CachedDiskArray, i::ChunkIndex{N,OneBasedChunks}) where {N} = parent(getchunk(A, i))
 
 """
     cache(A::AbstractArray; maxsize=1000)
diff --git a/src/chunks.jl b/src/chunks.jl
@@ -272,8 +272,7 @@ haschunks(x) = Unchunked()
 
 struct OffsetChunks end
 struct OneBasedChunks end
-wrapchunk(::OneBasedChunks, x, _) = x
-wrapchunk(::OffsetChunks, x, inds) = OffsetArray(x, inds...)
+wrapchunk(x, inds) = OffsetArray(x, inds...)  # wrap `x` so it is indexed according to `inds`
 
 """
     ChunkIndex{N}
@@ -288,6 +287,9 @@ end
 function ChunkIndex(i::CartesianIndex; offset=false)
     return ChunkIndex(i, offset ? OffsetChunks() : OneBasedChunks())
 end
+"Return a `ChunkIndex` with the same position `i.I` but the `OneBasedChunks` chunk type."
+nooffset(i::ChunkIndex) = ChunkIndex(i.I, OneBasedChunks())
+
 ChunkIndex(i::Integer...; offset=false) = ChunkIndex(CartesianIndex(i); offset)
 
 """
@@ -336,6 +338,26 @@ function estimate_chunksize(s, si)
             return floor(Int, default_chunk_size[] * 1e6 / si / sbefore)
         end
     end
+    cs = clamp.(cs, 1, s)
     return GridChunks(s, cs)
 end
 
+
+# Supertype for disk arrays whose generic `readblock!` fetches chunks via `A[ChunkIndex]`.
+abstract type ChunkTiledDiskArray{T,N} <: AbstractDiskArray{T,N} end
+Base.size(a::ChunkTiledDiskArray) = arraysize_from_chunksize.(eachchunk(a).chunks)
+# Fill `data` chunk by chunk: fetch every chunk touched by `I` and copy its overlap.
+function DiskArrays.readblock!(A::ChunkTiledDiskArray{T,N}, data, I...) where {T,N}
+    touched = DiskArrays.findchunk.(eachchunk(A).chunks, I)
+    # Shift the axes of `data` to start at the first requested index per dimension.
+    shifted = OffsetArray(data, map(r -> first(r) - 1, I)...)
+    for ci in CartesianIndices(touched)
+        chunk = A[DiskArrays.ChunkIndex(ci, offset=true)]
+        overlap = map(axes(chunk), axes(shifted)) do cax, dax
+            max(first(cax), first(dax)):min(last(cax), last(dax))
+        end
+        for ii in CartesianIndices(overlap)
+            shifted[ii] = chunk[ii]
+        end
+    end
+end
diff --git a/src/diskarray.jl b/src/diskarray.jl
@@ -346,12 +346,10 @@ macro implement_getindex(t)
     t = esc(t)
     quote
         Base.getindex(a::$t, i...) = getindex_disk(a, i...)
-
-        function Base.getindex(a::$t, i::ChunkIndex)
-            cs = eachchunk(a)
-            inds = cs[i.I]
-            return wrapchunk(i.chunktype, a[inds...], inds)
-        end
+        @inline Base.getindex(a::$t, i::ChunkIndex{<:Any,OneBasedChunks}) =
+            a[eachchunk(a)[i.I]...]
+        @inline Base.getindex(a::$t, i::ChunkIndex{<:Any,OffsetChunks}) =
+            wrapchunk(a[nooffset(i)], eachchunk(a)[i.I])
         function DiskArrays.ChunkIndices(a::$t; offset=false)
             return ChunkIndices(
                 map(s->1:s,size(eachchunk(a))), offset ? OffsetChunks() : OneBasedChunks()