SciML
diff --git a/‎src/independentlylinearizedutils.jl
Lines changed: 201 additions & 36 deletions b/‎src/independentlylinearizedutils.jl
Lines changed: 201 additions & 36 deletions
@@ -2,30 +2,130 @@ using SciMLBase
 
 export IndependentlyLinearizedSolution
 
+
+"""
+    CachePool(T, alloc; thread_safe = true)
+
+Simple memory-reusing cache that allows us to grow a cache and keep
+re-using those pieces of memory (in our case, typically `u` vectors)
+until the solve is finished.  By default, this data structure is made
+to be thread-safe by locking on every acquire and release, but it
+can be made thread-unsafe (and correspondingly faster) by passing
+`thread_safe = false` to the constructor.
+
+While manual usage with `acquire!()` and `release!()` is possible,
+most users will want to use `@with_cache`, which provides lexically-
+scoped `acquire!()` and `release!()` usage automatically.  Example:
+
+```julia
+us = CachePool(Vector{S}, () -> Vector{S}(undef, num_us); thread_safe=false)
+@with_cache us u_prev begin
+    @with_cache us u_next begin
+        # perform tasks with these two `u` vectors
+    end
+end
+```
+"""
+mutable struct CachePool{T, THREAD_SAFE}
+    # Elements currently available for reuse: `release!` pushes here, `acquire!` pops
+    pool::Vector{T}
+    # Zero-argument callable that `acquire!` invokes when `pool` is empty
+    alloc::Function
+    # Taken by the `ThreadSafeCachePool` methods of `acquire!`/`release!`
+    lock::ReentrantLock
+    # Incremented by `acquire!` every time it has to fall back to `alloc()`
+    num_alloced::Int
+
+    # `thread_safe` is lifted into the second type parameter as `Val{thread_safe}`,
+    # which is what the `ThreadSafeCachePool`/`ThreadUnsafeCachePool` aliases match on.
+    function CachePool(T, alloc::F; thread_safe::Bool = true) where {F}
+        return new{T,Val{thread_safe}}(T[], alloc, ReentrantLock(), 0)
+    end
+end
+# Aliases that fix the `THREAD_SAFE` parameter so methods can dispatch on the locking mode
+const ThreadSafeCachePool{T} = CachePool{T,Val{true}}
+const ThreadUnsafeCachePool{T} = CachePool{T,Val{false}}
+
+"""
+    acquire!(cache::CachePool)
+
+Hand out one element from `cache`.  A previously released element is reused
+when one is available; otherwise `cache.alloc()` is called to create a new
+one and `cache.num_alloced` is incremented.
+"""
+Base.@inline function acquire!(cache::CachePool{T}, _dummy = nothing) where {T}
+    # Fast path: reuse something that was released earlier
+    isempty(cache.pool) || return pop!(cache.pool)
+
+    # Slow path: nothing to reuse, so allocate and keep count
+    cache.num_alloced += 1
+    return cache.alloc()::T
+end
+
+"""
+    release!(cache::CachePool, val)
+
+Give `val` back to `cache` so that a later `acquire!` can hand it out again.
+"""
+Base.@inline function release!(cache::CachePool, val, _dummy = nothing)
+    return push!(cache.pool, val)
+end
+
+# Locking front-ends for thread-safe pools.  They forward to the generic methods
+# while holding `cache.lock`; the extra `nothing` argument selects the
+# `_dummy`-taking method so the call does not dispatch back to these wrappers.
+function acquire!(cache::ThreadSafeCachePool)
+    return @lock cache.lock acquire!(cache, nothing)
+end
+function release!(cache::ThreadSafeCachePool, val)
+    return @lock cache.lock release!(cache, val, nothing)
+end
+
+"""
+    @with_cache cache name body
+
+Acquire an element from `cache`, bind it to `name`, run `body`, and release
+the element back to `cache` when `body` exits (normally or via an exception).
+The value of the expression is the value of `body`.
+
+The `cache` expression is evaluated exactly once, and the element that is
+released is always the one that was acquired, even if `body` rebinds `name`.
+"""
+macro with_cache(cache, name, body)
+    return quote
+        # Hygienic locals: evaluate the pool expression once and remember the
+        # acquired element so that `release!` is paired with this `acquire!`.
+        local pool = $(esc(cache))
+        local item = acquire!(pool)
+        $(esc(name)) = item
+        try
+            $(esc(body))
+        finally
+            release!(pool, item)
+        end
+    end
+end
+
+
+"""
+    IndependentlyLinearizedSolutionChunksCache{T,S}(num_us, num_derivatives, chunk_size)
+
+Bundle of three thread-unsafe `CachePool`s that hand out the chunk buffers:
+`Vector{T}` of length `chunk_size`, `Matrix{S}` of size
+`(num_derivatives + 1, chunk_size)`, and `BitMatrix` of size `(num_us, chunk_size)`.
+"""
+struct IndependentlyLinearizedSolutionChunksCache{T,S}
+    t_chunks::ThreadUnsafeCachePool{Vector{T}}
+    u_chunks::ThreadUnsafeCachePool{Matrix{S}}
+    time_masks::ThreadUnsafeCachePool{BitMatrix}
+
+    function IndependentlyLinearizedSolutionChunksCache{T,S}(num_us::Int, num_derivatives::Int, chunk_size::Int) where {T,S}
+        # Each pool gets an allocator that closes over the requested dimensions
+        t_pool = CachePool(Vector{T}, () -> Vector{T}(undef, chunk_size); thread_safe = false)
+        u_pool = CachePool(Matrix{S}, () -> Matrix{S}(undef, num_derivatives + 1, chunk_size);
+                           thread_safe = false)
+        mask_pool = CachePool(BitMatrix, () -> BitMatrix(undef, num_us, chunk_size);
+                              thread_safe = false)
+        return new{T,S}(t_pool, u_pool, mask_pool)
+    end
+end
+
 """
     IndependentlyLinearizedSolutionChunks
 
 When constructing an `IndependentlyLinearizedSolution` via the `IndependentlyLinearizingCallback`,
-we use this indermediate structure to reduce allocations and collect the unknown number of timesteps
+we use this intermediate structure to reduce allocations and collect the unknown number of timesteps
 that the solve will generate.
+
+The type parameter `N` records the `num_derivatives` passed to the constructor.  All chunk
+buffers are obtained from `cache` via `acquire!` rather than being allocated directly.
 """
-mutable struct IndependentlyLinearizedSolutionChunks{T, S}
+mutable struct IndependentlyLinearizedSolutionChunks{T, S, N}
     t_chunks::Vector{Vector{T}}
     u_chunks::Vector{Vector{Matrix{S}}}
     time_masks::Vector{BitMatrix}
 
+    # Temporary array that gets used by `get_chunks`
+    last_chunks::Vector{Matrix{S}}
+
     # Index of next write into the last chunk
     u_offsets::Vector{Int}
     t_offset::Int
 
+    # Source of chunk buffers.  Fully parameterized so that `ilsc.cache.t_chunks` etc.
+    # have concrete, inferrable types.
+    cache::IndependentlyLinearizedSolutionChunksCache{T,S}
+
     function IndependentlyLinearizedSolutionChunks{T, S}(num_us::Int, num_derivatives::Int = 0,
-            chunk_size::Int = 100) where {T, S}
-        return new([Vector{T}(undef, chunk_size)],
-            [[Matrix{S}(undef, num_derivatives+1, chunk_size)] for _ in 1:num_us],
-            [BitMatrix(undef, num_us, chunk_size)],
-            [1 for _ in 1:num_us],
-            1,
-        )
+            chunk_size::Int = 512,
+            cache::IndependentlyLinearizedSolutionChunksCache{T,S} = IndependentlyLinearizedSolutionChunksCache{T,S}(num_us, num_derivatives, chunk_size)) where {T, S}
+        # Start with one chunk of each kind (one `u` chunk per state variable)
+        t_chunks = [acquire!(cache.t_chunks)]
+        u_chunks = [[acquire!(cache.u_chunks)] for _ in 1:num_us]
+        time_masks = [acquire!(cache.time_masks)]
+        last_chunks = [u_chunks[u_idx][1] for u_idx in 1:num_us]
+        u_offsets = [1 for _ in 1:num_us]
+        t_offset = 1
+        return new{T,S,num_derivatives}(t_chunks, u_chunks, time_masks, last_chunks, u_offsets, t_offset, cache)
     end
 end
 
@@ -44,14 +144,8 @@ function num_us(ilsc::IndependentlyLinearizedSolutionChunks)
     end
     return length(ilsc.u_chunks)
 end
+# The derivative count is carried in the type parameter `N`, so no field access is needed
+num_derivatives(::IndependentlyLinearizedSolutionChunks{T,S,N}) where {T,S,N} = N
 
-function num_derivatives(ilsc::IndependentlyLinearizedSolutionChunks)
-    # If we've been finalized, just return `0` (which means only the primal)
-    if isempty(ilsc.t_chunks)
-        return 0
-    end
-    return size(first(first(ilsc.u_chunks)), 1) - 1
-end
 
 function Base.isempty(ilsc::IndependentlyLinearizedSolutionChunks)
     return length(ilsc.t_chunks) == 1 && ilsc.t_offset == 1
@@ -61,24 +155,25 @@ function get_chunks(ilsc::IndependentlyLinearizedSolutionChunks{T, S}) where {T,
     # Check if we need to allocate new `t` chunk
     chunksize = chunk_size(ilsc)
     if ilsc.t_offset > chunksize
-        push!(ilsc.t_chunks, Vector{T}(undef, chunksize))
-        push!(ilsc.time_masks, BitMatrix(undef, length(ilsc.u_offsets), chunksize))
+        push!(ilsc.t_chunks, acquire!(ilsc.cache.t_chunks))
+        push!(ilsc.time_masks, acquire!(ilsc.cache.time_masks))
         ilsc.t_offset = 1
     end
 
     # Check if we need to allocate any new `u` chunks (but only for those with `u_mask`)
     for (u_idx, u_chunks) in enumerate(ilsc.u_chunks)
         if ilsc.u_offsets[u_idx] > chunksize
-            push!(u_chunks, Matrix{S}(undef, num_derivatives(ilsc)+1, chunksize))
+            push!(u_chunks, acquire!(ilsc.cache.u_chunks))
             ilsc.u_offsets[u_idx] = 1
         end
+        ilsc.last_chunks[u_idx] = u_chunks[end]
     end
 
     # return the last chunk for each
     return (
         ilsc.t_chunks[end],
         ilsc.time_masks[end],
-        [u_chunks[end] for u_chunks in ilsc.u_chunks],
+        ilsc.last_chunks,
     )
 end
 
@@ -135,16 +230,18 @@ function store!(ilsc::IndependentlyLinearizedSolutionChunks{T, S},
     ts, time_mask, us = get_chunks(ilsc)
 
     # Store into the chunks, gated by `u_mask`
-    for u_idx in 1:size(u, 2)
+    @inbounds for u_idx in 1:size(u, 2)
         if u_mask[u_idx]
             for deriv_idx in 1:size(u, 1)
                 us[u_idx][deriv_idx, ilsc.u_offsets[u_idx]] = u[deriv_idx, u_idx]
             end
             ilsc.u_offsets[u_idx] += 1
         end
+
+        # Update our `time_mask` while we're at it
+        time_mask[u_idx, ilsc.t_offset] = u_mask[u_idx]
     end
     ts[ilsc.t_offset] = t
-    time_mask[:, ilsc.t_offset] .= u_mask
     ilsc.t_offset += 1
 end
 
@@ -161,7 +258,7 @@ efficient `iterate()` method that can be used to reconstruct coherent views
 of the state variables at all timepoints, as well as an efficient `sample!()`
 method that can sample at arbitrary timesteps.
 """
-mutable struct IndependentlyLinearizedSolution{T, S}
+mutable struct IndependentlyLinearizedSolution{T, S, N}
     # All timepoints, shared by all `us`
     ts::Vector{T}
 
@@ -174,32 +271,44 @@ mutable struct IndependentlyLinearizedSolution{T, S}
 
     # Temporary object used during construction, will be set to `nothing` at the end.
     ilsc::Union{Nothing,IndependentlyLinearizedSolutionChunks{T,S}}
+    ilsc_cache_pool::Union{Nothing,ThreadSafeCachePool{IndependentlyLinearizedSolutionChunksCache{T,S}}}
 end
 # Helper function to create an ILS wrapped around an in-progress ILSC
+# The solution starts with empty `ts`/`us`/`time_mask`.  `cache_pool`, when not
+# `nothing`, is stored on the solution; `finish!` releases `ilsc.cache` into it.
-function IndependentlyLinearizedSolution(ilsc::IndependentlyLinearizedSolutionChunks{T,S}) where {T,S}
-    ils = IndependentlyLinearizedSolution(
+function IndependentlyLinearizedSolution(ilsc::IndependentlyLinearizedSolutionChunks{T,S,N}, cache_pool = nothing) where {T,S,N}
+    return IndependentlyLinearizedSolution{T,S,N}(
         T[],
         Matrix{S}[],
         BitMatrix(undef, 0,0),
         ilsc,
+        cache_pool,
     )
-    return ils
 end
 # Automatically create an ILS wrapped around an ILSC from a `prob`
+#
+# Keyword arguments:
+#   - `cache_pool`: pool of `IndependentlyLinearizedSolutionChunksCache` objects to draw
+#     from; when `nothing`, a fresh thread-safe pool is created for this solution.
+#   - `chunk_size`: chunk length used when a new chunks cache has to be allocated.
-function IndependentlyLinearizedSolution(prob::SciMLBase.AbstractDEProblem, num_derivatives = 0)
+function IndependentlyLinearizedSolution(prob::SciMLBase.AbstractDEProblem, num_derivatives = 0;
+                                         cache_pool = nothing,
+                                         chunk_size::Int = 512)
     T = eltype(prob.tspan)
+    # `U` is the single source of truth for the state element type; it falls back to
+    # `Float64` when `prob.u0 === nothing`, so the cache and the chunks always agree.
     U = isnothing(prob.u0) ? Float64 : eltype(prob.u0)
-    N = isnothing(prob.u0) ? 0 : length(prob.u0)
-    chunks = IndependentlyLinearizedSolutionChunks{T,U}(N, num_derivatives)
-    return IndependentlyLinearizedSolution(chunks)
+    num_us = isnothing(prob.u0) ? 0 : length(prob.u0)
+    if cache_pool === nothing
+        cache_pool = CachePool(
+            IndependentlyLinearizedSolutionChunksCache{T,U},
+            () -> IndependentlyLinearizedSolutionChunksCache{T,U}(num_us, num_derivatives, chunk_size);
+            thread_safe = true,
+        )
+    end
+    cache = acquire!(cache_pool)
+    chunks = IndependentlyLinearizedSolutionChunks{T,U}(num_us, num_derivatives, chunk_size, cache)
+    return IndependentlyLinearizedSolution(chunks, cache_pool)
 end
 
-num_derivatives(ils::IndependentlyLinearizedSolution) = !isempty(ils.us) ? size(first(ils.us), 1) : 0
+# Accessors: the derivative count is read from the type parameter `N`; the others
+# read the `us`, `time_mask` and `ts` fields of the solution.
+num_derivatives(::IndependentlyLinearizedSolution{T,S,N}) where {T,S,N} = N
 num_us(ils::IndependentlyLinearizedSolution) = length(ils.us)
 Base.size(ils::IndependentlyLinearizedSolution) = size(ils.time_mask)
 Base.length(ils::IndependentlyLinearizedSolution) = length(ils.ts)
 
-function finish!(ils::IndependentlyLinearizedSolution)
+function finish!(ils::IndependentlyLinearizedSolution{T,S}) where {T,S}
     function trim_chunk(chunks::Vector, offset)
         chunks = [chunk for chunk in chunks]
         if eltype(chunks) <: AbstractVector
@@ -216,10 +325,52 @@ function finish!(ils::IndependentlyLinearizedSolution)
     end
 
     ilsc = ils.ilsc::IndependentlyLinearizedSolutionChunks
-    ts = vcat(trim_chunk(ilsc.t_chunks, ilsc.t_offset)...)
-    time_mask = hcat(trim_chunk(ilsc.time_masks, ilsc.t_offset)...)
-    us = [hcat(trim_chunk(ilsc.u_chunks[u_idx], ilsc.u_offsets[u_idx])...)
-          for u_idx in 1:length(ilsc.u_chunks)]
+
+    # Length of a chunk along its last dimension (elements of a vector, columns of a matrix)
+    chunk_len(chunk) = size(chunk, ndims(chunk))
+    # Total number of filled entries: every chunk but the last counts in full, and the
+    # last chunk contributes `offset - 1` (since `offset` is the next write index).
+    function chunks_len(chunks::Vector, offset)
+        filled = sum(chunk_len, @view(chunks[1:end-1]); init = 0)
+        return filled + offset - 1
+    end
+
+    # Copy the first `len` entries of `in` into `out`, starting right after `out_offset`.
+    function copy_chunk!(out::Vector, in::Vector, out_offset::Int, len=chunk_len(in))
+        for idx in 1:len
+            out[idx+out_offset] = in[idx]
+        end
+    end
+    # Matrix variant: copy the first `len` columns of `in` into `out`, starting at column
+    # `out_offset + 1`.  Columns form the outer loop so that the inner loop walks down
+    # contiguous column-major storage.
+    function copy_chunk!(out::AbstractMatrix, in::AbstractMatrix, out_offset::Int, len=chunk_len(in))
+        for idx in 1:len
+            for zdx in 1:size(in, 1)
+                out[zdx, idx+out_offset] = in[zdx, idx]
+            end
+        end
+    end
+
+    # Concatenate `chunks` into the preallocated `out`: every chunk except the last is
+    # copied in full, the last one only up to (but excluding) index `offset`.
+    function collapse_chunks!(out, chunks, offset::Int)
+        written = 0
+        for full_chunk in @view chunks[1:end-1]
+            copy_chunk!(out, full_chunk, written)
+            written += chunk_len(full_chunk)
+        end
+        return copy_chunk!(out, chunks[end], written, offset - 1)
+    end
+
+    # Collapse t_chunks
+    ts = Vector{T}(undef, chunks_len(ilsc.t_chunks, ilsc.t_offset))
+    collapse_chunks!(ts, ilsc.t_chunks, ilsc.t_offset)
+
+    # Collapse u_chunks
+    us = Vector{Matrix{S}}(undef, length(ilsc.u_chunks))
+    for u_idx in 1:length(ilsc.u_chunks)
+        us[u_idx] = Matrix{S}(undef, size(ilsc.u_chunks[u_idx][1],1), chunks_len(ilsc.u_chunks[u_idx], ilsc.u_offsets[u_idx]))
+        collapse_chunks!(us[u_idx], ilsc.u_chunks[u_idx], ilsc.u_offsets[u_idx])
+    end
+
+    time_mask = BitMatrix(undef, size(ilsc.time_masks[1], 1), chunks_len(ilsc.time_masks, ilsc.t_offset))
+    collapse_chunks!(time_mask, ilsc.time_masks, ilsc.t_offset)
 
     # Sanity-check lengths
     if length(ts) != size(time_mask, 2)
@@ -238,7 +389,21 @@ function finish!(ils::IndependentlyLinearizedSolution)
         throw(ArgumentError("Time mask must indicate same length as `us` ($(time_mask_lens) != $(us_lens))"))
     end
 
-    # Update our struct, release the `ilsc`
+    # Update our struct, release the `ilsc` and its caches
+    for t_chunk in ilsc.t_chunks
+        release!(ilsc.cache.t_chunks, t_chunk)
+    end
+    for u_idx in 1:length(ilsc.u_chunks)
+        for u_chunk in ilsc.u_chunks[u_idx]
+            release!(ilsc.cache.u_chunks, u_chunk)
+        end
+    end
+    for time_mask in ilsc.time_masks
+        release!(ilsc.cache.time_masks, time_mask)
+    end
+    if ils.ilsc_cache_pool !== nothing
+        release!(ils.ilsc_cache_pool, ilsc.cache)
+    end
     ils.ilsc = nothing
     ils.ts = ts
     ils.us = us