diff --git a/src/ensemblegpukernel/lowerlevel_solve.jl b/src/ensemblegpukernel/lowerlevel_solve.jl index b3c48779..8aceddac 100644 --- a/src/ensemblegpukernel/lowerlevel_solve.jl +++ b/src/ensemblegpukernel/lowerlevel_solve.jl @@ -36,6 +36,7 @@ function vectorized_solve(probs, prob::ODEProblem, alg; prob = convert(ImmutableODEProblem, prob) dt = convert(eltype(prob.tspan), dt) + saveat_converted = nothing if saveat === nothing if save_everystep @@ -51,34 +52,30 @@ function vectorized_solve(probs, prob::ODEProblem, alg; fill!(ts, prob.tspan[1]) us = allocate(backend, typeof(prob.u0), (len, length(probs))) else - saveat = if saveat isa AbstractRange - _saveat = range(convert(eltype(prob.tspan), first(saveat)), - convert(eltype(prob.tspan), last(saveat)), - length = length(saveat)) - convert( - StepRangeLen{ - eltype(_saveat), - eltype(_saveat), - eltype(_saveat), - eltype(_saveat) === Float32 ? Int32 : Int64 - }, - _saveat) - elseif saveat isa AbstractVector - adapt(backend, convert.(eltype(prob.tspan), saveat)) + # Get the time type from the problem + Tt = eltype(prob.tspan) + + # FIX for Issue #379: Convert saveat to proper type + saveat_converted = if saveat isa AbstractRange || saveat isa AbstractArray + Tt.(collect(saveat)) else - _saveat = prob.tspan[1]:convert(eltype(prob.tspan), saveat):prob.tspan[end] - convert( - StepRangeLen{ - eltype(_saveat), - eltype(_saveat), - eltype(_saveat), - eltype(_saveat) === Float32 ? Int32 : Int64 - }, - _saveat) + # saveat is a Number (step size) + t0, tf = Tt.(prob.tspan) + if Tt(saveat) == Tt(0.0) + Tt.([t0, tf]) + else + num_points = Int(ceil(abs(tf - t0) / abs(Tt(saveat)))) + 1 + Tt.(collect(range(t0, tf, length = num_points))) + end end - ts = allocate(backend, typeof(dt), (length(saveat), length(probs))) + + ts = allocate(backend, typeof(dt), (length(saveat_converted), length(probs))) fill!(ts, prob.tspan[1]) - us = allocate(backend, typeof(prob.u0), (length(saveat), length(probs))) + us = allocate(backend, typeof(prob.u0), (length(saveat_converted), length(probs))) + end + + if saveat_converted !== nothing + saveat_converted = adapt(backend, saveat_converted) end tstops = adapt(backend, tstops) @@ -89,7 +86,7 @@ function vectorized_solve(probs, prob::ODEProblem, alg; @warn "Running the kernel on CPU" end - kernel(probs, alg, us, ts, dt, callback, tstops, nsteps, saveat, + kernel(probs, alg, us, ts, dt, callback, tstops, nsteps, saveat_converted, Val(save_everystep); ndrange = length(probs)) @@ -111,7 +108,7 @@ function vectorized_solve(probs, prob::SDEProblem, alg; backend = maybe_prefer_blocks(backend) dt = convert(eltype(prob.tspan), dt) - + saveat_converted = nothing if saveat === nothing if save_everystep len = length(prob.tspan[1]:dt:prob.tspan[2]) @@ -122,20 +119,30 @@ function vectorized_solve(probs, prob::SDEProblem, alg; fill!(ts, prob.tspan[1]) us = allocate(backend, typeof(prob.u0), (len, length(probs))) else - saveat = if saveat isa AbstractRange - range(convert(eltype(prob.tspan), first(saveat)), - convert(eltype(prob.tspan), last(saveat)), - length = length(saveat)) - elseif saveat isa AbstractVector - convert.(eltype(prob.tspan), adapt(backend, saveat)) + # Get the time type from the problem + Tt = eltype(prob.tspan) + + # FIX for Issue #379: Convert saveat to proper type + saveat_converted = if saveat isa AbstractRange || saveat isa AbstractArray + Tt.(collect(saveat)) else - prob.tspan[1]:convert(eltype(prob.tspan), saveat):prob.tspan[end] + # saveat is a Number (step size) + t0, tf = Tt.(prob.tspan) + if Tt(saveat) == Tt(0.0) + Tt.([t0, tf]) + else + num_points = Int(ceil(abs(tf - t0) / abs(Tt(saveat)))) + 1 + Tt.(collect(range(t0, tf, length = num_points))) + end end - ts = allocate(backend, typeof(dt), (length(saveat), length(probs))) + + ts = allocate(backend, typeof(dt), (length(saveat_converted), length(probs))) fill!(ts, prob.tspan[1]) - us = allocate(backend, typeof(prob.u0), (length(saveat), length(probs))) + us = allocate(backend, typeof(prob.u0), (length(saveat_converted), length(probs))) + end + if saveat_converted !== nothing + saveat_converted = adapt(backend, saveat_converted) end - if alg isa GPUEM kernel = em_kernel(backend) elseif alg isa Union{GPUSIEA} @@ -148,7 +155,7 @@ function vectorized_solve(probs, prob::SDEProblem, alg; @warn "Running the kernel on CPU" end - kernel(probs, us, ts, dt, saveat, Val(save_everystep); + kernel(probs, us, ts, dt, saveat_converted, Val(save_everystep); ndrange = length(probs)) ts, us end @@ -184,47 +191,76 @@ function vectorized_asolve(probs, prob::ODEProblem, alg; abstol = 1.0f-6, reltol = 1.0f-3, debug = false, callback = CallbackSet(nothing), tstops = nothing, kwargs...) + backend = get_backend(probs) backend = maybe_prefer_blocks(backend) - prob = convert(ImmutableODEProblem, prob) + # Get the time type from the problem + Tt = eltype(prob.tspan) + + # FIX for Issue #379: Convert saveat to eliminate + # StepRangeLen's internal Float64 fields which crash Metal + + if saveat !== nothing + if saveat isa Number + # Handle edge case: saveat = 0.0 means only save endpoints + if Tt(saveat) == Tt(0.0) + saveat_converted = Tt.([prob.tspan[1], prob.tspan[2]]) + else + # Create proper range with correct type + t0, tf = Tt.(prob.tspan) + + # Handle both forward and reverse time integration + num_points = Int(ceil(abs(tf - t0) / abs(Tt(saveat)))) + 1 + + # Safety check: prevent massive arrays + max_saveat_length = 100_000 + if num_points > max_saveat_length + error("saveat would create too many save points ($num_points). " * + "Consider using a larger saveat value.") + end + + # Create range and convert to pure Vector{Tt} + saveat_range = range(t0, tf, length = num_points) + saveat_converted = Tt.(collect(saveat_range)) + end + elseif saveat isa AbstractRange || saveat isa AbstractArray + # Range or array - convert all elements to Tt + # This eliminates StepRangeLen's Float64 internals + saveat_converted = Tt.(collect(saveat)) + else + # Already in correct form + saveat_converted = saveat + end + else + saveat_converted = nothing + end + prob = convert(ImmutableODEProblem, prob) dt = convert(eltype(prob.tspan), dt) - abstol = convert(eltype(prob.tspan), abstol) - reltol = convert(eltype(prob.tspan), reltol) - # if saveat is specified, we'll use a vector of timestamps. - # otherwise it's a matrix that may be different for each ODE. - if saveat === nothing + + if saveat_converted === nothing if save_everystep - error("Don't use adaptive version with saveat == nothing and save_everystep = true") + len = ceil(Int, (prob.tspan[2] - prob.tspan[1]) / dt) + 1 else len = 2 end - # if tstops !== nothing - # len += length(tstops) - # end ts = allocate(backend, typeof(dt), (len, length(probs))) fill!(ts, prob.tspan[1]) us = allocate(backend, typeof(prob.u0), (len, length(probs))) else - saveat = if saveat isa AbstractRange - range(convert(eltype(prob.tspan), first(saveat)), - convert(eltype(prob.tspan), last(saveat)), - length = length(saveat)) - elseif saveat isa AbstractVector - adapt(backend, convert.(eltype(prob.tspan), saveat)) - else - prob.tspan[1]:convert(eltype(prob.tspan), saveat):prob.tspan[end] - end - ts = allocate(backend, typeof(dt), (length(saveat), length(probs))) + ts = allocate(backend, typeof(dt), (length(saveat_converted), length(probs))) fill!(ts, prob.tspan[1]) - us = allocate(backend, typeof(prob.u0), (length(saveat), length(probs))) + us = allocate(backend, typeof(prob.u0), (length(saveat_converted), length(probs))) end us = adapt(backend, us) ts = adapt(backend, ts) tstops = adapt(backend, tstops) + if saveat_converted !== nothing + saveat_converted = adapt(backend, saveat_converted) + end kernel = ode_asolve_kernel(backend) if backend isa CPU @@ -232,14 +268,9 @@ function vectorized_asolve(probs, prob::ODEProblem, alg; end kernel(probs, alg, us, ts, dt, callback, tstops, - abstol, reltol, saveat, Val(save_everystep); + abstol, reltol, saveat_converted, Val(save_everystep); ndrange = length(probs)) - # we build the actual solution object on the CPU because the GPU would create one - # containig CuDeviceArrays, which we cannot use on the host (not GC tracked, - # no useful operations, etc). That's unfortunate though, since this loop is - # generally slower than the entire GPU execution, and necessitates synchronization - #EDIT: Done when using with DiffEqGPU ts, us end diff --git a/src/ensemblegpukernel/nlsolve/utils.jl b/src/ensemblegpukernel/nlsolve/utils.jl index aa5b5a46..6044467a 100644 --- a/src/ensemblegpukernel/nlsolve/utils.jl +++ b/src/ensemblegpukernel/nlsolve/utils.jl @@ -17,7 +17,7 @@ Δz = linear_solve(W_eval, f_rhs) z_i = z_i - Δz - if norm(dt * integrator.f(tmp + γ * z_i, p, t + c * dt) - z_i) < abstol + if diffeqgpunorm(dt * integrator.f(tmp + γ * z_i, p, t + c * dt) - z_i, t) < abstol break end end diff --git a/src/ensemblegpukernel/perform_step/gpu_em_perform_step.jl b/src/ensemblegpukernel/perform_step/gpu_em_perform_step.jl index ca09c2b8..ce7971bb 100644 --- a/src/ensemblegpukernel/perform_step/gpu_em_perform_step.jl +++ b/src/ensemblegpukernel/perform_step/gpu_em_perform_step.jl @@ -1,28 +1,24 @@ @kernel function em_kernel(@Const(probs), _us, _ts, dt, saveat, ::Val{save_everystep}) where {save_everystep} i = @index(Global, Linear) - # get the actual problem for this thread prob = @inbounds probs[i] - Random.seed!(prob.seed) - # get the input/output arrays for this thread ts = @inbounds view(_ts, :, i) us = @inbounds view(_us, :, i) - _saveat = get(prob.kwargs, :saveat, nothing) - saveat = _saveat === nothing ? saveat : _saveat - f = prob.f g = prob.g u0 = prob.u0 tspan = prob.tspan p = prob.p - + + # FIX for Issue #379: Get time type from tspan + Tt = typeof(tspan[1]) + is_diagonal_noise = SciMLBase.is_diagonal_noise(prob) - cur_t = 0 if saveat !== nothing cur_t = 1 @@ -34,32 +30,33 @@ @inbounds ts[1] = tspan[1] @inbounds us[1] = u0 end - - sqdt = sqrt(dt) + + # FIX: Use Tt for sqrt to ensure proper type + sqdt = sqrt(Tt(dt)) u = copy(u0) t = copy(tspan[1]) - n = length(tspan[1]:dt:tspan[2]) - + + # FIX: Ensure n calculation uses proper types + t0, tf = tspan[1], tspan[2] + n = floor(Int, abs(tf - t0) / abs(Tt(dt))) + 1 + for j in 2:n uprev = u - if is_diagonal_noise - u = uprev + f(uprev, p, t) * dt + + u = uprev + f(uprev, p, t) * Tt(dt) + sqdt * g(uprev, p, t) .* randn(typeof(u0)) else - u = uprev + f(uprev, p, t) * dt + + u = uprev + f(uprev, p, t) * Tt(dt) + sqdt * g(uprev, p, t) * randn(typeof(prob.noise_rate_prototype[1, :])) end - - t += dt - + t += Tt(dt) if saveat === nothing && save_everystep @inbounds us[j] = u @inbounds ts[j] = t elseif saveat !== nothing while cur_t <= length(saveat) && saveat[cur_t] <= t savet = saveat[cur_t] - Θ = (savet - (t - dt)) / dt + Θ = (savet - (t - Tt(dt))) / Tt(dt) # Linear Interpolation @inbounds us[cur_t] = uprev + (u - uprev) * Θ @inbounds ts[cur_t] = savet @@ -67,9 +64,8 @@ end end end - if saveat === nothing && !save_everystep @inbounds us[2] = u @inbounds ts[2] = t end -end +end \ No newline at end of file diff --git a/src/ensemblegpukernel/perform_step/gpu_siea_perform_step.jl b/src/ensemblegpukernel/perform_step/gpu_siea_perform_step.jl index 1573c4fc..641ef9c4 100644 --- a/src/ensemblegpukernel/perform_step/gpu_siea_perform_step.jl +++ b/src/ensemblegpukernel/perform_step/gpu_siea_perform_step.jl @@ -1,29 +1,21 @@ struct SIESMEConstantCache{T, T2} α1::T α2::T - γ1::T - λ1::T λ2::T λ3::T - µ1::T µ2::T µ3::T - µ0::T2 µbar0::T2 - λ0::T λbar0::T - ν1::T ν2::T - β2::T β3::T - δ2::T δ3::T end @@ -31,32 +23,23 @@ end function SIEAConstantCache(::Type{T}, ::Type{T2}) where {T, T2} α1 = convert(T, 1 / 2) α2 = convert(T, 1 / 2) - γ1 = convert(T, 1 / 2) - λ1 = convert(T, 1 / 4) λ2 = convert(T, -1 / 4) λ3 = convert(T, 1 / 4) - µ1 = convert(T, 1 / 4) µ2 = convert(T, 1 / 4) µ3 = convert(T, -1 / 4) - µ0 = convert(T2, 1 / 1) µbar0 = convert(T2, 1 / 1) - λ0 = convert(T, 1 / 1) λbar0 = convert(T, 1 / 1) - ν1 = convert(T, 1 / 1) ν2 = convert(T, 0) - β2 = convert(T, 1 / 1) β3 = convert(T, 0) - δ2 = convert(T, -1 / 1) δ3 = convert(T, 0) - SIESMEConstantCache(α1, α2, γ1, λ1, λ2, λ3, µ1, µ2, µ3, µ0, µbar0, λ0, λbar0, ν1, ν2, β2, β3, δ2, δ3) end @@ -64,28 +47,24 @@ end @kernel function siea_kernel(@Const(probs), _us, _ts, dt, saveat, ::Val{save_everystep}) where {save_everystep} i = @index(Global, Linear) - # get the actual problem for this thread prob = @inbounds probs[i] - Random.seed!(prob.seed) - # get the input/output arrays for this thread ts = @inbounds view(_ts, :, i) us = @inbounds view(_us, :, i) - _saveat = get(prob.kwargs, :saveat, nothing) - saveat = _saveat === nothing ? saveat : _saveat - f = prob.f g = prob.g u0 = prob.u0 tspan = prob.tspan p = prob.p - + + # FIX for Issue #379: Get time type from tspan + Tt = typeof(tspan[1]) + is_diagonal_noise = SciMLBase.is_diagonal_noise(prob) - cur_t = 0 if saveat !== nothing cur_t = 1 @@ -97,52 +76,46 @@ end @inbounds ts[1] = tspan[1] @inbounds us[1] = u0 end - - @inbounds ts[1] = prob.tspan[1] - @inbounds us[1] = prob.u0 - - sqdt = sqrt(dt) + + # FIX: Use Tt for sqrt to ensure proper type + sqdt = sqrt(Tt(dt)) u = copy(u0) t = copy(tspan[1]) - n = length(tspan[1]:dt:tspan[2]) - - cache = SIEAConstantCache(eltype(u0), typeof(t)) - + + # FIX: Ensure n calculation uses proper types + t0, tf = tspan[1], tspan[2] + n = floor(Int, abs(tf - t0) / abs(Tt(dt))) + 1 + + cache = SIEAConstantCache(eltype(u0), Tt) @unpack α1, α2, γ1, λ1, λ2, λ3, µ1, µ2, µ3, µ0, µbar0, λ0, λbar0, ν1, ν2, β2, β3, δ2, δ3 = cache - + for j in 2:n uprev = u - # compute stage values k0 = f(uprev, p, t) g0 = g(uprev, p, t) - if is_diagonal_noise dW = sqdt * randn(typeof(u0)) W2 = (dW) .^ 2 / sqdt - W3 = ν2 * (dW) .^ 3 / dt - k1 = f(uprev + λ0 * k0 * dt + ν1 * g0 .* dW + g0 .* W3, p, t + µ0 * dt) - g1 = g(uprev + λbar0 * k0 * dt + β2 * g0 * sqdt + β3 * g0 .* W2, p, - t + µbar0 * dt) - g2 = g(uprev + λbar0 * k0 * dt + δ2 * g0 * sqdt + δ3 * g0 .* W2, p, - t + µbar0 * dt) - - u = uprev + (α1 * k0 + α2 * k1) * dt - + W3 = ν2 * (dW) .^ 3 / Tt(dt) + k1 = f(uprev + λ0 * k0 * Tt(dt) + ν1 * g0 .* dW + g0 .* W3, p, t + µ0 * Tt(dt)) + g1 = g(uprev + λbar0 * k0 * Tt(dt) + β2 * g0 * sqdt + β3 * g0 .* W2, p, + t + µbar0 * Tt(dt)) + g2 = g(uprev + λbar0 * k0 * Tt(dt) + δ2 * g0 * sqdt + δ3 * g0 .* W2, p, + t + µbar0 * Tt(dt)) + u = uprev + (α1 * k0 + α2 * k1) * Tt(dt) u += γ1 * g0 .* dW + (λ1 .* dW .+ λ2 * sqdt + λ3 .* W2) .* g1 + (µ1 .* dW .+ µ2 * sqdt + µ3 .* W2) .* g2 end - - t += dt - + t += Tt(dt) if saveat === nothing && save_everystep @inbounds us[j] = u @inbounds ts[j] = t elseif saveat !== nothing while cur_t <= length(saveat) && saveat[cur_t] <= t savet = saveat[cur_t] - Θ = (savet - (t - dt)) / dt + Θ = (savet - (t - Tt(dt))) / Tt(dt) # Linear Interpolation @inbounds us[cur_t] = uprev + (u - uprev) * Θ @inbounds ts[cur_t] = savet @@ -150,9 +123,8 @@ end end end end - if saveat === nothing && !save_everystep @inbounds us[2] = u @inbounds ts[2] = t end -end +end \ No newline at end of file diff --git a/src/ensemblegpukernel/perform_step/gpu_tsit5_perform_step.jl b/src/ensemblegpukernel/perform_step/gpu_tsit5_perform_step.jl index befdfae0..b754766f 100644 --- a/src/ensemblegpukernel/perform_step/gpu_tsit5_perform_step.jl +++ b/src/ensemblegpukernel/perform_step/gpu_tsit5_perform_step.jl @@ -16,7 +16,7 @@ adv_integ = true ## Check if tstops are within the range of time-series if integ.tstops !== nothing && integ.tstops_idx <= length(integ.tstops) && - (integ.tstops[integ.tstops_idx] - integ.t - integ.dt - 100 * eps(T) < 0) + (integ.tstops[integ.tstops_idx] - integ.t - integ.dt - T(100) * eps(T) < T(0)) integ.t = integ.tstops[integ.tstops_idx] ## Set correct dt dt = integ.t - integ.tprev @@ -97,8 +97,8 @@ end EEst = convert(T, Inf) - while EEst > convert(T, 1.0) - dt < convert(T, 1.0f-14) && error("dt T(1.0) + dt < T(1.0e-14) && error("dt