2929
3030# Setup initial condition as a 1-element SVector (Static Array).
3131# Using StaticArrays explicitly helps the GPU compiler generate efficient, static code.
32- u0 = @SVector [1.0f0]
32+ u0 = @SVector [1.0f0]
3333
3434# Define time span (using Float32 for GPU efficiency)
3535tspan = (0.0f0, 5.0f0)
@@ -47,7 +47,6 @@ prob = ODEProblem{false}(decay_static, u0, tspan, base_param) # Use {false} for
4747prob_func = (prob, i, repeat) -> begin
4848 new_λ = 0.5f0 + 1.0f0 * rand(Float32)
4949 remake(prob, p = @SVector [new_λ])
50-
5150end
5251
5352ensemble_prob = EnsembleProblem(prob, prob_func = prob_func, safetycopy = false)
@@ -69,9 +68,9 @@ if CUDA.has_cuda() && CUDA.functional()
6968 # GPUTsit5 is suitable for non-stiff ODEs on the GPU.
7069 # save_everystep=false reduces memory overhead and transfer time if only final states are needed.
7170 gpu_sol = solve(ensemble_prob, GPUTsit5(), EnsembleGPUKernel(CUDA.CUDABackend());
72- trajectories = num_trajectories,
73- save_everystep= false, # Crucial for performance measurement
74- dt = 0.01f0, adaptive = false)
71+ trajectories = num_trajectories,
72+ save_everystep = false, # Crucial for performance measurement
73+ dt = 0.01f0, adaptive = false)
7574else
7675 @warn "CUDA not available. Skipping GPU simulation."
7776 gpu_sol = nothing
8382# Tsit5 is the CPU counterpart to GPUTsit5.
8483# Match GPU saving options for fair comparison.
8584cpu_sol = solve(ensemble_prob, Tsit5(), EnsembleThreads();
86- trajectories = num_trajectories,
87- save_everystep=false, # Match GPU setting
88- dt = 0.01f0, adaptive = false)
89-
85+ trajectories = num_trajectories,
86+ save_everystep = false, # Match GPU setting
87+ dt = 0.01f0, adaptive = false)
9088
9189```
9290
@@ -100,35 +98,36 @@ We re-run the simulations using @time to get a cleaner measurement of the execut
10098if gpu_sol_perf !== nothing
10199 @info "Timing GPU simulation (second run, no data saving)..."
102100 @time solve(ensemble_prob, GPUTsit5(), EnsembleGPUKernel(CUDA.CUDABackend());
103- trajectories = num_trajectories, save_everystep= false,
104- dt = 0.01f0, adaptive = false)
101+ trajectories = num_trajectories, save_everystep = false,
102+ dt = 0.01f0, adaptive = false)
105103else
106104 @info "Skipping GPU timing (CUDA not available)."
107105end
108106
109107# --- CPU Timing (Second Run) ---
110108@info "Timing CPU simulation (second run, no data saving)..."
111109@time cpu_sol = solve(ensemble_prob, Tsit5(), EnsembleThreads();
112- trajectories = num_trajectories, save_everystep= false,
113- dt = 0.01f0, adaptive = false)
110+ trajectories = num_trajectories, save_everystep = false,
111+ dt = 0.01f0, adaptive = false)
114112
115113# Note: The first @time includes compilation and setup, the second is more representative
116114# of the pure computation time for subsequent runs. Expect GPU to be significantly
117115# faster for a large number of trajectories like 10,000.
118116```
119117
120118# CPU Statistical Analysis and Visualization
119+
121120To visualize the evolution of the ensemble statistics (mean and standard deviation) over time using the * CPU results* , we need the solutions at multiple time points. We re-solve the problem on the CPU, this time saving the results at each step (save_everystep=true). We then process the results and plot them.
122121
123122``` @example decay
124123# Re-solve on CPU, saving all steps for plotting
125124@info "Re-solving CPU simulation to collect data for plotting..."
126125cpu_sol_plot = solve(ensemble_prob, Tsit5(), EnsembleThreads();
127- trajectories = num_trajectories,
128- save_everystep= true, # Save data at each dt step
129- dt = 0.01f0,
130- adaptive = false)
131-
126+ trajectories = num_trajectories,
127+ save_everystep = true, # Save data at each dt step
128+ dt = 0.01f0,
129+ adaptive = false)
130+
132131# Extract time points from the first trajectory's solution (assuming all are same)
133132t_vals_cpu = cpu_sol_plot[1].t
134133num_times_cpu = length(t_vals_cpu)
@@ -140,7 +139,8 @@ ensemble_vals_cpu = fill(NaN32, num_times_cpu, num_trajectories) # Use Float32
140139# Extract the state value (u[1]) from each trajectory at each time point
141140for i in 1:num_trajectories
142141 # Check if the trajectory simulation was successful and data looks valid
143- if cpu_sol_plot[i].retcode == ReturnCode.Success && length(cpu_sol_plot[i].u) == num_times_cpu
142+ if cpu_sol_plot[i].retcode == ReturnCode.Success &&
143+ length(cpu_sol_plot[i].u) == num_times_cpu
144144 # sol.u is a Vector{SVector{1, Float32}}. We need the element from each SVector.
145145 ensemble_vals_cpu[:, i] .= getindex.(cpu_sol_plot[i].u, 1)
146146 else
@@ -150,7 +150,8 @@ for i in 1:num_trajectories
150150end
151151
152152# Filter out failed trajectories (columns with NaN)
153- successful_traj_indices_cpu = findall(j -> !all(isnan, view(ensemble_vals_cpu, :, j)), 1:num_trajectories)
153+ successful_traj_indices_cpu = findall(
154+ j -> !all(isnan, view(ensemble_vals_cpu, :, j)), 1:num_trajectories)
154155num_successful_cpu = length(successful_traj_indices_cpu)
155156
156157if num_successful_cpu == 0
@@ -162,28 +163,30 @@ else
162163 end
163164
164165 # Compute ensemble statistics over successful CPU trajectories
165- mean_u_cpu = mapslices(mean, ensemble_vals_cpu, dims= 2)[:]
166- std_u_cpu = mapslices(std, ensemble_vals_cpu, dims= 2)[:]
166+ mean_u_cpu = mapslices(mean, ensemble_vals_cpu, dims = 2)[:]
167+ std_u_cpu = mapslices(std, ensemble_vals_cpu, dims = 2)[:]
167168
168169 # --- Plotting CPU Results ---
169- p1_cpu = plot(t_vals_cpu, mean_u_cpu, ribbon = std_u_cpu, xlabel = "Time (t)", ylabel = "u(t)",
170- title = "CPU Ensemble Mean ±1σ ($num_successful_cpu Trajectories )",
171- label = "Mean u(t)", fillalpha = 0.3, lw=2)
172-
170+ p1_cpu = plot(
171+ t_vals_cpu, mean_u_cpu, ribbon = std_u_cpu, xlabel = "Time (t)", ylabel = "u(t )",
172+ title = "CPU Ensemble Mean ±1σ ($num_successful_cpu Trajectories)",
173+ label = "Mean u(t)", fillalpha = 0.3, lw = 2)
173174
174175 final_vals_cpu = ensemble_vals_cpu[end, :]
175- p2_cpu = histogram(final_vals_cpu, bins = 30, normalize= :probability,
176- xlabel = "Final u(T)", ylabel = "Probability Density",
177- title = "CPU Distribution of Final Values (t=$(tspan[2]))",
178- label = "", legend= false)
176+ p2_cpu = histogram(final_vals_cpu, bins = 30, normalize = :probability,
177+ xlabel = "Final u(T)", ylabel = "Probability Density",
178+ title = "CPU Distribution of Final Values (t=$(tspan[2]))",
179+ label = "", legend = false)
179180
180- plot_cpu = plot(p1_cpu, p2_cpu, layout = (1,2), size = (1000, 450), legend=:outertopright)
181+ plot_cpu = plot(
182+ p1_cpu, p2_cpu, layout = (1, 2), size = (1000, 450), legend = :outertopright)
181183 @info "Displaying CPU analysis plot..."
182184 display(plot_cpu)
183185end
184186```
185187
186188# GPU Statistical Analysis and Visualization
189+
187190Similarly, we can analyze the results from the * GPU simulation* . This requires re-running the simulation to save the time series data and then transferring the data from the GPU memory to the CPU RAM for analysis and plotting using standard tools. Note that this data transfer can be a significant bottleneck for large numbers of trajectories or time steps.
188191
189192``` @example decay
@@ -192,11 +195,12 @@ gpu_analysis_plot = nothing # Initialize plot variable
192195if gpu_sol_perf !== nothing && CUDA.has_cuda() && CUDA.functional()
193196 @info "Re-solving GPU simulation to collect data for plotting..."
194197 # Add @time to see the impact of saving data
195- @time gpu_sol_plot = solve(ensemble_prob, GPUTsit5(), EnsembleGPUKernel(CUDA.CUDABackend());
196- trajectories = num_trajectories,
197- save_everystep=true, # <<-- Save data at each dt step on GPU
198- dt = 0.01f0,
199- adaptive = false)
198+ @time gpu_sol_plot = solve(
199+ ensemble_prob, GPUTsit5(), EnsembleGPUKernel(CUDA.CUDABackend());
200+ trajectories = num_trajectories,
201+ save_everystep = true, # <<-- Save data at each dt step on GPU
202+ dt = 0.01f0,
203+ adaptive = false)
200204
201205 # --- Data Transfer and Analysis ---
202206 # The result gpu_sol_plot should be an EnsembleSolution containing a Vector{ODESolution}
@@ -213,14 +217,15 @@ if gpu_sol_perf !== nothing && CUDA.has_cuda() && CUDA.functional()
213217 # Note: This might be slow/memory intensive!
214218 @time solutions_vector = Array(gpu_sol_plot)
215219 if !(solutions_vector isa AbstractVector{<:ODESolution})
216- @error "Could not obtain Vector{ODESolution} from GPU result. Type is $(typeof(solutions_vector)). Aborting GPU analysis."
217- solutions_vector = nothing # Mark as failed
220+ @error "Could not obtain Vector{ODESolution} from GPU result. Type is $(typeof(solutions_vector)). Aborting GPU analysis."
221+ solutions_vector = nothing # Mark as failed
218222 end
219223 end
220224
221225 if solutions_vector !== nothing
222226 # Extract time points from the first successful trajectory's solution
223- first_successful_gpu_idx = findfirst(sol -> sol.retcode == ReturnCode.Success, solutions_vector)
227+ first_successful_gpu_idx = findfirst(
228+ sol -> sol.retcode == ReturnCode.Success, solutions_vector)
224229
225230 if first_successful_gpu_idx === nothing
226231 @error "No successful GPU trajectories found in the returned solutions vector!"
@@ -236,15 +241,15 @@ if gpu_sol_perf !== nothing && CUDA.has_cuda() && CUDA.functional()
236241 for i in 1:num_trajectories
237242 sol = solutions_vector[i] # Access the i-th ODESolution
238243 if sol.retcode == ReturnCode.Success
239- # Check consistency of time points (optional but good)
240- if length(sol.t) == num_times_gpu # && sol.t == t_vals_gpu (can be slow check)
244+ # Check consistency of time points (optional but good)
245+ if length(sol.t) == num_times_gpu # && sol.t == t_vals_gpu (can be slow check)
241246 # sol.u is likely Vector{SVector{1, Float32}} after transfer
242247 ensemble_vals_gpu[:, i] .= getindex.(sol.u, 1)
243248 num_processed += 1
244- else
245- @warn "GPU Trajectory $i succeeded but time points mismatch (Expected $(num_times_gpu), Got $(length(sol.t))). Skipping."
246- # Column remains NaN
247- end
249+ else
250+ @warn "GPU Trajectory $i succeeded but time points mismatch (Expected $(num_times_gpu), Got $(length(sol.t))). Skipping."
251+ # Column remains NaN
252+ end
248253 else
249254 # @warn "GPU Trajectory $i failed with retcode: $(sol.retcode). Skipping." # Potentially verbose
250255 # Column remains NaN
@@ -253,7 +258,8 @@ if gpu_sol_perf !== nothing && CUDA.has_cuda() && CUDA.functional()
253258 @info "Processed $num_processed successful GPU trajectories."
254259
255260 # Filter out failed trajectories (columns with NaN)
256- successful_traj_indices_gpu = findall(j -> !all(isnan, view(ensemble_vals_gpu, :, j)), 1:num_trajectories)
261+ successful_traj_indices_gpu = findall(
262+ j -> !all(isnan, view(ensemble_vals_gpu, :, j)), 1:num_trajectories)
257263 num_successful_gpu = length(successful_traj_indices_gpu)
258264
259265 if num_successful_gpu == 0
@@ -267,27 +273,29 @@ if gpu_sol_perf !== nothing && CUDA.has_cuda() && CUDA.functional()
267273 end
268274
269275 # Compute ensemble statistics over successful GPU trajectories
270- mean_u_gpu = mapslices(mean, ensemble_vals_gpu, dims= 2)[:]
271- std_u_gpu = mapslices(std, ensemble_vals_gpu, dims= 2)[:]
276+ mean_u_gpu = mapslices(mean, ensemble_vals_gpu, dims = 2)[:]
277+ std_u_gpu = mapslices(std, ensemble_vals_gpu, dims = 2)[:]
272278
273279 # --- Plotting GPU Results ---
274- p1_gpu = plot(t_vals_gpu, mean_u_gpu, ribbon = std_u_gpu, xlabel = "Time (t)", ylabel = "u(t)",
275- title = "GPU Ensemble Mean ±1σ ($num_successful_gpu Trajectories)",
276- label = "Mean u(t)", fillalpha = 0.3, lw=2)
280+ p1_gpu = plot(t_vals_gpu, mean_u_gpu, ribbon = std_u_gpu,
281+ xlabel = "Time (t)", ylabel = "u(t)",
282+ title = "GPU Ensemble Mean ±1σ ($num_successful_gpu Trajectories)",
283+ label = "Mean u(t)", fillalpha = 0.3, lw = 2)
277284
278285 final_vals_gpu = ensemble_vals_gpu[end, :]
279- p2_gpu = histogram(final_vals_gpu, bins = 30, normalize= :probability,
280- xlabel = "Final u(T)", ylabel = "Probability Density",
281- title = "GPU Distribution of Final Values (t=$(tspan[2]))",
282- label = "", legend= false)
286+ p2_gpu = histogram(final_vals_gpu, bins = 30, normalize = :probability,
287+ xlabel = "Final u(T)", ylabel = "Probability Density",
288+ title = "GPU Distribution of Final Values (t=$(tspan[2]))",
289+ label = "", legend = false)
283290
284- gpu_analysis_plot = plot(p1_gpu, p2_gpu, layout = (1,2), size = (1000, 450), legend=:outertopright)
291+ gpu_analysis_plot = plot(p1_gpu, p2_gpu, layout = (1, 2),
292+ size = (1000, 450), legend = :outertopright)
285293 @info "Displaying GPU analysis plot..."
286294 display(gpu_analysis_plot)
287295
288296 # Cleanup large structures if memory is a concern
289- ensemble_vals_gpu = nothing
290- solutions_vector = nothing
297+ ensemble_vals_gpu = nothing
298+ solutions_vector = nothing
291299 # gc()
292300 end
293301 end
296304 @warn "Skipping GPU analysis section because initial GPU performance run failed or CUDA is unavailable."
297305end
298306```
299-
0 commit comments