From 20112426b9e7e92d3db3808092c5891ec32592a7 Mon Sep 17 00:00:00 2001
From: ParyaRoustaee <ParyaRoustaee>
Date: Fri, 25 Apr 2025 13:18:35 -0700
Subject: [PATCH 01/12] updated the random_decay tutorial  with StaticArrays
 and improve benchmarking and plots

---
 .../src/examples/gpu_ensemble_random_decay.md | 277 +++++++++++++++---
 1 file changed, 235 insertions(+), 42 deletions(-)
diff --git a/docs/src/examples/gpu_ensemble_random_decay.md b/docs/src/examples/gpu_ensemble_random_decay.md
index 42e7bcf8..5f8257f6 100644
--- a/docs/src/examples/gpu_ensemble_random_decay.md
+++ b/docs/src/examples/gpu_ensemble_random_decay.md
@@ -2,37 +2,52 @@
 
 In this tutorial, we demonstrate how to perform GPU-accelerated ensemble simulations using DiffEqGPU.jl. We model an exponential decay ODE:
 \[ u'(t) = -\lambda \, u(t) \]
-with the twist that each trajectory uses a random decay rate \(\lambda\) sampled uniformly from \([0.5, 1.5]\).
+with the twist that each trajectory uses a random decay rate \(\lambda\) sampled uniformly from \([0.5, 1.5]\). This version uses `StaticArrays` for the state vector, which is often more robust and performant for small ODEs on the GPU.
 
 ## Setup
 
-We first define the ODE and set up an `EnsembleProblem` that randomizes the decay rate for each trajectory.
+We first load the necessary packages, define the ODE using `StaticArrays`, and set up an `EnsembleProblem` that randomizes the decay rate for each trajectory.
 
 ```julia
 # Make sure you have the necessary packages installed
 # using Pkg
-# Pkg.add(["OrdinaryDiffEq", "DiffEqGPU", "CUDA", "Random", "Statistics", "Plots"])
+# Pkg.add(["OrdinaryDiffEq", "DiffEqGPU", "CUDA",  "StaticArrays", "Random", "Statistics", "Plots"])
 # # Depending on your system, you might need to configure CUDA.jl:
 # # import Pkg; Pkg.build("CUDA")
-using OrdinaryDiffEq, DiffEqGPU, CUDA, Random, Statistics, Plots
+using OrdinaryDiffEq, DiffEqGPU, CUDA, StaticArrays, Random, Statistics, Plots
 
 # Set a random seed for reproducibility
 Random.seed!(123)
 
-# Define the decay ODE: du/dt = -λ * u, with initial value u(0) = 1.
-decay(u, p, t) = -p * u
+# Define the decay ODE using the OUT-OF-PLACE form for StaticArrays:
+# f(u, p, t) should return a new SVector representing the derivative du/dt.
+# This form is generally preferred for StaticArrays on the GPU.
+function decay_static(u::SVector, p, t)
+    λ = p[1] # Parameter is expected as a scalar or single-element container
+    return @SVector [-λ * u[1]]
+end
+
+# Setup initial condition as a 1-element SVector (Static Array).
+# Using StaticArrays explicitly helps the GPU compiler generate efficient, static code.
+u0    = @SVector [1.0f0]
 
-# Setup initial condition and time span (using Float32 for GPU efficiency)
-u0    = 1.0f0
+# Define time span (using Float32 for GPU efficiency)
 tspan = (0.0f0, 5.0f0)
-base_param = 1.0f0
-prob = ODEProblem(decay, u0, tspan, base_param)
+
+# Define the base parameter (will be overridden by prob_func)
+# We wrap it in an SVector to match how parameters might be handled internally,
+# though a scalar Float32 often works too. Using SVector can sometimes avoid type issues.
+base_param = @SVector [1.0f0]
+
+# Create the ODEProblem using the static function and SVector initial condition
+prob = ODEProblem{false}(decay_static, u0, tspan, base_param) # Use {false} for out-of-place
 
 # Define a probability function that randomizes λ for each ensemble member.
 # Each trajectory's λ is sampled uniformly from [0.5, 1.5].
 prob_func = (prob, i, repeat) -> begin
-    new_λ = 0.5f0 + 1.0f0 * rand()
-    remake(prob, p = new_λ)
+    new_λ = 0.5f0 + 1.0f0 * rand(Float32)
+    remake(prob, p = @SVector [new_λ])
+
 end
 
 ensemble_prob = EnsembleProblem(prob, prob_func = prob_func, safetycopy = false)
@@ -40,68 +55,246 @@ ensemble_prob = EnsembleProblem(prob, prob_func = prob_func, safetycopy = false)
 
 # Solving on GPU and CPU
 
-Here we solve the ensemble problem on both GPU and CPU. We use 10,000 trajectories with a fixed time step to facilitate performance comparison.
+Here we solve the ensemble problem on both GPU and CPU. We use 10,000 trajectories with a fixed time step to facilitate performance comparison. For performance benchmarking, we initially solve without saving every step (save_everystep=false).
 
 ```julia
 # Number of trajectories
 num_trajectories = 10_000
 
-# Solve on GPU (check for CUDA availability)
-if CUDA.has_cuda()
-    @info "Running GPU simulation..."
+# --- GPU Simulation ---
+if CUDA.has_cuda() && CUDA.functional()
+    @info "Running GPU simulation (initial run for performance, includes compilation)..."
+    # Use EnsembleGPUKernel with the CUDABackend.
+    # GPUTsit5 is suitable for non-stiff ODEs on the GPU.
+    # save_everystep=false reduces memory overhead and transfer time if only final states are needed.
     gpu_sol = solve(ensemble_prob, GPUTsit5(), EnsembleGPUKernel(CUDA.CUDABackend());
-                    trajectories = num_trajectories, dt = 0.01f0, adaptive = false)
+                    trajectories = num_trajectories, 
+                    save_everystep=false, # Crucial for performance measurement
+                    dt = 0.01f0, adaptive = false)
 else
     @warn "CUDA not available. Skipping GPU simulation."
     gpu_sol = nothing
 end
 
-# Solve on CPU using multi-threading
-@info "Running CPU simulation..."
+# --- CPU Simulation ---
+@info "Running CPU simulation (initial run for performance, includes compilation)..."
+# Use EnsembleThreads for multi-threaded CPU execution.
+# Tsit5 is the CPU counterpart to GPUTsit5.
+# Match GPU saving options for fair comparison.
 cpu_sol = solve(ensemble_prob, Tsit5(), EnsembleThreads();
-                trajectories = num_trajectories, dt = 0.01f0, adaptive = false)
+                trajectories = num_trajectories, 
+                save_everystep=false, # Match GPU setting
+                dt = 0.01f0, adaptive = false)
 
 
 ```
 
 # Performance Comparison
 
-We measure the performance of each simulation. (Note: The first run may include compilation time.)
+We re-run the simulations using @time to get a cleaner measurement of the execution time, excluding the initial compilation overhead.
 
 ```julia
 
-# Warm-up (first run) for GPU if applicable
-if gpu_sol !== nothing
+# --- GPU Timing (Second Run) ---
+if gpu_sol_perf !== nothing
+    @info "Timing GPU simulation (second run, no data saving)..."
     @time solve(ensemble_prob, GPUTsit5(), EnsembleGPUKernel(CUDA.CUDABackend());
-                trajectories = num_trajectories, dt = 0.01f0, adaptive = false)
+                trajectories = num_trajectories, save_everystep=false,
+                dt = 0.01f0, adaptive = false)
+else
+    @info "Skipping GPU timing (CUDA not available)."
 end
 
+# --- CPU Timing (Second Run) ---
+@info "Timing CPU simulation (second run, no data saving)..."
 @time cpu_sol = solve(ensemble_prob, Tsit5(), EnsembleThreads();
-                      trajectories = num_trajectories, dt = 0.01f0, adaptive = false)
+                      trajectories = num_trajectories, save_everystep=false,
+                      dt = 0.01f0, adaptive = false)
+
+# Note: The first @time includes compilation and setup, the second is more representative
+# of the pure computation time for subsequent runs. Expect GPU to be significantly
+# faster for a large number of trajectories like 10,000.
+```
+
+# CPU Statistical Analysis and Visualization
+To visualize the evolution of the ensemble statistics (mean and standard deviation) over time using the *CPU results*, we need the solutions at multiple time points. We re-solve the problem on the CPU, this time saving the results at each step (save_everystep=true). We then process the results and plot them.
+
+```julia
+# Re-solve on CPU, saving all steps for plotting
+@info "Re-solving CPU simulation to collect data for plotting..."
+cpu_sol_plot = solve(ensemble_prob, Tsit5(), EnsembleThreads();
+                     trajectories = num_trajectories,
+                     save_everystep=true, # Save data at each dt step
+                     dt = 0.01f0,
+                     adaptive = false)
+                     
+# Extract time points from the first trajectory's solution (assuming all are same)
+t_vals_cpu = cpu_sol_plot[1].t
+num_times_cpu = length(t_vals_cpu)
+
+# Create a matrix to hold the results: rows=time, columns=trajectories
+# Initialize with NaN in case some trajectories fail
+ensemble_vals_cpu = fill(NaN32, num_times_cpu, num_trajectories) # Use Float32
+
+# Extract the state value (u[1]) from each trajectory at each time point
+for i in 1:num_trajectories
+    # Check if the trajectory simulation was successful and data looks valid
+    if cpu_sol_plot[i].retcode == ReturnCode.Success && length(cpu_sol_plot[i].u) == num_times_cpu
+        # sol.u is a Vector{SVector{1, Float32}}. We need the element from each SVector.
+        ensemble_vals_cpu[:, i] .= getindex.(cpu_sol_plot[i].u, 1)
+    else
+        @warn "CPU Trajectory $i failed or had unexpected length. Retcode: $(cpu_sol_plot[i].retcode). Length: $(length(cpu_sol_plot[i].u)). Skipping."
+        # Column remains NaN
+    end
+end
+
+# Filter out failed trajectories (columns with NaN)
+successful_traj_indices_cpu = findall(j -> !all(isnan, view(ensemble_vals_cpu, :, j)), 1:num_trajectories)
+num_successful_cpu = length(successful_traj_indices_cpu)
+
+if num_successful_cpu == 0
+    @error "No successful CPU trajectories to analyze!"
+else
+    if num_successful_cpu < num_trajectories
+        @warn "$(num_trajectories - num_successful_cpu) CPU trajectories failed. Analysis based on $num_successful_cpu trajectories."
+        ensemble_vals_cpu = ensemble_vals_cpu[:, successful_traj_indices_cpu] # Keep only successful ones
+    end
+
+    # Compute ensemble statistics over successful CPU trajectories
+    mean_u_cpu = mapslices(mean, ensemble_vals_cpu, dims=2)[:]
+    std_u_cpu  = mapslices(std, ensemble_vals_cpu, dims=2)[:]
+
+    # --- Plotting CPU Results ---
+    p1_cpu = plot(t_vals_cpu, mean_u_cpu, ribbon = std_u_cpu, xlabel = "Time (t)", ylabel = "u(t)",
+                  title = "CPU Ensemble Mean ±1σ ($num_successful_cpu Trajectories)",
+                  label = "Mean u(t)", fillalpha = 0.3, lw=2)
+   
+
+    final_vals_cpu = ensemble_vals_cpu[end, :]
+    p2_cpu = histogram(final_vals_cpu, bins = 30, normalize=:probability,
+                       xlabel = "Final u(T)", ylabel = "Probability Density",
+                       title = "CPU Distribution of Final Values (t=$(tspan[2]))",
+                       label = "", legend=false)
+
+    plot_cpu = plot(p1_cpu, p2_cpu, layout = (1,2), size = (1000, 450), legend=:outertopright)
+    @info "Displaying CPU analysis plot..."
+    display(plot_cpu)
+end
 ```
 
-# Statistical Analysis and Visualization
-We analyze the ensemble by computing the mean and standard deviation of u(t)u(t) across trajectories, and then visualize the results.
+# GPU Statistical Analysis and Visualization
+Similarly, we can analyze the results from the *GPU simulation*. This requires re-running the simulation to save the time series data and then transferring the data from the GPU memory to the CPU RAM for analysis and plotting using standard tools. Note that this data transfer can be a significant bottleneck for large numbers of trajectories or time steps.
 
 ```julia
+# Check if GPU simulation was successful initially before proceeding
+gpu_analysis_plot = nothing # Initialize plot variable
+if gpu_sol_perf !== nothing && CUDA.has_cuda() && CUDA.functional()
+    @info "Re-solving GPU simulation to collect data for plotting..."
+    # Add @time to see the impact of saving data
+    @time gpu_sol_plot = solve(ensemble_prob, GPUTsit5(), EnsembleGPUKernel(CUDA.CUDABackend());
+                           trajectories = num_trajectories,
+                           save_everystep=true, # <<-- Save data at each dt step on GPU
+                           dt = 0.01f0,
+                           adaptive = false)
+
+    # --- Data Transfer and Analysis ---
+    # The result gpu_sol_plot should be an EnsembleSolution containing a Vector{ODESolution}
+    # Accessing it might implicitly transfer, or we can use Array()
+    @info "Transferring GPU solution objects (if needed) and processing..."
+    # Let's try accessing .u directly first, assuming it holds the Vector{ODESolution}
+    # If this fails, we might try Array(gpu_sol_plot) -> Vector{ODESolution}
+    solutions_vector = gpu_sol_plot.u
+
+    # Check if the transfer actually happened / if we have the right type
+    if !(solutions_vector isa AbstractVector{<:ODESolution})
+        @warn "gpu_sol_plot.u is not a Vector{ODESolution}. Trying Array(gpu_sol_plot)..."
+        # This might explicitly trigger the transfer and construction of ODESolution objects on CPU
+        # Note: This might be slow/memory intensive!
+        @time solutions_vector = Array(gpu_sol_plot)
+        if !(solutions_vector isa AbstractVector{<:ODESolution})
+             @error "Could not obtain Vector{ODESolution} from GPU result. Type is $(typeof(solutions_vector)). Aborting GPU analysis."
+             solutions_vector = nothing # Mark as failed
+        end
+    end
+
+    if solutions_vector !== nothing
+        # Extract time points from the first successful trajectory's solution
+        first_successful_gpu_idx = findfirst(sol -> sol.retcode == ReturnCode.Success, solutions_vector)
+
+        if first_successful_gpu_idx === nothing
+            @error "No successful GPU trajectories found in the returned solutions vector!"
+        else
+            t_vals_gpu = solutions_vector[first_successful_gpu_idx].t
+            num_times_gpu = length(t_vals_gpu)
 
-# Assuming all solutions have the same time points (fixed dt & saveat)
-t_vals = cpu_sol[1].t
-num_times = length(t_vals)
-ensemble_vals = reduce(hcat, [sol.u for sol in cpu_sol])  # each column corresponds to one trajectory
+            # Create a matrix to hold the results from GPU (now on CPU)
+            ensemble_vals_gpu = fill(NaN32, num_times_gpu, num_trajectories) # Use Float32
 
-# Compute ensemble statistics
-mean_u = [mean(ensemble_vals[i, :]) for i in 1:num_times]
-std_u  = [std(ensemble_vals[i, :]) for i in 1:num_times]
+            # Extract the state value (u[1]) from each trajectory
+            num_processed = 0
+            for i in 1:num_trajectories
+                sol = solutions_vector[i] # Access the i-th ODESolution
+                if sol.retcode == ReturnCode.Success
+                   # Check consistency of time points (optional but good)
+                   if length(sol.t) == num_times_gpu # && sol.t == t_vals_gpu (can be slow check)
+                        # sol.u is likely Vector{SVector{1, Float32}} after transfer
+                        ensemble_vals_gpu[:, i] .= getindex.(sol.u, 1)
+                        num_processed += 1
+                   else
+                       @warn "GPU Trajectory $i succeeded but time points mismatch (Expected $(num_times_gpu), Got $(length(sol.t))). Skipping."
+                       # Column remains NaN
+                   end
+                else
+                    # @warn "GPU Trajectory $i failed with retcode: $(sol.retcode). Skipping." # Potentially verbose
+                    # Column remains NaN
+                end
+            end
+            @info "Processed $num_processed successful GPU trajectories."
 
-# Plot the mean trajectory with ±1 standard deviation
-p1 = plot(t_vals, mean_u, ribbon = std_u, xlabel = "Time", ylabel = "u(t)",
-          title = "Ensemble Mean and ±1σ", label = "Mean ± σ", legend = :topright)
+            # Filter out failed trajectories (columns with NaN)
+            successful_traj_indices_gpu = findall(j -> !all(isnan, view(ensemble_vals_gpu, :, j)), 1:num_trajectories)
+            num_successful_gpu = length(successful_traj_indices_gpu)
+
+            if num_successful_gpu == 0
+                @error "No successful GPU trajectories suitable for analysis after processing!"
+            else
+                if num_successful_gpu < num_trajectories
+                    # This count includes those skipped due to time mismatch or failure
+                    @warn "Analysis based on $num_successful_gpu trajectories (out of $num_trajectories initial)."
+                    # Keep only successful, valid ones
+                    ensemble_vals_gpu = ensemble_vals_gpu[:, successful_traj_indices_gpu]
+                end
+
+                # Compute ensemble statistics over successful GPU trajectories
+                mean_u_gpu = mapslices(mean, ensemble_vals_gpu, dims=2)[:]
+                std_u_gpu  = mapslices(std, ensemble_vals_gpu, dims=2)[:]
+
+                # --- Plotting GPU Results ---
+                p1_gpu = plot(t_vals_gpu, mean_u_gpu, ribbon = std_u_gpu, xlabel = "Time (t)", ylabel = "u(t)",
+                              title = "GPU Ensemble Mean ±1σ ($num_successful_gpu Trajectories)",
+                              label = "Mean u(t)", fillalpha = 0.3, lw=2)
+
+                final_vals_gpu = ensemble_vals_gpu[end, :]
+                p2_gpu = histogram(final_vals_gpu, bins = 30, normalize=:probability,
+                                   xlabel = "Final u(T)", ylabel = "Probability Density",
+                                   title = "GPU Distribution of Final Values (t=$(tspan[2]))",
+                                   label = "", legend=false)
+
+                gpu_analysis_plot = plot(p1_gpu, p2_gpu, layout = (1,2), size = (1000, 450), legend=:outertopright)
+                @info "Displaying GPU analysis plot..."
+                display(gpu_analysis_plot)
+
+                # Cleanup large structures if memory is a concern
+                 ensemble_vals_gpu = nothing
+                 solutions_vector = nothing
+                # gc()
+            end
+        end
+    end # End if solutions_vector !== nothing
+else
+    @warn "Skipping GPU analysis section because initial GPU performance run failed or CUDA is unavailable."
+end
 
-# Histogram of final values (u at t=5)
-final_vals = ensemble_vals[end, :]
-p2 = histogram(final_vals, bins = 30, xlabel = "Final u", ylabel = "Frequency",
-               title = "Distribution of Final Values", label = "")
-plot(p1, p2, layout = (1,2), size = (900,400))
 
+@info "Tutorial Finished."
 

From 59805b77cb740503269b6ce2b7b17349ccdbe226 Mon Sep 17 00:00:00 2001
From: Christopher Rackauckas <accounts@chrisrackauckas.com>
Date: Fri, 25 Apr 2025 17:07:45 -0400
Subject: [PATCH 02/12] Update docs/src/examples/gpu_ensemble_random_decay.md

---
 docs/src/examples/gpu_ensemble_random_decay.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/src/examples/gpu_ensemble_random_decay.md b/docs/src/examples/gpu_ensemble_random_decay.md
index 5f8257f6..d4e25235 100644
--- a/docs/src/examples/gpu_ensemble_random_decay.md
+++ b/docs/src/examples/gpu_ensemble_random_decay.md
@@ -8,7 +8,7 @@ with the twist that each trajectory uses a random decay rate \(\lambda\) sampled
 
 We first load the necessary packages, define the ODE using `StaticArrays`, and set up an `EnsembleProblem` that randomizes the decay rate for each trajectory.
 
-```julia
+```@example decay
 # Make sure you have the necessary packages installed
 # using Pkg
 # Pkg.add(["OrdinaryDiffEq", "DiffEqGPU", "CUDA",  "StaticArrays", "Random", "Statistics", "Plots"])

From 8b7f00b64fc997e6914afd4d786985da78f8b78d Mon Sep 17 00:00:00 2001
From: Christopher Rackauckas <accounts@chrisrackauckas.com>
Date: Fri, 25 Apr 2025 17:07:50 -0400
Subject: [PATCH 03/12] Update docs/src/examples/gpu_ensemble_random_decay.md

---
 docs/src/examples/gpu_ensemble_random_decay.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/src/examples/gpu_ensemble_random_decay.md b/docs/src/examples/gpu_ensemble_random_decay.md
index d4e25235..1f4a6488 100644
--- a/docs/src/examples/gpu_ensemble_random_decay.md
+++ b/docs/src/examples/gpu_ensemble_random_decay.md
@@ -57,7 +57,7 @@ ensemble_prob = EnsembleProblem(prob, prob_func = prob_func, safetycopy = false)
 
 Here we solve the ensemble problem on both GPU and CPU. We use 10,000 trajectories with a fixed time step to facilitate performance comparison. For performance benchmarking, we initially solve without saving every step (save_everystep=false).
 
-```julia
+```@example decay
 # Number of trajectories
 num_trajectories = 10_000
 

From 3a5e11a8e67857107703bd5768b28568870e1a47 Mon Sep 17 00:00:00 2001
From: Christopher Rackauckas <accounts@chrisrackauckas.com>
Date: Fri, 25 Apr 2025 17:07:57 -0400
Subject: [PATCH 04/12] Update docs/src/examples/gpu_ensemble_random_decay.md

---
 docs/src/examples/gpu_ensemble_random_decay.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/src/examples/gpu_ensemble_random_decay.md b/docs/src/examples/gpu_ensemble_random_decay.md
index 1f4a6488..20f78eef 100644
--- a/docs/src/examples/gpu_ensemble_random_decay.md
+++ b/docs/src/examples/gpu_ensemble_random_decay.md
@@ -185,7 +185,7 @@ end
 # GPU Statistical Analysis and Visualization
 Similarly, we can analyze the results from the *GPU simulation*. This requires re-running the simulation to save the time series data and then transferring the data from the GPU memory to the CPU RAM for analysis and plotting using standard tools. Note that this data transfer can be a significant bottleneck for large numbers of trajectories or time steps.
 
-```julia
+```@example decay
 # Check if GPU simulation was successful initially before proceeding
 gpu_analysis_plot = nothing # Initialize plot variable
 if gpu_sol_perf !== nothing && CUDA.has_cuda() && CUDA.functional()

From e213c5fc2a707f43f8315ad3559f75ac578e8a8e Mon Sep 17 00:00:00 2001
From: Christopher Rackauckas <accounts@chrisrackauckas.com>
Date: Fri, 25 Apr 2025 17:08:03 -0400
Subject: [PATCH 05/12] Update docs/src/examples/gpu_ensemble_random_decay.md

---
 docs/src/examples/gpu_ensemble_random_decay.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/src/examples/gpu_ensemble_random_decay.md b/docs/src/examples/gpu_ensemble_random_decay.md
index 20f78eef..9371e853 100644
--- a/docs/src/examples/gpu_ensemble_random_decay.md
+++ b/docs/src/examples/gpu_ensemble_random_decay.md
@@ -119,7 +119,7 @@ end
 # CPU Statistical Analysis and Visualization
 To visualize the evolution of the ensemble statistics (mean and standard deviation) over time using the *CPU results*, we need the solutions at multiple time points. We re-solve the problem on the CPU, this time saving the results at each step (save_everystep=true). We then process the results and plot them.
 
-```julia
+```@example decay
 # Re-solve on CPU, saving all steps for plotting
 @info "Re-solving CPU simulation to collect data for plotting..."
 cpu_sol_plot = solve(ensemble_prob, Tsit5(), EnsembleThreads();

From 54a75515f9af9e11ceb495d792a84a1bebc8b932 Mon Sep 17 00:00:00 2001
From: Christopher Rackauckas <accounts@chrisrackauckas.com>
Date: Fri, 25 Apr 2025 17:08:09 -0400
Subject: [PATCH 06/12] Update docs/src/examples/gpu_ensemble_random_decay.md

---
 docs/src/examples/gpu_ensemble_random_decay.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/src/examples/gpu_ensemble_random_decay.md b/docs/src/examples/gpu_ensemble_random_decay.md
index 9371e853..043a9db6 100644
--- a/docs/src/examples/gpu_ensemble_random_decay.md
+++ b/docs/src/examples/gpu_ensemble_random_decay.md
@@ -295,6 +295,5 @@ else
     @warn "Skipping GPU analysis section because initial GPU performance run failed or CUDA is unavailable."
 end
 
-
 @info "Tutorial Finished."
 

From 0a5c24c571515ef00cd85c7d434e7df7d7c6115b Mon Sep 17 00:00:00 2001
From: Christopher Rackauckas <accounts@chrisrackauckas.com>
Date: Fri, 25 Apr 2025 17:08:16 -0400
Subject: [PATCH 07/12] Update docs/src/examples/gpu_ensemble_random_decay.md

---
 docs/src/examples/gpu_ensemble_random_decay.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/src/examples/gpu_ensemble_random_decay.md b/docs/src/examples/gpu_ensemble_random_decay.md
index 043a9db6..d175f3c5 100644
--- a/docs/src/examples/gpu_ensemble_random_decay.md
+++ b/docs/src/examples/gpu_ensemble_random_decay.md
@@ -295,5 +295,4 @@ else
     @warn "Skipping GPU analysis section because initial GPU performance run failed or CUDA is unavailable."
 end
 
-@info "Tutorial Finished."
 

From b852325a212218c5dfc3b92425681bc48aef3365 Mon Sep 17 00:00:00 2001
From: Christopher Rackauckas <accounts@chrisrackauckas.com>
Date: Fri, 25 Apr 2025 17:09:01 -0400
Subject: [PATCH 08/12] Update docs/src/examples/gpu_ensemble_random_decay.md

---
 docs/src/examples/gpu_ensemble_random_decay.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/src/examples/gpu_ensemble_random_decay.md b/docs/src/examples/gpu_ensemble_random_decay.md
index d175f3c5..3590fecf 100644
--- a/docs/src/examples/gpu_ensemble_random_decay.md
+++ b/docs/src/examples/gpu_ensemble_random_decay.md
@@ -93,7 +93,7 @@ cpu_sol = solve(ensemble_prob, Tsit5(), EnsembleThreads();
 
 We re-run the simulations using @time to get a cleaner measurement of the execution time, excluding the initial compilation overhead.
 
-```julia
+```@example decay
 
 # --- GPU Timing (Second Run) ---
 if gpu_sol_perf !== nothing

From 4dfe7ac7dd3d0d2b3fc1930eb1785751f9db3f44 Mon Sep 17 00:00:00 2001
From: Christopher Rackauckas <accounts@chrisrackauckas.com>
Date: Fri, 25 Apr 2025 17:09:25 -0400
Subject: [PATCH 09/12] Update gpu_ensemble_random_decay.md

---
 docs/src/examples/gpu_ensemble_random_decay.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/src/examples/gpu_ensemble_random_decay.md b/docs/src/examples/gpu_ensemble_random_decay.md
index 3590fecf..404e5e2a 100644
--- a/docs/src/examples/gpu_ensemble_random_decay.md
+++ b/docs/src/examples/gpu_ensemble_random_decay.md
@@ -294,5 +294,5 @@ if gpu_sol_perf !== nothing && CUDA.has_cuda() && CUDA.functional()
 else
     @warn "Skipping GPU analysis section because initial GPU performance run failed or CUDA is unavailable."
 end
-
+```
 

From cd8e3402738c2b94d6234fcc9eedf3101f5feaeb Mon Sep 17 00:00:00 2001
From: ParyaRoustaee <ParyaRoustaee>
Date: Fri, 25 Apr 2025 14:42:30 -0700
Subject: [PATCH 10/12] a line was accidentally removed before which is added
 back

---
 docs/src/examples/gpu_ensemble_random_decay.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/src/examples/gpu_ensemble_random_decay.md b/docs/src/examples/gpu_ensemble_random_decay.md
index 404e5e2a..ce9c2a60 100644
--- a/docs/src/examples/gpu_ensemble_random_decay.md
+++ b/docs/src/examples/gpu_ensemble_random_decay.md
@@ -62,6 +62,7 @@ Here we solve the ensemble problem on both GPU and CPU. We use 10,000 trajectori
 num_trajectories = 10_000
 
 # --- GPU Simulation ---
+gpu_sol_perf = nothing # Initialize variable for performance run
 if CUDA.has_cuda() && CUDA.functional()
     @info "Running GPU simulation (initial run for performance, includes compilation)..."
     # Use EnsembleGPUKernel with the CUDABackend.

From b61b85bf56c7e1277d2ac45e459e7ef54705a6a7 Mon Sep 17 00:00:00 2001
From: ParyaRoustaee <ParyaRoustaee>
Date: Fri, 25 Apr 2025 17:51:29 -0700
Subject: [PATCH 11/12] applying JuliaFormatter

---
 .../src/examples/gpu_ensemble_random_decay.md | 125 +++++++++---------
 src/algorithms.jl                             |   3 +-
 2 files changed, 68 insertions(+), 60 deletions(-)

diff --git a/docs/src/examples/gpu_ensemble_random_decay.md b/docs/src/examples/gpu_ensemble_random_decay.md
index ce9c2a60..b195faf6 100644
--- a/docs/src/examples/gpu_ensemble_random_decay.md
+++ b/docs/src/examples/gpu_ensemble_random_decay.md
@@ -29,7 +29,7 @@ end
 
 # Setup initial condition as a 1-element SVector (Static Array).
 # Using StaticArrays explicitly helps the GPU compiler generate efficient, static code.
-u0    = @SVector [1.0f0]
+u0 = @SVector [1.0f0]
 
 # Define time span (using Float32 for GPU efficiency)
 tspan = (0.0f0, 5.0f0)
@@ -47,7 +47,6 @@ prob = ODEProblem{false}(decay_static, u0, tspan, base_param) # Use {false} for
 prob_func = (prob, i, repeat) -> begin
     new_λ = 0.5f0 + 1.0f0 * rand(Float32)
     remake(prob, p = @SVector [new_λ])
-
 end
 
 ensemble_prob = EnsembleProblem(prob, prob_func = prob_func, safetycopy = false)
@@ -69,9 +68,9 @@ if CUDA.has_cuda() && CUDA.functional()
     # GPUTsit5 is suitable for non-stiff ODEs on the GPU.
     # save_everystep=false reduces memory overhead and transfer time if only final states are needed.
     gpu_sol = solve(ensemble_prob, GPUTsit5(), EnsembleGPUKernel(CUDA.CUDABackend());
-                    trajectories = num_trajectories, 
-                    save_everystep=false, # Crucial for performance measurement
-                    dt = 0.01f0, adaptive = false)
+        trajectories = num_trajectories,
+        save_everystep = false, # Crucial for performance measurement
+        dt = 0.01f0, adaptive = false)
 else
     @warn "CUDA not available. Skipping GPU simulation."
     gpu_sol = nothing
@@ -83,10 +82,9 @@ end
 # Tsit5 is the CPU counterpart to GPUTsit5.
 # Match GPU saving options for fair comparison.
 cpu_sol = solve(ensemble_prob, Tsit5(), EnsembleThreads();
-                trajectories = num_trajectories, 
-                save_everystep=false, # Match GPU setting
-                dt = 0.01f0, adaptive = false)
-
+    trajectories = num_trajectories,
+    save_everystep = false, # Match GPU setting
+    dt = 0.01f0, adaptive = false)
 
 ```
 
@@ -100,8 +98,8 @@ We re-run the simulations using @time to get a cleaner measurement of the execut
 if gpu_sol_perf !== nothing
     @info "Timing GPU simulation (second run, no data saving)..."
     @time solve(ensemble_prob, GPUTsit5(), EnsembleGPUKernel(CUDA.CUDABackend());
-                trajectories = num_trajectories, save_everystep=false,
-                dt = 0.01f0, adaptive = false)
+        trajectories = num_trajectories, save_everystep = false,
+        dt = 0.01f0, adaptive = false)
 else
     @info "Skipping GPU timing (CUDA not available)."
 end
@@ -109,8 +107,8 @@ end
 # --- CPU Timing (Second Run) ---
 @info "Timing CPU simulation (second run, no data saving)..."
 @time cpu_sol = solve(ensemble_prob, Tsit5(), EnsembleThreads();
-                      trajectories = num_trajectories, save_everystep=false,
-                      dt = 0.01f0, adaptive = false)
+    trajectories = num_trajectories, save_everystep = false,
+    dt = 0.01f0, adaptive = false)
 
 # Note: The first @time includes compilation and setup, the second is more representative
 # of the pure computation time for subsequent runs. Expect GPU to be significantly
@@ -118,17 +116,18 @@ end
 ```
 
 # CPU Statistical Analysis and Visualization
+
 To visualize the evolution of the ensemble statistics (mean and standard deviation) over time using the *CPU results*, we need the solutions at multiple time points. We re-solve the problem on the CPU, this time saving the results at each step (save_everystep=true). We then process the results and plot them.
 
 ```@example decay
 # Re-solve on CPU, saving all steps for plotting
 @info "Re-solving CPU simulation to collect data for plotting..."
 cpu_sol_plot = solve(ensemble_prob, Tsit5(), EnsembleThreads();
-                     trajectories = num_trajectories,
-                     save_everystep=true, # Save data at each dt step
-                     dt = 0.01f0,
-                     adaptive = false)
-                     
+    trajectories = num_trajectories,
+    save_everystep = true, # Save data at each dt step
+    dt = 0.01f0,
+    adaptive = false)
+
 # Extract time points from the first trajectory's solution (assuming all are same)
 t_vals_cpu = cpu_sol_plot[1].t
 num_times_cpu = length(t_vals_cpu)
@@ -140,7 +139,8 @@ ensemble_vals_cpu = fill(NaN32, num_times_cpu, num_trajectories) # Use Float32
 # Extract the state value (u[1]) from each trajectory at each time point
 for i in 1:num_trajectories
     # Check if the trajectory simulation was successful and data looks valid
-    if cpu_sol_plot[i].retcode == ReturnCode.Success && length(cpu_sol_plot[i].u) == num_times_cpu
+    if cpu_sol_plot[i].retcode == ReturnCode.Success &&
+       length(cpu_sol_plot[i].u) == num_times_cpu
         # sol.u is a Vector{SVector{1, Float32}}. We need the element from each SVector.
         ensemble_vals_cpu[:, i] .= getindex.(cpu_sol_plot[i].u, 1)
     else
@@ -150,7 +150,8 @@ for i in 1:num_trajectories
 end
 
 # Filter out failed trajectories (columns with NaN)
-successful_traj_indices_cpu = findall(j -> !all(isnan, view(ensemble_vals_cpu, :, j)), 1:num_trajectories)
+successful_traj_indices_cpu = findall(
+    j -> !all(isnan, view(ensemble_vals_cpu, :, j)), 1:num_trajectories)
 num_successful_cpu = length(successful_traj_indices_cpu)
 
 if num_successful_cpu == 0
@@ -162,28 +163,30 @@ else
     end
 
     # Compute ensemble statistics over successful CPU trajectories
-    mean_u_cpu = mapslices(mean, ensemble_vals_cpu, dims=2)[:]
-    std_u_cpu  = mapslices(std, ensemble_vals_cpu, dims=2)[:]
+    mean_u_cpu = mapslices(mean, ensemble_vals_cpu, dims = 2)[:]
+    std_u_cpu = mapslices(std, ensemble_vals_cpu, dims = 2)[:]
 
     # --- Plotting CPU Results ---
-    p1_cpu = plot(t_vals_cpu, mean_u_cpu, ribbon = std_u_cpu, xlabel = "Time (t)", ylabel = "u(t)",
-                  title = "CPU Ensemble Mean ±1σ ($num_successful_cpu Trajectories)",
-                  label = "Mean u(t)", fillalpha = 0.3, lw=2)
-   
+    p1_cpu = plot(
+        t_vals_cpu, mean_u_cpu, ribbon = std_u_cpu, xlabel = "Time (t)", ylabel = "u(t)",
+        title = "CPU Ensemble Mean ±1σ ($num_successful_cpu Trajectories)",
+        label = "Mean u(t)", fillalpha = 0.3, lw = 2)
 
     final_vals_cpu = ensemble_vals_cpu[end, :]
-    p2_cpu = histogram(final_vals_cpu, bins = 30, normalize=:probability,
-                       xlabel = "Final u(T)", ylabel = "Probability Density",
-                       title = "CPU Distribution of Final Values (t=$(tspan[2]))",
-                       label = "", legend=false)
+    p2_cpu = histogram(final_vals_cpu, bins = 30, normalize = :probability,
+        xlabel = "Final u(T)", ylabel = "Probability Density",
+        title = "CPU Distribution of Final Values (t=$(tspan[2]))",
+        label = "", legend = false)
 
-    plot_cpu = plot(p1_cpu, p2_cpu, layout = (1,2), size = (1000, 450), legend=:outertopright)
+    plot_cpu = plot(
+        p1_cpu, p2_cpu, layout = (1, 2), size = (1000, 450), legend = :outertopright)
     @info "Displaying CPU analysis plot..."
     display(plot_cpu)
 end
 ```
 
 # GPU Statistical Analysis and Visualization
+
 Similarly, we can analyze the results from the *GPU simulation*. This requires re-running the simulation to save the time series data and then transferring the data from the GPU memory to the CPU RAM for analysis and plotting using standard tools. Note that this data transfer can be a significant bottleneck for large numbers of trajectories or time steps.
 
 ```@example decay
@@ -192,11 +195,12 @@ gpu_analysis_plot = nothing # Initialize plot variable
 if gpu_sol_perf !== nothing && CUDA.has_cuda() && CUDA.functional()
     @info "Re-solving GPU simulation to collect data for plotting..."
     # Add @time to see the impact of saving data
-    @time gpu_sol_plot = solve(ensemble_prob, GPUTsit5(), EnsembleGPUKernel(CUDA.CUDABackend());
-                           trajectories = num_trajectories,
-                           save_everystep=true, # <<-- Save data at each dt step on GPU
-                           dt = 0.01f0,
-                           adaptive = false)
+    @time gpu_sol_plot = solve(
+        ensemble_prob, GPUTsit5(), EnsembleGPUKernel(CUDA.CUDABackend());
+        trajectories = num_trajectories,
+        save_everystep = true, # <<-- Save data at each dt step on GPU
+        dt = 0.01f0,
+        adaptive = false)
 
     # --- Data Transfer and Analysis ---
     # The result gpu_sol_plot should be an EnsembleSolution containing a Vector{ODESolution}
@@ -213,14 +217,15 @@ if gpu_sol_perf !== nothing && CUDA.has_cuda() && CUDA.functional()
         # Note: This might be slow/memory intensive!
         @time solutions_vector = Array(gpu_sol_plot)
         if !(solutions_vector isa AbstractVector{<:ODESolution})
-             @error "Could not obtain Vector{ODESolution} from GPU result. Type is $(typeof(solutions_vector)). Aborting GPU analysis."
-             solutions_vector = nothing # Mark as failed
+            @error "Could not obtain Vector{ODESolution} from GPU result. Type is $(typeof(solutions_vector)). Aborting GPU analysis."
+            solutions_vector = nothing # Mark as failed
         end
     end
 
     if solutions_vector !== nothing
         # Extract time points from the first successful trajectory's solution
-        first_successful_gpu_idx = findfirst(sol -> sol.retcode == ReturnCode.Success, solutions_vector)
+        first_successful_gpu_idx = findfirst(
+            sol -> sol.retcode == ReturnCode.Success, solutions_vector)
 
         if first_successful_gpu_idx === nothing
             @error "No successful GPU trajectories found in the returned solutions vector!"
@@ -236,15 +241,15 @@ if gpu_sol_perf !== nothing && CUDA.has_cuda() && CUDA.functional()
             for i in 1:num_trajectories
                 sol = solutions_vector[i] # Access the i-th ODESolution
                 if sol.retcode == ReturnCode.Success
-                   # Check consistency of time points (optional but good)
-                   if length(sol.t) == num_times_gpu # && sol.t == t_vals_gpu (can be slow check)
+                    # Check consistency of time points (optional but good)
+                    if length(sol.t) == num_times_gpu # && sol.t == t_vals_gpu (can be slow check)
                         # sol.u is likely Vector{SVector{1, Float32}} after transfer
                         ensemble_vals_gpu[:, i] .= getindex.(sol.u, 1)
                         num_processed += 1
-                   else
-                       @warn "GPU Trajectory $i succeeded but time points mismatch (Expected $(num_times_gpu), Got $(length(sol.t))). Skipping."
-                       # Column remains NaN
-                   end
+                    else
+                        @warn "GPU Trajectory $i succeeded but time points mismatch (Expected $(num_times_gpu), Got $(length(sol.t))). Skipping."
+                        # Column remains NaN
+                    end
                 else
                     # @warn "GPU Trajectory $i failed with retcode: $(sol.retcode). Skipping." # Potentially verbose
                     # Column remains NaN
@@ -253,7 +258,8 @@ if gpu_sol_perf !== nothing && CUDA.has_cuda() && CUDA.functional()
             @info "Processed $num_processed successful GPU trajectories."
 
             # Filter out failed trajectories (columns with NaN)
-            successful_traj_indices_gpu = findall(j -> !all(isnan, view(ensemble_vals_gpu, :, j)), 1:num_trajectories)
+            successful_traj_indices_gpu = findall(
+                j -> !all(isnan, view(ensemble_vals_gpu, :, j)), 1:num_trajectories)
             num_successful_gpu = length(successful_traj_indices_gpu)
 
             if num_successful_gpu == 0
@@ -267,27 +273,29 @@ if gpu_sol_perf !== nothing && CUDA.has_cuda() && CUDA.functional()
                 end
 
                 # Compute ensemble statistics over successful GPU trajectories
-                mean_u_gpu = mapslices(mean, ensemble_vals_gpu, dims=2)[:]
-                std_u_gpu  = mapslices(std, ensemble_vals_gpu, dims=2)[:]
+                mean_u_gpu = mapslices(mean, ensemble_vals_gpu, dims = 2)[:]
+                std_u_gpu = mapslices(std, ensemble_vals_gpu, dims = 2)[:]
 
                 # --- Plotting GPU Results ---
-                p1_gpu = plot(t_vals_gpu, mean_u_gpu, ribbon = std_u_gpu, xlabel = "Time (t)", ylabel = "u(t)",
-                              title = "GPU Ensemble Mean ±1σ ($num_successful_gpu Trajectories)",
-                              label = "Mean u(t)", fillalpha = 0.3, lw=2)
+                p1_gpu = plot(t_vals_gpu, mean_u_gpu, ribbon = std_u_gpu,
+                    xlabel = "Time (t)", ylabel = "u(t)",
+                    title = "GPU Ensemble Mean ±1σ ($num_successful_gpu Trajectories)",
+                    label = "Mean u(t)", fillalpha = 0.3, lw = 2)
 
                 final_vals_gpu = ensemble_vals_gpu[end, :]
-                p2_gpu = histogram(final_vals_gpu, bins = 30, normalize=:probability,
-                                   xlabel = "Final u(T)", ylabel = "Probability Density",
-                                   title = "GPU Distribution of Final Values (t=$(tspan[2]))",
-                                   label = "", legend=false)
+                p2_gpu = histogram(final_vals_gpu, bins = 30, normalize = :probability,
+                    xlabel = "Final u(T)", ylabel = "Probability Density",
+                    title = "GPU Distribution of Final Values (t=$(tspan[2]))",
+                    label = "", legend = false)
 
-                gpu_analysis_plot = plot(p1_gpu, p2_gpu, layout = (1,2), size = (1000, 450), legend=:outertopright)
+                gpu_analysis_plot = plot(p1_gpu, p2_gpu, layout = (1, 2),
+                    size = (1000, 450), legend = :outertopright)
                 @info "Displaying GPU analysis plot..."
                 display(gpu_analysis_plot)
 
                 # Cleanup large structures if memory is a concern
-                 ensemble_vals_gpu = nothing
-                 solutions_vector = nothing
+                ensemble_vals_gpu = nothing
+                solutions_vector = nothing
                 # gc()
             end
         end
@@ -296,4 +304,3 @@ else
     @warn "Skipping GPU analysis section because initial GPU performance run failed or CUDA is unavailable."
 end
 ```
-
diff --git a/src/algorithms.jl b/src/algorithms.jl
index 0df49549..02ffc542 100644
--- a/src/algorithms.jl
+++ b/src/algorithms.jl
@@ -146,7 +146,8 @@ prob = ODEProblem{false}(lorenz, u0, tspan, p)
 prob_func = (prob, i, repeat) -> remake(prob, p = (@SVector rand(Float32, 3)) .* p)
 monteprob = EnsembleProblem(prob, prob_func = prob_func, safetycopy = false)
 
-@time sol = solve(monteprob, GPUTsit5(), EnsembleGPUKernel(CUDA.CUDABackend()), trajectories = 10_000,
+@time sol = solve(
+    monteprob, GPUTsit5(), EnsembleGPUKernel(CUDA.CUDABackend()), trajectories = 10_000,
     adaptive = false, dt = 0.1f0)
 ```
 """

From b2f7018233896c9e80bc20263a9d56e5e82518b9 Mon Sep 17 00:00:00 2001
From: ParyaRoustaee <ParyaRoustaee>
Date: Fri, 25 Apr 2025 18:06:11 -0700
Subject: [PATCH 12/12] manual formating updates

---
 docs/src/examples/gpu_ensemble_random_decay.md | 2 --
 1 file changed, 2 deletions(-)

diff --git a/docs/src/examples/gpu_ensemble_random_decay.md b/docs/src/examples/gpu_ensemble_random_decay.md
index b195faf6..4ee64218 100644
--- a/docs/src/examples/gpu_ensemble_random_decay.md
+++ b/docs/src/examples/gpu_ensemble_random_decay.md
@@ -85,7 +85,6 @@ cpu_sol = solve(ensemble_prob, Tsit5(), EnsembleThreads();
     trajectories = num_trajectories,
     save_everystep = false, # Match GPU setting
     dt = 0.01f0, adaptive = false)
-
 ```
 
 # Performance Comparison
@@ -93,7 +92,6 @@ cpu_sol = solve(ensemble_prob, Tsit5(), EnsembleThreads();
 We re-run the simulations using @time to get a cleaner measurement of the execution time, excluding the initial compilation overhead.
 
 ```@example decay
-
 # --- GPU Timing (Second Run) ---
 if gpu_sol_perf !== nothing
     @info "Timing GPU simulation (second run, no data saving)..."