117 changes: 24 additions & 93 deletions src/PSOGPU.jl
@@ -6,120 +6,51 @@ import DiffEqGPU: GPUTsit5, vectorized_asolve, make_prob_compatible

## Use lb and ub either as StaticArray or pass them separately as CuArrays
## Passing as CuArrays makes more sense, or maybe SArray? Then decide based on the number of dimensions
struct PSOParticle{T1, T2 <: eltype(T1)}
struct SPSOParticle{T1, T2 <: eltype(T1)}
position::T1
velocity::T1
cost::T2
best_position::T1
best_cost::T2
end

struct PSOGBest{T1, T2 <: eltype(T1)}
struct SPSOGBest{T1, T2 <: eltype(T1)}
position::T1
cost::T2
end

struct ParallelPSOKernel{Backend}
num_particles::Int
async::Bool
threaded::Bool
backend::Backend
end
struct ParallelSyncPSO{Backend}
num_particles::Int
backend::Backend
end

function ParallelPSOKernel(num_particles::Int;
async = false, threaded = false, backend = CPU())
ParallelPSOKernel(num_particles, async, threaded, backend)
end

SciMLBase.allowsbounds(::ParallelPSOKernel) = true
SciMLBase.allowsbounds(::ParallelSyncPSO) = true
# SciMLBase.requiresbounds(::ParallelPSOKernel) = true

include("./pso_cpu.jl")
include("./pso_gpu.jl")
include("./pso_async_gpu.jl")
include("./utils.jl")
include("./pso_sync_gpu.jl")
include("./ode_pso.jl")

function SciMLBase.__solve(prob::OptimizationProblem,
opt::ParallelPSOKernel,
args...;
kwargs...)
lb = prob.lb === nothing ? fill(eltype(prob.u0)(-Inf), length(prob.u0)) : prob.lb
ub = prob.ub === nothing ? fill(eltype(prob.u0)(Inf), length(prob.u0)) : prob.ub

prob = remake(prob; lb = lb, ub = ub)

## TODO: Compare the performance of KA kernels with CPU backend with CPU implementations
if opt.backend isa CPU
if opt.threaded
gbest = PSO(prob; population = opt.num_particles, kwargs...)
else
init_gbest, particles = init_particles(prob, opt.num_particles)
gbest = pso_solve_cpu!(prob, init_gbest, particles; kwargs...)
end
else
backend = opt.backend
init_gbest, particles = init_particles(prob, opt.num_particles)
# TODO: Do the equivalent of cu()/roc()
particles_eltype = eltype(particles) === Float64 ? Float32 : eltype(particles)
gpu_particles = KernelAbstractions.allocate(backend,
particles_eltype,
size(particles))
copyto!(gpu_particles, particles)
gpu_init_gbest = KernelAbstractions.allocate(backend, typeof(init_gbest), (1,))
copyto!(gpu_init_gbest, [init_gbest])
if opt.async
gbest = pso_solve_async_gpu!(prob, gpu_init_gbest, gpu_particles; kwargs...)
else
gbest = pso_solve_gpu!(prob, gpu_init_gbest, gpu_particles; kwargs...)
end
end

SciMLBase.build_solution(SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt,
gbest.position, gbest.cost)
mutable struct MPSOParticle{T}
position::AbstractArray{T}
velocity::AbstractArray{T}
cost::T
best_position::AbstractArray{T}
best_cost::T
end

function SciMLBase.__solve(prob::OptimizationProblem,
opt::ParallelSyncPSO,
args...;
kwargs...)
lb = prob.lb === nothing ? fill(eltype(prob.u0)(-Inf), length(prob.u0)) : prob.lb
ub = prob.ub === nothing ? fill(eltype(prob.u0)(Inf), length(prob.u0)) : prob.ub

prob = remake(prob; lb = lb, ub = ub)
backend = opt.backend
init_gbest, particles = init_particles(prob, opt.num_particles)
particles_eltype = eltype(particles) === Float64 ? Float32 : eltype(particles)
gpu_particles = KernelAbstractions.allocate(backend, particles_eltype, size(particles))
copyto!(gpu_particles, particles)
init_gbest = init_gbest
gbest = pso_solve_sync_gpu!(prob, init_gbest, gpu_particles; kwargs...)

SciMLBase.build_solution(SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt,
gbest.position, gbest.cost)
mutable struct MPSOGBest{T}
position::AbstractArray{T}
cost::T
end

using Base

## required overloads for min or max computation on particles
function Base.isless(a::PSOGPU.PSOParticle{T1, T2},
b::PSOGPU.PSOParticle{T1, T2}) where {T1, T2}
function Base.isless(a::PSOGPU.SPSOParticle{T1, T2},
b::PSOGPU.SPSOParticle{T1, T2}) where {T1, T2}
a.best_cost < b.best_cost
end

function Base.typemax(::Type{PSOGPU.PSOParticle{T1, T2}}) where {T1, T2}
PSOGPU.PSOParticle{T1, T2}(similar(T1),
function Base.typemax(::Type{PSOGPU.SPSOParticle{T1, T2}}) where {T1, T2}
PSOGPU.SPSOParticle{T1, T2}(similar(T1),
similar(T1),
typemax(T2),
similar(T1),
typemax(T2))
end

export ParallelPSOKernel, ParallelSyncPSO, OptimizationProblem, solve
include("./algorithms.jl")
include("./utils.jl")
include("./ode_pso.jl")
include("./kernels.jl")
include("./lowerlevel_solve.jl")
include("./solve.jl")

export ParallelPSOKernel,
ParallelSyncPSOKernel, ParallelPSOArray, SerialPSO, OptimizationProblem, solve
end
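For orientation, the exported surface above can be exercised end to end. Below is a minimal sketch, assuming the standard SciML `solve` entry point and that `maxiters` is forwarded to the lower-level solvers; the Rosenbrock objective, bounds, and particle count are illustrative and not taken from this PR.

```julia
using PSOGPU, StaticArrays

# Illustrative 2-D Rosenbrock objective; static arrays keep the problem GPU-compilable.
rosenbrock(x, p) = (p[1] - x[1])^2 + p[2] * (x[2] - x[1]^2)^2

u0 = SVector{2, Float32}(0.0, 0.0)
p  = SVector{2, Float32}(1.0, 100.0)
lb = SVector{2, Float32}(-1.0, -1.0)
ub = SVector{2, Float32}(1.0, 1.0)

prob = OptimizationProblem(rosenbrock, u0, p; lb = lb, ub = ub)

# CPU backend by default; a GPU backend (e.g. CUDABackend()) could be passed instead.
sol = solve(prob, ParallelPSOKernel(100); maxiters = 500)
```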
99 changes: 99 additions & 0 deletions src/algorithms.jl
@@ -0,0 +1,99 @@

abstract type PSOAlogrithm end

"""
```julia
ParallelPSOKernel(num_particles = 100)
```

Particle Swarm Optimization on a GPU. Creates and launches a kernel which updates the particle states in parallel
on the chosen backend. Static arrays for the parameters in the `OptimizationProblem` are required for successful GPU compilation.

## Positional arguments:

- num_particles: Number of particles in the simulation

## Keyword arguments:

- global_update: defaults to `false`. Setting it to `false` lets particles evolve completely on their own,
  i.e. no information about the global best position is shared between them.
- backend: defaults to `CPU()`. The KernelAbstractions backend used to perform the computation.

## Limitations

Running the optimization with `global_update = true` updates the global best position with possible thread races.
This is the price paid for fusing all the updates into a single kernel. Techniques such as queue locks and atomic
updates could be used to address this.

"""
struct ParallelPSOKernel{Backend} <: PSOAlogrithm
num_particles::Int
global_update::Bool
backend::Backend
end

"""
```julia
ParallelSyncPSOKernel(num_particles = 100)
```

Particle Swarm Optimization on a GPU. Creates and launches a kernel which updates the particle states in parallel,
synchronizing after each generation to calculate the global best position across all particles.

## Positional arguments:

- num_particles: Number of particles in the simulation

## Keyword arguments:

- backend: defaults to `CPU()`. The KernelAbstractions backend used to perform the computation.

"""
struct ParallelSyncPSOKernel{Backend} <: PSOAlogrithm
num_particles::Int
backend::Backend
end

"""
```julia
ParallelPSOArray(num_particles = 100)
```
Particle Swarm Optimization on a CPU. It keeps the arrays used in the particle data structure
as Julia's `Array`s, which may be better suited for high-dimensional problems.

## Positional arguments:

- num_particles: Number of particles in the simulation


## Limitations

Running the optimization updates the global best position with possible thread races.
This is the price paid for fusing all the updates into a single parallel update. Techniques such as queue locks and atomic
updates could be used to address this.

"""
struct ParallelPSOArray <: PSOAlogrithm
num_particles::Int
end

"""
```julia
SerialPSO(num_particles = 100)
```
Serial Particle Swarm Optimization on a CPU.

## Positional arguments:

- num_particles: Number of particles in the simulation

"""
struct SerialPSO <: PSOAlogrithm
num_particles::Int
end

function ParallelPSOKernel(num_particles::Int;
global_update = false, backend = CPU())
ParallelPSOKernel(num_particles, global_update, backend)
end

function ParallelSyncPSOKernel(num_particles::Int;
backend = CPU())
ParallelSyncPSOKernel(num_particles, backend)
end

SciMLBase.allowsbounds(::PSOAlogrithm) = true
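Taken together, the four algorithm types give the following construction surface. A short sketch; `CUDABackend()` in the commented line is a hypothetical stand-in for whichever KernelAbstractions backend is installed:

```julia
using PSOGPU

SerialPSO(100)             # single-threaded CPU run
ParallelPSOArray(100)      # CPU, particles stored in plain Julia `Array`s
ParallelPSOKernel(100)     # KernelAbstractions kernel on the default CPU() backend
ParallelSyncPSOKernel(100) # KA kernel that synchronizes after every generation
# ParallelPSOKernel(100; global_update = true, backend = CUDABackend())  # hypothetical GPU run
```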
85 changes: 85 additions & 0 deletions src/kernels.jl
@@ -0,0 +1,85 @@
@inline function update_particle_state(particle, prob, gbest, w, c1, c2)
updated_velocity = w .* particle.velocity .+
c1 .* rand(typeof(particle.velocity)) .*
(particle.best_position -
particle.position) .+
c2 .* rand(typeof(particle.velocity)) .*
(gbest.position - particle.position)

@set! particle.velocity = updated_velocity

@set! particle.position = particle.position + particle.velocity

update_pos = max.(particle.position, prob.lb)
update_pos = min.(update_pos, prob.ub)
@set! particle.position = update_pos
# @set! particle.position = min(particle.position, ub)

@set! particle.cost = prob.f(particle.position, prob.p)

if particle.cost < particle.best_cost
@set! particle.best_position = particle.position
@set! particle.best_cost = particle.cost
end
particle
end
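# For reference, `update_particle_state` above is the canonical PSO rule: the new velocity is a
# damped copy of the old one plus a cognitive pull toward the particle's own best position and a
# social pull toward the global best, and the new position is clamped into [lb, ub]. A minimal
# plain-`Vector` sketch of the same rule; the in-place form and the name `reference_update!` are
# illustrative only, not part of this PR:
function reference_update!(x, v, pbest, gbest, lb, ub, w, c1, c2)
    for j in eachindex(x)
        v[j] = w * v[j] +
               c1 * rand() * (pbest[j] - x[j]) +  # cognitive term
               c2 * rand() * (gbest[j] - x[j])    # social term
        x[j] = clamp(x[j] + v[j], lb[j], ub[j])   # enforce box constraints
    end
    return x, v
end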

@kernel function update_particle_states!(prob, gpu_particles, gbest_ref, w,
opt::ParallelPSOKernel; c1 = 1.4962f0,
c2 = 1.4962f0)
i = @index(Global, Linear)

@inbounds gbest = gbest_ref[1]
@inbounds particle = gpu_particles[i]

particle = update_particle_state(particle, prob, gbest, w, c1, c2)

## NOTE: This causes thread races to update global best particle.
if particle.best_cost < gbest.cost
@set! gbest.position = particle.best_position
@set! gbest.cost = particle.best_cost
end

@inbounds gbest_ref[1] = gbest

@inbounds gpu_particles[i] = particle
end

@kernel function update_particle_states!(prob, gpu_particles, gbest, w,
opt::ParallelSyncPSOKernel; c1 = 1.4962f0,
c2 = 1.4962f0)
i = @index(Global, Linear)

@inbounds particle = gpu_particles[i]

particle = update_particle_state(particle, prob, gbest, w, c1, c2)

@inbounds gpu_particles[i] = particle
end

@kernel function update_particle_states_async!(prob,
gpu_particles,
gbest_ref,
w, wdamp, maxiters;
c1 = 1.4962f0,
c2 = 1.4962f0)
i = @index(Global, Linear)

gbest = gbest_ref[1]

## Access the particle
@inbounds particle = gpu_particles[i]

## Run all generations
for gen in 1:maxiters
particle = update_particle_state(particle, prob, gbest, w, c1, c2)
if particle.best_cost < gbest.cost
@set! gbest.position = particle.best_position
@set! gbest.cost = particle.best_cost
end
w = w * wdamp
end

@inbounds gpu_particles[i] = particle
@inbounds gbest_ref[1] = gbest
end
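For context, kernels like the ones above follow the usual KernelAbstractions launch pattern. A host-side sketch for one generation of the `ParallelSyncPSOKernel` path is shown below; the variables `backend`, `gpu_particles`, `gbest`, `w`, and `opt` are assumed to come from the solver setup, and the dynamic workgroup size is an assumption rather than something this PR fixes.

```julia
using KernelAbstractions

# Instantiate the kernel for the chosen backend (dynamic workgroup size),
# launch it over all particles, and wait for completion before the host
# reduces the per-particle bests into the new global best.
kernel! = update_particle_states!(backend)
kernel!(prob, gpu_particles, gbest, w, opt; ndrange = length(gpu_particles))
KernelAbstractions.synchronize(backend)
```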