117 changes: 24 additions & 93 deletions src/PSOGPU.jl
@@ -6,120 +6,51 @@ import DiffEqGPU: GPUTsit5, vectorized_asolve, make_prob_compatible

## Use lb and ub either as StaticArray or pass them separately as CuArrays
## Passing as CuArrays makes more sense, or maybe SArray? Then decide based on the number of dimensions
struct PSOParticle{T1, T2 <: eltype(T1)}
struct SPSOParticle{T1, T2 <: eltype(T1)}
position::T1
velocity::T1
cost::T2
best_position::T1
best_cost::T2
end

struct PSOGBest{T1, T2 <: eltype(T1)}
struct SPSOGBest{T1, T2 <: eltype(T1)}
position::T1
cost::T2
end

struct ParallelPSOKernel{Backend}
num_particles::Int
async::Bool
threaded::Bool
backend::Backend
end
struct ParallelSyncPSO{Backend}
num_particles::Int
backend::Backend
end

function ParallelPSOKernel(num_particles::Int;
async = false, threaded = false, backend = CPU())
ParallelPSOKernel(num_particles, async, threaded, backend)
end

SciMLBase.allowsbounds(::ParallelPSOKernel) = true
SciMLBase.allowsbounds(::ParallelSyncPSO) = true
# SciMLBase.requiresbounds(::ParallelPSOKernel) = true

include("./pso_cpu.jl")
include("./pso_gpu.jl")
include("./pso_async_gpu.jl")
include("./utils.jl")
include("./pso_sync_gpu.jl")
include("./ode_pso.jl")

function SciMLBase.__solve(prob::OptimizationProblem,
opt::ParallelPSOKernel,
args...;
kwargs...)
lb = prob.lb === nothing ? fill(eltype(prob.u0)(-Inf), length(prob.u0)) : prob.lb
ub = prob.ub === nothing ? fill(eltype(prob.u0)(Inf), length(prob.u0)) : prob.ub

prob = remake(prob; lb = lb, ub = ub)

## TODO: Compare the performance of KA kernels with CPU backend with CPU implementations
if opt.backend isa CPU
if opt.threaded
gbest = PSO(prob; population = opt.num_particles, kwargs...)
else
init_gbest, particles = init_particles(prob, opt.num_particles)
gbest = pso_solve_cpu!(prob, init_gbest, particles; kwargs...)
end
else
backend = opt.backend
init_gbest, particles = init_particles(prob, opt.num_particles)
# TODO: Do the equivalent of cu()/roc()
particles_eltype = eltype(particles) === Float64 ? Float32 : eltype(particles)
gpu_particles = KernelAbstractions.allocate(backend,
particles_eltype,
size(particles))
copyto!(gpu_particles, particles)
gpu_init_gbest = KernelAbstractions.allocate(backend, typeof(init_gbest), (1,))
copyto!(gpu_init_gbest, [init_gbest])
if opt.async
gbest = pso_solve_async_gpu!(prob, gpu_init_gbest, gpu_particles; kwargs...)
else
gbest = pso_solve_gpu!(prob, gpu_init_gbest, gpu_particles; kwargs...)
end
end

SciMLBase.build_solution(SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt,
gbest.position, gbest.cost)
mutable struct MPSOParticle{T}
position::AbstractArray{T}
velocity::AbstractArray{T}
cost::T
best_position::AbstractArray{T}
best_cost::T
end

function SciMLBase.__solve(prob::OptimizationProblem,
opt::ParallelSyncPSO,
args...;
kwargs...)
lb = prob.lb === nothing ? fill(eltype(prob.u0)(-Inf), length(prob.u0)) : prob.lb
ub = prob.ub === nothing ? fill(eltype(prob.u0)(Inf), length(prob.u0)) : prob.ub

prob = remake(prob; lb = lb, ub = ub)
backend = opt.backend
init_gbest, particles = init_particles(prob, opt.num_particles)
particles_eltype = eltype(particles) === Float64 ? Float32 : eltype(particles)
gpu_particles = KernelAbstractions.allocate(backend, particles_eltype, size(particles))
copyto!(gpu_particles, particles)
init_gbest = init_gbest
gbest = pso_solve_sync_gpu!(prob, init_gbest, gpu_particles; kwargs...)

SciMLBase.build_solution(SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt,
gbest.position, gbest.cost)
mutable struct MPSOGBest{T}
position::AbstractArray{T}
cost::T
end

using Base

## required overloads for min or max computation on particles
function Base.isless(a::PSOGPU.PSOParticle{T1, T2},
b::PSOGPU.PSOParticle{T1, T2}) where {T1, T2}
function Base.isless(a::PSOGPU.SPSOParticle{T1, T2},
b::PSOGPU.SPSOParticle{T1, T2}) where {T1, T2}
a.best_cost < b.best_cost
end

function Base.typemax(::Type{PSOGPU.PSOParticle{T1, T2}}) where {T1, T2}
PSOGPU.PSOParticle{T1, T2}(similar(T1),
function Base.typemax(::Type{PSOGPU.SPSOParticle{T1, T2}}) where {T1, T2}
PSOGPU.SPSOParticle{T1, T2}(similar(T1),
similar(T1),
typemax(T2),
similar(T1),
typemax(T2))
end

export ParallelPSOKernel, ParallelSyncPSO, OptimizationProblem, solve
include("./algorithms.jl")
include("./utils.jl")
include("./ode_pso.jl")
include("./kernels.jl")
include("./lowerlevel_solve.jl")
include("./solve.jl")

export ParallelPSOKernel,
ParallelSyncPSOKernel, ParallelPSOArray, SerialPSO, OptimizationProblem, solve
end
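For orientation, the exported surface above can be exercised end to end. Below is a minimal sketch, assuming the standard SciML `solve` entry point and that `maxiters` is forwarded to the lower-level solvers; the Rosenbrock objective, bounds, and particle count are illustrative and not taken from this PR.

```julia
using PSOGPU, StaticArrays

# Illustrative 2-D Rosenbrock objective; static arrays keep the problem GPU-compilable.
rosenbrock(x, p) = (p[1] - x[1])^2 + p[2] * (x[2] - x[1]^2)^2

u0 = SVector{2, Float32}(0.0, 0.0)
p  = SVector{2, Float32}(1.0, 100.0)
lb = SVector{2, Float32}(-1.0, -1.0)
ub = SVector{2, Float32}(1.0, 1.0)

prob = OptimizationProblem(rosenbrock, u0, p; lb = lb, ub = ub)

# CPU backend by default; a GPU backend (e.g. CUDABackend()) could be passed instead.
sol = solve(prob, ParallelPSOKernel(100); maxiters = 500)
```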
99 changes: 99 additions & 0 deletions src/algorithms.jl
@@ -0,0 +1,99 @@

abstract type PSOAlogrithm end

"""
```julia
ParallelPSOKernel(num_particles = 100)
```

Particle Swarm Optimization on a GPU. Creates and launches a kernel which updates the particle states in parallel
on the chosen backend. Static arrays for the parameters in the `OptimizationProblem` are required for successful GPU compilation.

## Positional arguments:

- num_particles: Number of particles in the simulation

## Keyword arguments:

- global_update: defaults to `false`. Setting it to `false` lets particles evolve completely on their own,
  i.e. no information about the global best position is shared between them.
- backend: defaults to `CPU()`. The KernelAbstractions backend used to perform the computation.

## Limitations

Running the optimization with `global_update = true` updates the global best position with possible thread races.
This is the price paid for fusing all the updates into a single kernel. Techniques such as queue locks and atomic
updates could be used to address this.

"""
struct ParallelPSOKernel{Backend} <: PSOAlogrithm
num_particles::Int
global_update::Bool
backend::Backend
end

"""
```julia
ParallelSyncPSOKernel(num_particles = 100)
```

Particle Swarm Optimization on a GPU. Creates and launches a kernel which updates the particle states in parallel,
synchronizing after each generation to calculate the global best position across all particles.

## Positional arguments:

- num_particles: Number of particles in the simulation

## Keyword arguments:

- backend: defaults to `CPU()`. The KernelAbstractions backend used to perform the computation.

"""
struct ParallelSyncPSOKernel{Backend} <: PSOAlogrithm
num_particles::Int
backend::Backend
end

"""
```julia
ParallelPSOArray(num_particles = 100)
```
Particle Swarm Optimization on a CPU. It keeps the arrays used in the particle data structure
as Julia's `Array`s, which may be better suited for high-dimensional problems.

## Positional arguments:

- num_particles: Number of particles in the simulation


## Limitations

Running the optimization updates the global best position with possible thread races.
This is the price paid for fusing all the updates into a single parallel update. Techniques such as queue locks and atomic
updates could be used to address this.

"""
struct ParallelPSOArray <: PSOAlogrithm
num_particles::Int
end

"""
```julia
SerialPSO(num_particles = 100)
```
Serial Particle Swarm Optimization on a CPU.

## Positional arguments:

- num_particles: Number of particles in the simulation

"""
struct SerialPSO <: PSOAlogrithm
num_particles::Int
end

function ParallelPSOKernel(num_particles::Int;
global_update = false, backend = CPU())
ParallelPSOKernel(num_particles, global_update, backend)
end

function ParallelSyncPSOKernel(num_particles::Int;
backend = CPU())
ParallelSyncPSOKernel(num_particles, backend)
end

SciMLBase.allowsbounds(::PSOAlogrithm) = true
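Taken together, the four algorithm types give the following construction surface. A short sketch; `CUDABackend()` in the commented line is a hypothetical stand-in for whichever KernelAbstractions backend is installed:

```julia
using PSOGPU

SerialPSO(100)             # single-threaded CPU run
ParallelPSOArray(100)      # CPU, particles stored in plain Julia `Array`s
ParallelPSOKernel(100)     # KernelAbstractions kernel on the default CPU() backend
ParallelSyncPSOKernel(100) # KA kernel that synchronizes after every generation
# ParallelPSOKernel(100; global_update = true, backend = CUDABackend())  # hypothetical GPU run
```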
85 changes: 85 additions & 0 deletions src/kernels.jl
@@ -0,0 +1,85 @@
@inline function update_particle_state(particle, prob, gbest, w, c1, c2)
updated_velocity = w .* particle.velocity .+
c1 .* rand(typeof(particle.velocity)) .*
(particle.best_position -
particle.position) .+
c2 .* rand(typeof(particle.velocity)) .*
(gbest.position - particle.position)

@set! particle.velocity = updated_velocity

@set! particle.position = particle.position + particle.velocity

update_pos = max.(particle.position, prob.lb)
update_pos = min.(update_pos, prob.ub)
@set! particle.position = update_pos
# @set! particle.position = min(particle.position, ub)

@set! particle.cost = prob.f(particle.position, prob.p)

if particle.cost < particle.best_cost
@set! particle.best_position = particle.position
@set! particle.best_cost = particle.cost
end
particle
end
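# For reference, `update_particle_state` above is the canonical PSO rule: the new velocity is a
# damped copy of the old one plus a cognitive pull toward the particle's own best position and a
# social pull toward the global best, and the new position is clamped into [lb, ub]. A minimal
# plain-`Vector` sketch of the same rule; the in-place form and the name `reference_update!` are
# illustrative only, not part of this PR:
function reference_update!(x, v, pbest, gbest, lb, ub, w, c1, c2)
    for j in eachindex(x)
        v[j] = w * v[j] +
               c1 * rand() * (pbest[j] - x[j]) +  # cognitive term
               c2 * rand() * (gbest[j] - x[j])    # social term
        x[j] = clamp(x[j] + v[j], lb[j], ub[j])   # enforce box constraints
    end
    return x, v
end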

@kernel function update_particle_states!(prob, gpu_particles, gbest_ref, w,
opt::ParallelPSOKernel; c1 = 1.4962f0,
c2 = 1.4962f0)
i = @index(Global, Linear)

@inbounds gbest = gbest_ref[1]
@inbounds particle = gpu_particles[i]

particle = update_particle_state(particle, prob, gbest, w, c1, c2)

## NOTE: This causes thread races to update global best particle.
if particle.best_cost < gbest.cost
@set! gbest.position = particle.best_position
@set! gbest.cost = particle.best_cost
end

@inbounds gbest_ref[1] = gbest

@inbounds gpu_particles[i] = particle
end

@kernel function update_particle_states!(prob, gpu_particles, gbest, w,
opt::ParallelSyncPSOKernel; c1 = 1.4962f0,
c2 = 1.4962f0)
i = @index(Global, Linear)

@inbounds particle = gpu_particles[i]

particle = update_particle_state(particle, prob, gbest, w, c1, c2)

@inbounds gpu_particles[i] = particle
end

@kernel function update_particle_states_async!(prob,
gpu_particles,
gbest_ref,
w, wdamp, maxiters;
c1 = 1.4962f0,
c2 = 1.4962f0)
i = @index(Global, Linear)

gbest = gbest_ref[1]

## Access the particle
@inbounds particle = gpu_particles[i]

## Run all generations
for gen in 1:maxiters
particle = update_particle_state(particle, prob, gbest, w, c1, c2)
if particle.best_cost < gbest.cost
@set! gbest.position = particle.best_position
@set! gbest.cost = particle.best_cost
end
w = w * wdamp
end

@inbounds gpu_particles[i] = particle
@inbounds gbest_ref[1] = gbest
end
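For context, kernels like the ones above follow the usual KernelAbstractions launch pattern. A host-side sketch for one generation of the `ParallelSyncPSOKernel` path is shown below; the variables `backend`, `gpu_particles`, `gbest`, `w`, and `opt` are assumed to come from the solver setup, and the dynamic workgroup size is an assumption rather than something this PR fixes.

```julia
using KernelAbstractions

# Instantiate the kernel for the chosen backend (dynamic workgroup size),
# launch it over all particles, and wait for completion before the host
# reduces the per-particle bests into the new global best.
kernel! = update_particle_states!(backend)
kernel!(prob, gpu_particles, gbest, w, opt; ndrange = length(gpu_particles))
KernelAbstractions.synchronize(backend)
```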