Skip to content

Commit 351f74c

Browse files
authored
Merge pull request #34 from SciML/u/refactor
Towards cleaner and modular internals in the code
2 parents 1e5ce2d + 5bc4a3f commit 351f74c

File tree

16 files changed

+536
-616
lines changed

16 files changed

+536
-616
lines changed

src/PSOGPU.jl

Lines changed: 24 additions & 93 deletions
Original file line numberDiff line numberDiff line change
@@ -6,120 +6,51 @@ import DiffEqGPU: GPUTsit5, vectorized_asolve, make_prob_compatible
66

77
## Use lb and ub either as StaticArray or pass them separately as CuArrays
88
## Passing as CuArrays makes more sense, or maybe SArray? The based on no. of dimension
9-
struct PSOParticle{T1, T2 <: eltype(T1)}
9+
struct SPSOParticle{T1, T2 <: eltype(T1)}
1010
position::T1
1111
velocity::T1
1212
cost::T2
1313
best_position::T1
1414
best_cost::T2
1515
end
16-
17-
struct PSOGBest{T1, T2 <: eltype(T1)}
16+
struct SPSOGBest{T1, T2 <: eltype(T1)}
1817
position::T1
1918
cost::T2
2019
end
2120

22-
struct ParallelPSOKernel{Backend}
23-
num_particles::Int
24-
async::Bool
25-
threaded::Bool
26-
backend::Backend
27-
end
28-
struct ParallelSyncPSO{Backend}
29-
num_particles::Int
30-
backend::Backend
31-
end
32-
33-
function ParallelPSOKernel(num_particles::Int;
34-
async = false, threaded = false, backend = CPU())
35-
ParallelPSOKernel(num_particles, async, threaded, backend)
36-
end
37-
38-
SciMLBase.allowsbounds(::ParallelPSOKernel) = true
39-
SciMLBase.allowsbounds(::ParallelSyncPSO) = true
40-
# SciMLBase.requiresbounds(::ParallelPSOKernel) = true
41-
42-
include("./pso_cpu.jl")
43-
include("./pso_gpu.jl")
44-
include("./pso_async_gpu.jl")
45-
include("./utils.jl")
46-
include("./pso_sync_gpu.jl")
47-
include("./ode_pso.jl")
48-
49-
function SciMLBase.__solve(prob::OptimizationProblem,
50-
opt::ParallelPSOKernel,
51-
args...;
52-
kwargs...)
53-
lb = prob.lb === nothing ? fill(eltype(prob.u0)(-Inf), length(prob.u0)) : prob.lb
54-
ub = prob.ub === nothing ? fill(eltype(prob.u0)(Inf), length(prob.u0)) : prob.ub
55-
56-
prob = remake(prob; lb = lb, ub = ub)
57-
58-
## TODO: Compare the performance of KA kernels with CPU backend with CPU implementations
59-
if opt.backend isa CPU
60-
if opt.threaded
61-
gbest = PSO(prob; population = opt.num_particles, kwargs...)
62-
else
63-
init_gbest, particles = init_particles(prob, opt.num_particles)
64-
gbest = pso_solve_cpu!(prob, init_gbest, particles; kwargs...)
65-
end
66-
else
67-
backend = opt.backend
68-
init_gbest, particles = init_particles(prob, opt.num_particles)
69-
# TODO: Do the equivalent of cu()/roc()
70-
particles_eltype = eltype(particles) === Float64 ? Float32 : eltype(particles)
71-
gpu_particles = KernelAbstractions.allocate(backend,
72-
particles_eltype,
73-
size(particles))
74-
copyto!(gpu_particles, particles)
75-
gpu_init_gbest = KernelAbstractions.allocate(backend, typeof(init_gbest), (1,))
76-
copyto!(gpu_init_gbest, [init_gbest])
77-
if opt.async
78-
gbest = pso_solve_async_gpu!(prob, gpu_init_gbest, gpu_particles; kwargs...)
79-
else
80-
gbest = pso_solve_gpu!(prob, gpu_init_gbest, gpu_particles; kwargs...)
81-
end
82-
end
83-
84-
SciMLBase.build_solution(SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt,
85-
gbest.position, gbest.cost)
21+
mutable struct MPSOParticle{T}
22+
position::AbstractArray{T}
23+
velocity::AbstractArray{T}
24+
cost::T
25+
best_position::AbstractArray{T}
26+
best_cost::T
8627
end
87-
88-
function SciMLBase.__solve(prob::OptimizationProblem,
89-
opt::ParallelSyncPSO,
90-
args...;
91-
kwargs...)
92-
lb = prob.lb === nothing ? fill(eltype(prob.u0)(-Inf), length(prob.u0)) : prob.lb
93-
ub = prob.ub === nothing ? fill(eltype(prob.u0)(Inf), length(prob.u0)) : prob.ub
94-
95-
prob = remake(prob; lb = lb, ub = ub)
96-
backend = opt.backend
97-
init_gbest, particles = init_particles(prob, opt.num_particles)
98-
particles_eltype = eltype(particles) === Float64 ? Float32 : eltype(particles)
99-
gpu_particles = KernelAbstractions.allocate(backend, particles_eltype, size(particles))
100-
copyto!(gpu_particles, particles)
101-
init_gbest = init_gbest
102-
gbest = pso_solve_sync_gpu!(prob, init_gbest, gpu_particles; kwargs...)
103-
104-
SciMLBase.build_solution(SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt,
105-
gbest.position, gbest.cost)
28+
mutable struct MPSOGBest{T}
29+
position::AbstractArray{T}
30+
cost::T
10631
end
10732

108-
using Base
109-
11033
## required overloads for min or max computation on particles
111-
function Base.isless(a::PSOGPU.PSOParticle{T1, T2},
112-
b::PSOGPU.PSOParticle{T1, T2}) where {T1, T2}
34+
function Base.isless(a::PSOGPU.SPSOParticle{T1, T2},
35+
b::PSOGPU.SPSOParticle{T1, T2}) where {T1, T2}
11336
a.best_cost < b.best_cost
11437
end
11538

116-
function Base.typemax(::Type{PSOGPU.PSOParticle{T1, T2}}) where {T1, T2}
117-
PSOGPU.PSOParticle{T1, T2}(similar(T1),
39+
function Base.typemax(::Type{PSOGPU.SPSOParticle{T1, T2}}) where {T1, T2}
40+
PSOGPU.SPSOParticle{T1, T2}(similar(T1),
11841
similar(T1),
11942
typemax(T2),
12043
similar(T1),
12144
typemax(T2))
12245
end
12346

124-
export ParallelPSOKernel, ParallelSyncPSO, OptimizationProblem, solve
47+
include("./algorithms.jl")
48+
include("./utils.jl")
49+
include("./ode_pso.jl")
50+
include("./kernels.jl")
51+
include("./lowerlevel_solve.jl")
52+
include("./solve.jl")
53+
54+
export ParallelPSOKernel,
55+
ParallelSyncPSOKernel, ParallelPSOArray, SerialPSO, OptimizationProblem, solve
12556
end

src/algorithms.jl

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
2+
abstract type PSOAlogrithm end
3+
4+
"""
5+
```julia
6+
ParallelPSOKernel(num_particles = 100)
7+
```
8+
9+
Particle Swarm Optimization on a GPU. Creates and launches a kernel which updates the particle states in parallel
10+
on a GPU. Static Arrays for parameters in the `OptimizationProblem` are required for successful GPU compilation.
11+
12+
## Positional arguments:
13+
14+
- num_particles: Number of particles in the simulation
15+
- global_update: defaults to `true`. Setting it to false allows particles to evolve completely on their own,
16+
i.e. no information is sent about the global best position.
17+
- backend: defaults to `CPU()`. The KernelAbstractions backend for performing the computation.
18+
19+
## Limitations
20+
21+
Running the optimization with `global_update=true` updates the global best positions with possible thread races.
22+
This is the price to be paid to fuse all the updates into a single kernel. Techniques such as queue lock and atomic
23+
updates can be used to fix this.
24+
25+
"""
26+
struct ParallelPSOKernel{Backend} <: PSOAlogrithm
27+
num_particles::Int
28+
global_update::Bool
29+
backend::Backend
30+
end
31+
32+
"""
33+
```julia
34+
ParallelSyncPSOKernel(num_particles = 100)
35+
```
36+
37+
Particle Swarm Optimization on a GPU. Creates and launches a kernel which updates the particle states in parallel
38+
on a GPU. However, it requires a synchronization after each generation to calculate the global best position of the particles.
39+
40+
## Positional arguments:
41+
42+
- num_particles: Number of particles in the simulation
43+
- backend: defaults to `CPU()`. The KernelAbstractions backend for performing the computation.
44+
45+
"""
46+
struct ParallelSyncPSOKernel{Backend} <: PSOAlogrithm
47+
num_particles::Int
48+
backend::Backend
49+
end
50+
51+
"""
52+
```julia
53+
ParallelPSOArray(num_particles = 100)
54+
```
55+
Particle Swarm Optimization on a CPU. It keeps the arrays used in particle data structure
56+
to be Julia's `Array`, which may be better for high-dimensional problems.
57+
58+
## Positional arguments:
59+
60+
- num_particles: Number of particles in the simulation
61+
62+
63+
## Limitations
64+
65+
Running the optimization updates the global best positions with possible thread races.
66+
This is the price to be paid to fuse all the updates into a single kernel. Techniques such as queue lock and atomic
67+
updates can be used to fix this.
68+
69+
"""
70+
struct ParallelPSOArray <: PSOAlogrithm
71+
num_particles::Int
72+
end
73+
74+
"""
75+
```julia
76+
SerialPSO(num_particles = 100)
77+
```
78+
Serial Particle Swarm Optimization on a CPU.
79+
80+
## Positional arguments:
81+
82+
- num_particles: Number of particles in the simulation
83+
84+
"""
85+
struct SerialPSO <: PSOAlogrithm
86+
num_particles::Int
87+
end
88+
89+
function ParallelPSOKernel(num_particles::Int;
90+
global_update = true, backend = CPU())
91+
ParallelPSOKernel(num_particles, global_update, backend)
92+
end
93+
94+
function ParallelSyncPSOKernel(num_particles::Int;
95+
backend = CPU())
96+
ParallelSyncPSOKernel(num_particles, backend)
97+
end
98+
99+
SciMLBase.allowsbounds(::PSOAlogrithm) = true

src/kernels.jl

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
@inline function update_particle_state(particle, prob, gbest, w, c1, c2)
2+
updated_velocity = w .* particle.velocity .+
3+
c1 .* rand(typeof(particle.velocity)) .*
4+
(particle.best_position -
5+
particle.position) .+
6+
c2 .* rand(typeof(particle.velocity)) .*
7+
(gbest.position - particle.position)
8+
9+
@set! particle.velocity = updated_velocity
10+
11+
@set! particle.position = particle.position + particle.velocity
12+
13+
update_pos = max.(particle.position, prob.lb)
14+
update_pos = min.(update_pos, prob.ub)
15+
@set! particle.position = update_pos
16+
17+
@set! particle.cost = prob.f(particle.position, prob.p)
18+
19+
if particle.cost < particle.best_cost
20+
@set! particle.best_position = particle.position
21+
@set! particle.best_cost = particle.cost
22+
end
23+
particle
24+
end
25+
26+
@kernel function update_particle_states!(prob, gpu_particles, gbest_ref, w,
27+
opt::ParallelPSOKernel; c1 = 1.4962f0,
28+
c2 = 1.4962f0)
29+
i = @index(Global, Linear)
30+
31+
@inbounds gbest = gbest_ref[1]
32+
@inbounds particle = gpu_particles[i]
33+
34+
particle = update_particle_state(particle, prob, gbest, w, c1, c2)
35+
36+
## NOTE: This causes thread races to update global best particle.
37+
if particle.best_cost < gbest.cost
38+
@set! gbest.position = particle.best_position
39+
@set! gbest.cost = particle.best_cost
40+
end
41+
42+
@inbounds gbest_ref[1] = gbest
43+
44+
@inbounds gpu_particles[i] = particle
45+
end
46+
47+
@kernel function update_particle_states!(prob, gpu_particles, gbest, w,
48+
opt::ParallelSyncPSOKernel; c1 = 1.4962f0,
49+
c2 = 1.4962f0)
50+
i = @index(Global, Linear)
51+
52+
@inbounds particle = gpu_particles[i]
53+
54+
particle = update_particle_state(particle, prob, gbest, w, c1, c2)
55+
56+
@inbounds gpu_particles[i] = particle
57+
end
58+
59+
@kernel function update_particle_states_async!(prob,
60+
gpu_particles,
61+
gbest_ref,
62+
w, wdamp, maxiters;
63+
c1 = 1.4962f0,
64+
c2 = 1.4962f0)
65+
i = @index(Global, Linear)
66+
67+
gbest = gbest_ref[1]
68+
69+
## Access the particle
70+
@inbounds particle = gpu_particles[i]
71+
72+
## Run all generations
73+
for i in 1:maxiters
74+
particle = update_particle_state(particle, prob, gbest, w, c1, c2)
75+
if particle.best_cost < gbest.cost
76+
@set! gbest.position = particle.best_position
77+
@set! gbest.cost = particle.best_cost
78+
end
79+
w = w * wdamp
80+
end
81+
82+
@inbounds gpu_particles[i] = particle
83+
@inbounds gbest_ref[1] = gbest
84+
end

0 commit comments

Comments
 (0)