diff --git a/src/PSOGPU.jl b/src/PSOGPU.jl index f4110e6..fa1aaef 100644 --- a/src/PSOGPU.jl +++ b/src/PSOGPU.jl @@ -6,120 +6,51 @@ import DiffEqGPU: GPUTsit5, vectorized_asolve, make_prob_compatible ## Use lb and ub either as StaticArray or pass them separately as CuArrays ## Passing as CuArrays makes more sense, or maybe SArray? The based on no. of dimension -struct PSOParticle{T1, T2 <: eltype(T1)} +struct SPSOParticle{T1, T2 <: eltype(T1)} position::T1 velocity::T1 cost::T2 best_position::T1 best_cost::T2 end - -struct PSOGBest{T1, T2 <: eltype(T1)} +struct SPSOGBest{T1, T2 <: eltype(T1)} position::T1 cost::T2 end -struct ParallelPSOKernel{Backend} - num_particles::Int - async::Bool - threaded::Bool - backend::Backend -end -struct ParallelSyncPSO{Backend} - num_particles::Int - backend::Backend -end - -function ParallelPSOKernel(num_particles::Int; - async = false, threaded = false, backend = CPU()) - ParallelPSOKernel(num_particles, async, threaded, backend) -end - -SciMLBase.allowsbounds(::ParallelPSOKernel) = true -SciMLBase.allowsbounds(::ParallelSyncPSO) = true -# SciMLBase.requiresbounds(::ParallelPSOKernel) = true - -include("./pso_cpu.jl") -include("./pso_gpu.jl") -include("./pso_async_gpu.jl") -include("./utils.jl") -include("./pso_sync_gpu.jl") -include("./ode_pso.jl") - -function SciMLBase.__solve(prob::OptimizationProblem, - opt::ParallelPSOKernel, - args...; - kwargs...) - lb = prob.lb === nothing ? fill(eltype(prob.u0)(-Inf), length(prob.u0)) : prob.lb - ub = prob.ub === nothing ? fill(eltype(prob.u0)(Inf), length(prob.u0)) : prob.ub - - prob = remake(prob; lb = lb, ub = ub) - - ## TODO: Compare the performance of KA kernels with CPU backend with CPU implementations - if opt.backend isa CPU - if opt.threaded - gbest = PSO(prob; population = opt.num_particles, kwargs...) - else - init_gbest, particles = init_particles(prob, opt.num_particles) - gbest = pso_solve_cpu!(prob, init_gbest, particles; kwargs...) - end - else - backend = opt.backend - init_gbest, particles = init_particles(prob, opt.num_particles) - # TODO: Do the equivalent of cu()/roc() - particles_eltype = eltype(particles) === Float64 ? Float32 : eltype(particles) - gpu_particles = KernelAbstractions.allocate(backend, - particles_eltype, - size(particles)) - copyto!(gpu_particles, particles) - gpu_init_gbest = KernelAbstractions.allocate(backend, typeof(init_gbest), (1,)) - copyto!(gpu_init_gbest, [init_gbest]) - if opt.async - gbest = pso_solve_async_gpu!(prob, gpu_init_gbest, gpu_particles; kwargs...) - else - gbest = pso_solve_gpu!(prob, gpu_init_gbest, gpu_particles; kwargs...) - end - end - - SciMLBase.build_solution(SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt, - gbest.position, gbest.cost) +mutable struct MPSOParticle{T} + position::AbstractArray{T} + velocity::AbstractArray{T} + cost::T + best_position::AbstractArray{T} + best_cost::T end - -function SciMLBase.__solve(prob::OptimizationProblem, - opt::ParallelSyncPSO, - args...; - kwargs...) - lb = prob.lb === nothing ? fill(eltype(prob.u0)(-Inf), length(prob.u0)) : prob.lb - ub = prob.ub === nothing ? fill(eltype(prob.u0)(Inf), length(prob.u0)) : prob.ub - - prob = remake(prob; lb = lb, ub = ub) - backend = opt.backend - init_gbest, particles = init_particles(prob, opt.num_particles) - particles_eltype = eltype(particles) === Float64 ? Float32 : eltype(particles) - gpu_particles = KernelAbstractions.allocate(backend, particles_eltype, size(particles)) - copyto!(gpu_particles, particles) - init_gbest = init_gbest - gbest = pso_solve_sync_gpu!(prob, init_gbest, gpu_particles; kwargs...) - - SciMLBase.build_solution(SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt, - gbest.position, gbest.cost) +mutable struct MPSOGBest{T} + position::AbstractArray{T} + cost::T end -using Base - ## required overloads for min or max computation on particles -function Base.isless(a::PSOGPU.PSOParticle{T1, T2}, - b::PSOGPU.PSOParticle{T1, T2}) where {T1, T2} +function Base.isless(a::PSOGPU.SPSOParticle{T1, T2}, + b::PSOGPU.SPSOParticle{T1, T2}) where {T1, T2} a.best_cost < b.best_cost end -function Base.typemax(::Type{PSOGPU.PSOParticle{T1, T2}}) where {T1, T2} - PSOGPU.PSOParticle{T1, T2}(similar(T1), +function Base.typemax(::Type{PSOGPU.SPSOParticle{T1, T2}}) where {T1, T2} + PSOGPU.SPSOParticle{T1, T2}(similar(T1), similar(T1), typemax(T2), similar(T1), typemax(T2)) end -export ParallelPSOKernel, ParallelSyncPSO, OptimizationProblem, solve +include("./algorithms.jl") +include("./utils.jl") +include("./ode_pso.jl") +include("./kernels.jl") +include("./lowerlevel_solve.jl") +include("./solve.jl") + +export ParallelPSOKernel, + ParallelSyncPSOKernel, ParallelPSOArray, SerialPSO, OptimizationProblem, solve end diff --git a/src/algorithms.jl b/src/algorithms.jl new file mode 100644 index 0000000..bf12e80 --- /dev/null +++ b/src/algorithms.jl @@ -0,0 +1,99 @@ + +abstract type PSOAlogrithm end + +""" +```julia +ParallelPSOKernel(num_particles = 100) +``` + +Particle Swarm Optimization on a GPU. Creates and launches a kernel which updates the particle states in parallel +on a GPU. Static Arrays for parameters in the `OptimizationProblem` are required for successful GPU compilation. + +## Positional arguments: + +- num_particles: Number of particles in the simulation +- global_update: defaults to `true`. Setting it to false allows particles to evolve completely on their own, + i.e. no information is sent about the global best position. +- backend: defaults to `CPU()`. The KernelAbstractions backend for performing the computation. + +## Limitations + +Running the optimization with `global_update=true` updates the global best positions with possible thread races. +This is the price to be paid to fuse all the updates into a single kernel. Techniques such as queue lock and atomic +updates can be used to fix this. + +""" +struct ParallelPSOKernel{Backend} <: PSOAlogrithm + num_particles::Int + global_update::Bool + backend::Backend +end + +""" +```julia +ParallelSyncPSOKernel(num_particles = 100) +``` + +Particle Swarm Optimization on a GPU. Creates and launches a kernel which updates the particle states in parallel +on a GPU. However, it requires a synchronization after each generation to calculate the global best position of the particles. + +## Positional arguments: + +- num_particles: Number of particles in the simulation +- backend: defaults to `CPU()`. The KernelAbstractions backend for performing the computation. + +""" +struct ParallelSyncPSOKernel{Backend} <: PSOAlogrithm + num_particles::Int + backend::Backend +end + +""" +```julia +ParallelPSOArray(num_particles = 100) +``` +Particle Swarm Optimization on a CPU. It keeps the arrays used in particle data structure +to be Julia's `Array`, which may be better for high-dimensional problems. + +## Positional arguments: + +- num_particles: Number of particles in the simulation + + +## Limitations + +Running the optimization updates the global best positions with possible thread races. +This is the price to be paid to fuse all the updates into a single kernel. Techniques such as queue lock and atomic +updates can be used to fix this. + +""" +struct ParallelPSOArray <: PSOAlogrithm + num_particles::Int +end + +""" +```julia +SerialPSO(num_particles = 100) +``` +Serial Particle Swarm Optimization on a CPU. + +## Positional arguments: + +- num_particles: Number of particles in the simulation + +""" +struct SerialPSO <: PSOAlogrithm + num_particles::Int +end + +function ParallelPSOKernel(num_particles::Int; + global_update = true, backend = CPU()) + ParallelPSOKernel(num_particles, global_update, backend) +end + +function ParallelSyncPSOKernel(num_particles::Int; + backend = CPU()) + ParallelSyncPSOKernel(num_particles, backend) +end + +SciMLBase.allowsbounds(::PSOAlogrithm) = true diff --git a/src/kernels.jl b/src/kernels.jl new file mode 100644 index 0000000..14655d6 --- /dev/null +++ b/src/kernels.jl @@ -0,0 +1,84 @@ +@inline function update_particle_state(particle, prob, gbest, w, c1, c2) + updated_velocity = w .* particle.velocity .+ + c1 .* rand(typeof(particle.velocity)) .* + (particle.best_position - + particle.position) .+ + c2 .* rand(typeof(particle.velocity)) .* + (gbest.position - particle.position) + + @set! particle.velocity = updated_velocity + + @set! particle.position = particle.position + particle.velocity + + update_pos = max.(particle.position, prob.lb) + update_pos = min.(update_pos, prob.ub) + @set! particle.position = update_pos + + @set! particle.cost = prob.f(particle.position, prob.p) + + if particle.cost < particle.best_cost + @set! particle.best_position = particle.position + @set! particle.best_cost = particle.cost + end + particle +end + +@kernel function update_particle_states!(prob, gpu_particles, gbest_ref, w, + opt::ParallelPSOKernel; c1 = 1.4962f0, + c2 = 1.4962f0) + i = @index(Global, Linear) + + @inbounds gbest = gbest_ref[1] + @inbounds particle = gpu_particles[i] + + particle = update_particle_state(particle, prob, gbest, w, c1, c2) + + ## NOTE: This causes thread races to update global best particle. + if particle.best_cost < gbest.cost + @set! gbest.position = particle.best_position + @set! gbest.cost = particle.best_cost + end + + @inbounds gbest_ref[1] = gbest + + @inbounds gpu_particles[i] = particle +end + +@kernel function update_particle_states!(prob, gpu_particles, gbest, w, + opt::ParallelSyncPSOKernel; c1 = 1.4962f0, + c2 = 1.4962f0) + i = @index(Global, Linear) + + @inbounds particle = gpu_particles[i] + + particle = update_particle_state(particle, prob, gbest, w, c1, c2) + + @inbounds gpu_particles[i] = particle +end + +@kernel function update_particle_states_async!(prob, + gpu_particles, + gbest_ref, + w, wdamp, maxiters; + c1 = 1.4962f0, + c2 = 1.4962f0) + i = @index(Global, Linear) + + gbest = gbest_ref[1] + + ## Access the particle + @inbounds particle = gpu_particles[i] + + ## Run all generations + for i in 1:maxiters + particle = update_particle_state(particle, prob, gbest, w, c1, c2) + if particle.best_cost < gbest.cost + @set! gbest.position = particle.best_position + @set! gbest.cost = particle.best_cost + end + w = w * wdamp + end + + @inbounds gpu_particles[i] = particle + @inbounds gbest_ref[1] = gbest +end diff --git a/src/lowerlevel_solve.jl b/src/lowerlevel_solve.jl new file mode 100644 index 0000000..f66f145 --- /dev/null +++ b/src/lowerlevel_solve.jl @@ -0,0 +1,141 @@ +function vectorized_solve!(prob, + gbest, + gpu_particles, opt::ParallelSyncPSOKernel; + maxiters = 100, + w = 0.7298f0, + wdamp = 1.0f0, + debug = false) + backend = get_backend(gpu_particles) + + update_particle_kernel = update_particle_states!(backend) + + for i in 1:maxiters + update_particle_kernel(prob, + gpu_particles, + gbest, + w, opt; + ndrange = length(gpu_particles)) + best_particle = minimum(gpu_particles) + gbest = SPSOGBest(best_particle.position, best_particle.best_cost) + w = w * wdamp + end + + return gbest +end + +function vectorized_solve!(prob, + gbest, + gpu_particles, opt::ParallelPSOKernel, ::Val{true}; + maxiters = 100, + w = 0.7298f0, + wdamp = 1.0f0, + debug = false) + + ## Initialize stuff + + backend = get_backend(gpu_particles) + + kernel = update_particle_states!(backend) + + for i in 1:maxiters + ## Invoke GPU Kernel here + kernel(prob, gpu_particles, gbest, w, opt; ndrange = length(gpu_particles)) + w = w * wdamp + end + + return Array(gbest)[1] +end + +function vectorized_solve!(prob, + gbest, + gpu_particles, opt::ParallelPSOKernel, ::Val{false}; + maxiters = 100, + w = 0.7298f0, + wdamp = 1.0f0, + debug = false) + backend = get_backend(gpu_particles) + + kernel = update_particle_states_async!(backend) + kernel(prob, gpu_particles, gbest, w, wdamp, maxiters; ndrange = length(gpu_particles)) + + best_particle = minimum(gpu_particles) + return SPSOGBest(best_particle.best_position, best_particle.best_cost) +end + +function vectorized_solve!(prob, gbest, + particles, opt::ParallelPSOArray; + maxiters = 100, + w = 0.7298f0, + wdamp = 1.0f0, + c1 = 1.4962f0, + c2 = 1.4962f0, + verbose = false) + cost_func = prob.f + num_particles = length(particles) + rand_eltype = eltype(particles[1].velocity) + # main loop + + for iter in 1:maxiters + Threads.@threads for i in 1:num_particles + particles[i].velocity .= w .* particles[i].velocity .+ + c1 .* rand.(rand_eltype) .* + (particles[i].best_position .- + particles[i].position) .+ + c2 .* rand.(rand_eltype) .* + (gbest.position .- particles[i].position) + + particles[i].position .= particles[i].position .+ particles[i].velocity + particles[i].position .= max.(particles[i].position, prob.lb) + particles[i].position .= min.(particles[i].position, prob.ub) + + particles[i].cost = cost_func(particles[i].position, prob.p) + + if particles[i].cost < particles[i].best_cost + copy!(particles[i].best_position, particles[i].position) + particles[i].best_cost = particles[i].cost + + ## Possible race condition here + if particles[i].best_cost < gbest.cost + copy!(gbest.position, particles[i].best_position) + gbest.cost = particles[i].best_cost + end + end + end + w = w * wdamp + end + gbest +end + +function update_particle_states_cpu!(prob, particles, gbest_ref, w; c1 = 1.4962f0, + c2 = 1.4962f0) + gbest = gbest_ref[] + + for i in eachindex(particles) + @inbounds particle = particles[i] + particle = update_particle_state(particle, prob, gbest, w, c1, c2) + + if particle.best_cost < gbest.cost + @set! gbest.position = particle.best_position + @set! gbest.cost = particle.best_cost + end + + particles[i] = particle + end + gbest_ref[] = gbest + return nothing +end + +function vectorized_solve!(prob, + gbest, + particles, opt::SerialPSO; + maxiters = 100, + w = 0.7298f0, + wdamp = 1.0f0, + debug = false) + sol_ref = Ref(gbest) + for i in 1:maxiters + update_particle_states_cpu!(prob, particles, sol_ref, w) + w = w * wdamp + end + return sol_ref[] +end diff --git a/src/ode_pso.jl b/src/ode_pso.jl index 0b779e2..461af5d 100644 --- a/src/ode_pso.jl +++ b/src/ode_pso.jl @@ -83,13 +83,13 @@ function parameter_estim_ode!(prob::ODEProblem, update_costs!(losses, gpu_particles; ndrange = length(losses)) best_particle = minimum(gpu_particles, - init = PSOGPU.PSOParticle(gbest.position, + init = PSOGPU.SPSOParticle(gbest.position, gbest.position, gbest.cost, gbest.position, gbest.cost)) - gbest = PSOGPU.PSOGBest(best_particle.best_position, best_particle.best_cost) + gbest = PSOGPU.SPSOGBest(best_particle.best_position, best_particle.best_cost) w = w * wdamp end return gbest diff --git a/src/pso_async_gpu.jl b/src/pso_async_gpu.jl deleted file mode 100644 index 86b1060..0000000 --- a/src/pso_async_gpu.jl +++ /dev/null @@ -1,69 +0,0 @@ -@kernel function update_particle_states_async!(prob, - gpu_particles, - gbest_ref, - w, wdamp, maxiters; - c1 = 1.4962f0, - c2 = 1.4962f0) - i = @index(Global, Linear) - if i <= length(gpu_particles) - gbest = gbest_ref[1] - - ## Access the particle - @inbounds particle = gpu_particles[i] - - ## Run all generations - for i in 1:maxiters - updated_velocity = w .* particle.velocity .+ - c1 .* rand(typeof(particle.velocity)) .* - (particle.best_position - - particle.position) .+ - c2 .* rand(typeof(particle.velocity)) .* - (gbest.position - particle.position) - - @set! particle.velocity = updated_velocity - - @set! particle.position = particle.position + particle.velocity - - update_pos = max(particle.position, prob.lb) - update_pos = min(update_pos, prob.ub) - @set! particle.position = update_pos - # @set! particle.position = min(particle.position, ub) - - @set! particle.cost = prob.f(particle.position, prob.p) - - if particle.cost < particle.best_cost - @set! particle.best_position = particle.position - @set! particle.best_cost = particle.cost - end - - if particle.best_cost < gbest.cost - @set! gbest.position = particle.best_position - @set! gbest.cost = particle.best_cost - end - w = w * wdamp - end - - @inbounds gpu_particles[i] = particle - - @inbounds gbest_ref[1] = gbest - end -end - -function pso_solve_async_gpu!(prob, - gbest, - gpu_particles; - maxiters = 100, - w = 0.7298f0, - wdamp = 1.0f0, - debug = false) - - ## Initialize stuff - - backend = get_backend(gpu_particles) - - kernel = update_particle_states_async!(backend) - kernel(prob, gpu_particles, gbest, w, wdamp, maxiters; ndrange = length(gpu_particles)) - - best_particle = minimum(gpu_particles) - return PSOGBest(best_particle.best_position, best_particle.best_cost) -end diff --git a/src/pso_cpu.jl b/src/pso_cpu.jl deleted file mode 100644 index 84ff81b..0000000 --- a/src/pso_cpu.jl +++ /dev/null @@ -1,179 +0,0 @@ -# Based on: https://stackoverflow.com/questions/65342388/why-my-code-in-julia-is-getting-slower-for-higher-iteration - -mutable struct Particle{T} - position::Array{T, 1} - velocity::Array{T, 1} - cost::T - best_position::Array{T, 1} - best_cost::T -end -mutable struct Gbest{T} - position::Array{T, 1} - cost::T -end - -function _init_particles(prob, population) - dim = length(prob.u0) - lb = prob.lb - ub = prob.ub - cost_func = prob.f - - if lb === nothing || (all(isinf, lb) && all(isinf, ub)) - gbest_position = Array{eltype(prob.u0), 1}(undef, dim) - for i in 1:dim - if abs(prob.u0[i]) > 0 - gbest_position[i] = prob.u0[i] + rand(eltype(prob.u0)) * abs(prob.u0[i]) - else - gbest_position[i] = rand(eltype(prob.u0)) - end - end - else - gbest_position = uniform(dim, lb, ub) - end - gbest = Gbest(gbest_position, cost_func(gbest_position, prob.p)) - - particles = Particle[] - for i in 1:population - if lb === nothing || (all(isinf, lb) && all(isinf, ub)) - position = Array{eltype(prob.u0), 1}(undef, dim) - for i in 1:dim - if abs(prob.u0[i]) > 0 - position[i] = prob.u0[i] + rand(eltype(prob.u0)) * abs(prob.u0[i]) - else - position[i] = rand(eltype(prob.u0)) - end - end - else - position = uniform(dim, lb, ub) - end - velocity = zeros(eltype(position), dim) - cost = cost_func(position, prob.p) - best_position = copy(position) - best_cost = copy(cost) - push!(particles, Particle(position, velocity, cost, best_position, best_cost)) - - if best_cost < gbest.cost - gbest.position = copy(best_position) - gbest.cost = copy(best_cost) - end - end - return gbest, particles -end - -function PSO(prob::OptimizationProblem; - maxiters = 100, - population = 100, - c1 = 1.4962, - c2 = 1.4962, - w = 0.7298, - wdamp = 1.0, - verbose = false) - dim = length(prob.u0) - lb = prob.lb === nothing ? fill(eltype(prob.u0)(-Inf), dim) : prob.lb - ub = prob.ub === nothing ? fill(eltype(prob.u0)(Inf), dim) : prob.ub - cost_func = prob.f - p = prob.p - - gbest, particles = _init_particles(prob, population) - - # main loop - for iter in 1:maxiters - Threads.@threads for i in 1:population - particles[i].velocity .= w .* particles[i].velocity .+ - c1 .* rand(dim) .* (particles[i].best_position .- - particles[i].position) .+ - c2 .* rand(dim) .* - (gbest.position .- particles[i].position) - - particles[i].position .= particles[i].position .+ particles[i].velocity - particles[i].position .= max.(particles[i].position, lb) - particles[i].position .= min.(particles[i].position, ub) - - particles[i].cost = cost_func(particles[i].position, prob.p) - - if particles[i].cost < particles[i].best_cost - particles[i].best_position = copy(particles[i].position) - particles[i].best_cost = copy(particles[i].cost) - - if particles[i].best_cost < gbest.cost - gbest.position = copy(particles[i].best_position) - gbest.cost = copy(particles[i].best_cost) - end - end - end - w = w * wdamp - if verbose && iter % 50 == 1 - println("Iteration " * string(iter) * ": Best Cost = " * string(gbest.cost)) - println("Best Position = " * string(gbest.position)) - println() - end - end - gbest -end - -function update_particle_states_cpu!(prob, particles, gbest_ref, w, lb, ub; c1 = 1.4962f0, - c2 = 1.4962f0) - # i = 1 - - ## Access the particle - - # gpu_particles = convert(MArray, gpu_particles) - - gbest = gbest_ref[] - - for i in eachindex(particles) - @inbounds particle = particles[i] - ## Update velocity - - updated_velocity = w * particle.velocity + - c1 .* rand(typeof(particle.velocity)) .* - (particle.best_position - - particle.position) + - c2 * rand(typeof(particle.velocity)) .* - (gbest.position - particle.position) - - @set! particle.velocity = updated_velocity - - @set! particle.position = particle.position + particle.velocity - - update_pos = max(particle.position, lb) - update_pos = min(update_pos, ub) - @set! particle.position = update_pos - # @set! particle.position = min(particle.position, ub) - - @set! particle.cost = prob.f(particle.position, prob.p) - - if particle.cost < particle.best_cost - @set! particle.best_position = particle.position - @set! particle.best_cost = particle.cost - end - - if particle.best_cost < gbest.cost - @set! gbest.position = particle.best_position - @set! gbest.cost = particle.best_cost - end - - particles[i] = particle - end - gbest_ref[] = gbest - return nothing -end - -function pso_solve_cpu!(prob, - gbest, - cpu_particles; - maxiters = 100, - w = 0.7298f0, - wdamp = 1.0f0, - debug = false) - sol_ref = Ref(gbest) - lb = prob.lb === nothing ? fill(eltype(prob.u0)(-Inf), length(prob.u0)) : prob.lb - ub = prob.ub === nothing ? fill(eltype(prob.u0)(Inf), length(prob.u0)) : prob.ub - for i in 1:maxiters - ## Invoke GPU Kernel here - update_particle_states_cpu!(prob, cpu_particles, sol_ref, w, lb, ub) - w = w * wdamp - end - - return sol_ref[] -end diff --git a/src/pso_gpu.jl b/src/pso_gpu.jl deleted file mode 100644 index d127cdf..0000000 --- a/src/pso_gpu.jl +++ /dev/null @@ -1,73 +0,0 @@ -@kernel function update_particle_states!(prob, gpu_particles, gbest_ref, w; c1 = 1.4962f0, - c2 = 1.4962f0) - i = @index(Global, Linear) - if i <= length(gpu_particles) - # i = 1 - - ## Access the particle - - @inbounds gbest = gbest_ref[1] - - # gpu_particles = convert(MArray, gpu_particles) - - @inbounds particle = gpu_particles[i] - ## Update velocity - - updated_velocity = w .* particle.velocity .+ - c1 .* rand(typeof(particle.velocity)) .* - (particle.best_position - - particle.position) .+ - c2 .* rand(typeof(particle.velocity)) .* - (gbest.position - particle.position) - - @set! particle.velocity = updated_velocity - - @set! particle.position = particle.position + particle.velocity - - update_pos = max(particle.position, prob.lb) - update_pos = min(update_pos, prob.ub) - @set! particle.position = update_pos - # @set! particle.position = min(particle.position, ub) - - @set! particle.cost = prob.f(particle.position, prob.p) - - if particle.cost < particle.best_cost - @set! particle.best_position = particle.position - @set! particle.best_cost = particle.cost - end - - if particle.best_cost < gbest.cost - @set! gbest.position = particle.best_position - @set! gbest.cost = particle.best_cost - end - - @inbounds gpu_particles[i] = particle - - @inbounds gbest_ref[1] = gbest - - # gpu_particles = convert(SArray, gpu_particles) - end -end - -function pso_solve_gpu!(prob, - gbest, - gpu_particles; - maxiters = 100, - w = 0.7298f0, - wdamp = 1.0f0, - debug = false) - - ## Initialize stuff - - backend = get_backend(gpu_particles) - - kernel = update_particle_states!(backend) - - for i in 1:maxiters - ## Invoke GPU Kernel here - kernel(prob, gpu_particles, gbest, w; ndrange = length(gpu_particles)) - w = w * wdamp - end - - return Array(gbest)[1] -end diff --git a/src/pso_sync_cpu.jl b/src/pso_sync_cpu.jl deleted file mode 100644 index db696e0..0000000 --- a/src/pso_sync_cpu.jl +++ /dev/null @@ -1,43 +0,0 @@ -function update_particles(prob, gpu_particles, gbest_ref, w; c1 = 1.4962f0, - c2 = 1.4962f0) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - i > length(gpu_particles) && return - - # i = 1 - - ## Access the particle - - @inbounds gbest = gbest_ref[1] - - # gpu_particles = convert(MArray, gpu_particles) - - @inbounds particle = gpu_particles[i] - ## Update velocity - - updated_velocity = w .* particle.velocity .+ - c1 .* rand(typeof(particle.velocity)) .* (particle.best_position - - particle.position) .+ - c2 .* rand(typeof(particle.velocity)) .* - (gbest.position - particle.position) - - @set! particle.velocity = updated_velocity - - @set! particle.position = particle.position + particle.velocity - - update_pos = max(particle.position, prob.lb) - update_pos = min(update_pos, prob.ub) - @set! particle.position = update_pos - # @set! particle.position = min(particle.position, ub) - if particle.cost < particle.best_cost - @set! particle.best_position = particle.position - @set! particle.best_cost = particle.cost - end - - @inbounds gpu_particles[i] = particle - - return nothing -end - -gbest = minimum(gpu_particles) - -@set! particle.cost = prob.f(particle.position, prob.p) diff --git a/src/pso_sync_gpu.jl b/src/pso_sync_gpu.jl deleted file mode 100644 index d941c17..0000000 --- a/src/pso_sync_gpu.jl +++ /dev/null @@ -1,59 +0,0 @@ -@kernel function _update_particle_states!(prob, gpu_particles, gbest, w; c1 = 1.4962f0, - c2 = 1.4962f0) - i = @index(Global, Linear) - if i <= length(gpu_particles) - # gpu_particles = convert(MArray, gpu_particles) - - @inbounds particle = gpu_particles[i] - ## Update velocity - - updated_velocity = w .* particle.velocity .+ - c1 .* rand(typeof(particle.velocity)) .* - (particle.best_position - - particle.position) .+ - c2 .* rand(typeof(particle.velocity)) .* - (gbest.position - particle.position) - - @set! particle.velocity = updated_velocity - - @set! particle.position = particle.position + particle.velocity - - update_pos = max(particle.position, prob.lb) - update_pos = min(update_pos, prob.ub) - @set! particle.position = update_pos - - @set! particle.cost = prob.f(particle.position, prob.p) - - if particle.cost < particle.best_cost - @set! particle.best_position = particle.position - @set! particle.best_cost = particle.cost - end - - @inbounds gpu_particles[i] = particle - end -end - -function pso_solve_sync_gpu!(prob, - gbest, - gpu_particles; - maxiters = 100, - w = 0.7298f0, - wdamp = 1.0f0, - debug = false) - backend = get_backend(gpu_particles) - - update_particle_kernel = _update_particle_states!(backend) - - for i in 1:maxiters - update_particle_kernel(prob, - gpu_particles, - gbest, - w; - ndrange = length(gpu_particles)) - best_particle = minimum(gpu_particles) - gbest = PSOGBest(best_particle.position, best_particle.best_cost) - w = w * wdamp - end - - return gbest -end diff --git a/src/solve.jl b/src/solve.jl new file mode 100644 index 0000000..9854ee9 --- /dev/null +++ b/src/solve.jl @@ -0,0 +1,65 @@ +function SciMLBase.__solve(prob::OptimizationProblem, opt::PSOAlogrithm, args...; kwargs...) + lb, ub = check_init_bounds(prob) + prob = remake(prob; lb = lb, ub = ub) + + gbest = pso_solve(prob, opt, args...; kwargs...) + + SciMLBase.build_solution(SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt, + gbest.position, gbest.cost) +end + +function pso_solve(prob::OptimizationProblem, + opt::ParallelPSOKernel, + args...; + kwargs...) + backend = opt.backend + @assert prob.u0 isa SArray + init_gbest, particles = init_particles(prob, opt.num_particles, typeof(prob.u0)) + # TODO: Do the equivalent of cu()/roc() + particles_eltype = eltype(particles) === Float64 ? Float32 : eltype(particles) + gpu_particles = KernelAbstractions.allocate(backend, + particles_eltype, + size(particles)) + copyto!(gpu_particles, particles) + gpu_init_gbest = KernelAbstractions.allocate(backend, typeof(init_gbest), (1,)) + copyto!(gpu_init_gbest, [init_gbest]) + + gbest = vectorized_solve!(prob, + gpu_init_gbest, + gpu_particles, + opt, + Val(opt.global_update), + args...; + kwargs...) + gbest +end + +function pso_solve(prob::OptimizationProblem, + opt::ParallelPSOArray, + args...; + kwargs...) + init_gbest, particles = init_particles(prob, opt.num_particles, typeof(prob.u0)) + gbest = vectorized_solve!(prob, init_gbest, particles, opt, args...; kwargs...) + gbest +end + +function pso_solve(prob::OptimizationProblem, opt::SerialPSO, args...; kwargs...) + init_gbest, particles = init_particles(prob, opt.num_particles, typeof(prob.u0)) + gbest = vectorized_solve!(prob, init_gbest, particles, opt; kwargs...) + gbest +end + +function pso_solve(prob::OptimizationProblem, + opt::ParallelSyncPSOKernel, + args...; + kwargs...) + backend = opt.backend + init_gbest, particles = init_particles(prob, opt.num_particles, typeof(prob.u0)) + particles_eltype = eltype(particles) === Float64 ? Float32 : eltype(particles) + gpu_particles = KernelAbstractions.allocate(backend, particles_eltype, size(particles)) + copyto!(gpu_particles, particles) + init_gbest = init_gbest + + gbest = vectorized_solve!(prob, init_gbest, gpu_particles, opt; kwargs...) + gbest +end diff --git a/src/utils.jl b/src/utils.jl index c7a0d12..0f9d1d2 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -6,7 +6,7 @@ function uniform(dim::Int, lb::AbstractArray{T}, ub::AbstractArray{T}) where {T} return arr end -function init_particles(prob, n_particles) +function init_particles(prob, n_particles, ::Type{T}) where {T <: SArray} dim = length(prob.u0) lb = prob.lb ub = prob.ub @@ -28,7 +28,7 @@ function init_particles(prob, n_particles) gbest_position = SVector{length(gbest_position), eltype(gbest_position)}(gbest_position) gbest_cost = cost_func(gbest_position, p) - particles = PSOParticle[] + particles = SPSOParticle[] for i in 1:n_particles if lb === nothing || (all(isinf, lb) && all(isinf, ub)) position = Array{eltype(prob.u0), 1}(undef, dim) @@ -45,20 +45,20 @@ function init_particles(prob, n_particles) position = SVector{length(position), eltype(position)}(position) velocity = @SArray zeros(eltype(position), dim) cost = cost_func(position, p) - best_position = copy(position) - best_cost = copy(cost) - push!(particles, PSOParticle(position, velocity, cost, best_position, best_cost)) + best_position = position + best_cost = cost + push!(particles, SPSOParticle(position, velocity, cost, best_position, best_cost)) if best_cost < gbest_cost - gbest_position = copy(best_position) - gbest_cost = copy(best_cost) + gbest_position = best_position + gbest_cost = best_cost end end - gbest = PSOGBest(gbest_position, gbest_cost) + gbest = SPSOGBest(gbest_position, gbest_cost) return gbest, convert(Vector{typeof(particles[1])}, particles) end -function init_particles(prob, population, ::CPU) +function init_particles(prob, n_particles, ::Type{T}) where {T <: AbstractArray} dim = length(prob.u0) lb = prob.lb ub = prob.ub @@ -76,11 +76,10 @@ function init_particles(prob, population, ::CPU) else gbest_position = uniform(dim, lb, ub) end + gbest = MPSOGBest(gbest_position, cost_func(gbest_position, prob.p)) - gbest = Gbest(gbest_position, cost_func(gbest_position, data_dict)) - - particles = Particle[] - for i in 1:population + particles = MPSOParticle[] + for i in 1:n_particles if lb === nothing || (all(isinf, lb) && all(isinf, ub)) position = Array{eltype(prob.u0), 1}(undef, dim) for i in 1:dim @@ -93,16 +92,26 @@ function init_particles(prob, population, ::CPU) else position = uniform(dim, lb, ub) end - velocity = zeros(dim) - cost = cost_func(position, data_dict) + velocity = zeros(eltype(position), dim) + cost = cost_func(position, prob.p) best_position = copy(position) best_cost = copy(cost) - push!(particles, Particle(position, velocity, cost, best_position, best_cost)) + push!(particles, MPSOParticle(position, velocity, cost, best_position, best_cost)) if best_cost < gbest.cost gbest.position = copy(best_position) gbest.cost = copy(best_cost) end end - return gbest, particles + return gbest, convert(Vector{typeof(particles[1])}, particles) +end + +function check_init_bounds(prob) + lb = prob.lb === nothing ? fill(eltype(prob.u0)(-Inf), length(prob.u0)) : prob.lb + ub = prob.ub === nothing ? fill(eltype(prob.u0)(Inf), length(prob.u0)) : prob.ub + if prob.u0 isa SArray + lb = SVector{length(lb), eltype(lb)}(lb) + ub = SVector{length(ub), eltype(ub)}(ub) + end + lb, ub end diff --git a/test/Project.toml b/test/Project.toml index dadaeac..7b206a3 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,5 +1,8 @@ [deps] +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f" +SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/gpu.jl b/test/gpu.jl index fe97f68..f4aa32c 100644 --- a/test/gpu.jl +++ b/test/gpu.jl @@ -1,37 +1,47 @@ -Random.seed!(1234) +using PSOGPU, StaticArrays, SciMLBase, Test, LinearAlgebra, Random -## Solving the rosenbrock problem -lb = @SArray ones(Float32, N) -lb = -1 * lb -ub = @SArray fill(Float32(10.0), N) +DEVICE = get(ENV, "GROUP", "CUDA") -function rosenbrock(x, p) - sum(p[2] * (x[i + 1] - x[i]^2)^2 + (p[1] - x[i])^2 for i in 1:(length(x) - 1)) +@eval using $(Symbol(DEVICE)) + +if DEVICE == "CUDA" + backend = CUDABackend() +elseif DEVICE == "AMDGPU" + backend = ROCBackend() end -x0 = @SArray zeros(Float32, N) -p = @SArray Float32[1.0, 100.0] +@testset "Rosenbrock GPU tests $(N)" for N in 2:4 + Random.seed!(1234) + + ## Solving the rosenbrock problem + lb = @SArray ones(Float32, N) + lb = -1 * lb + ub = @SArray fill(Float32(10.0), N) -prob = OptimizationProblem(rosenbrock, x0, p; lb = lb, ub = ub) + function rosenbrock(x, p) + sum(p[2] * (x[i + 1] - x[i]^2)^2 + (p[1] - x[i])^2 for i in 1:(length(x) - 1)) + end -n_particles = 1000 + x0 = @SArray zeros(Float32, N) + p = @SArray Float32[1.0, 100.0] -sol = solve(prob, ParallelPSOKernel(n_particles; backend), maxiters = 500) + prob = OptimizationProblem(rosenbrock, x0, p; lb = lb, ub = ub) -@test sol.objective < 1e-4 + n_particles = 100 -sol = solve(prob, - ParallelSyncPSO(n_particles, backend), - maxiters = 500) + sol = solve(prob, ParallelPSOKernel(n_particles; backend), maxiters = 500) -@test sol.objective < 1e-4 + @test sol.retcode == ReturnCode.Default -prob = OptimizationProblem(rosenbrock, x0, p) + sol = solve(prob, + ParallelPSOKernel(n_particles; backend, global_update = false), + maxiters = 500) -n_particles = 2000 + @test sol.retcode == ReturnCode.Default -sol = solve(prob, - ParallelPSOKernel(n_particles; threaded = true), - maxiters = 500) + sol = solve(prob, + ParallelSyncPSOKernel(n_particles, backend), + maxiters = 500) -@test sol.objective < 1e-4 + @test sol.objective < 6e-4 +end diff --git a/test/regression.jl b/test/regression.jl index 17bc486..9a5b2af 100644 --- a/test/regression.jl +++ b/test/regression.jl @@ -1,63 +1,76 @@ -## Solving the rosenbrock problem -Random.seed!(1) +using PSOGPU, StaticArrays, SciMLBase, Test, LinearAlgebra, Random -lb = @SArray ones(Float32, N) -lb = -1 * lb -ub = @SArray fill(Float32(10.0), N) +@testset "Rosenbrock test dimension = $(N)" for N in 2:4 -function rosenbrock(x, p) - sum(p[2] * (x[i + 1] - x[i]^2)^2 + (p[1] - x[i])^2 for i in 1:(length(x) - 1)) -end + ## Solving the rosenbrock problem + Random.seed!(1234) + lb = @SArray ones(Float32, N) + lb = -1 * lb + ub = @SArray fill(Float32(10.0), N) + + function rosenbrock(x, p) + sum(p[2] * (x[i + 1] - x[i]^2)^2 + (p[1] - x[i])^2 for i in 1:(length(x) - 1)) + end + + x0 = @SArray zeros(Float32, N) + p = @SArray Float32[1.0, 100.0] -x0 = @SArray zeros(Float32, N) -p = @SArray Float32[1.0, 100.0] + array_prob = OptimizationProblem(rosenbrock, + zeros(Float32, N), + Float32[1.0, 100.0]; + lb = lb, + ub = ub) -prob = OptimizationProblem(rosenbrock, x0, p; lb = lb, ub = ub) + prob = OptimizationProblem(rosenbrock, x0, p; lb = lb, ub = ub) -n_particles = 1000 + n_particles = 1000 -sol = solve(prob, - ParallelPSOKernel(n_particles; threaded = true), - maxiters = 500) + sol = solve(array_prob, + ParallelPSOArray(n_particles), + maxiters = 500) -@test sol.objective < 1e-4 + @test sol.objective < 3e-4 -sol = solve(prob, - ParallelPSOKernel(n_particles; threaded = false), - maxiters = 500) + sol = solve(prob, + SerialPSO(n_particles), + maxiters = 600) -@test sol.objective < 1e-4 + @test sol.objective < 1e-4 -lb = @SVector fill(Float32(-Inf), N) -ub = @SVector fill(Float32(Inf), N) -prob = OptimizationProblem(rosenbrock, x0, p; lb = lb, ub = ub, N) + lb = @SVector fill(Float32(-Inf), N) + ub = @SVector fill(Float32(Inf), N) -n_particles = 2000 + array_prob = remake(array_prob; lb = lb, ub = ub) + prob = remake(prob; lb = lb, ub = ub) -sol = solve(prob, - ParallelPSOKernel(n_particles; threaded = true), - maxiters = 500) + n_particles = 2000 -@test sol.objective < 1e-4 + sol = solve(array_prob, + ParallelPSOArray(n_particles), + maxiters = 500) -sol = solve(prob, - ParallelPSOKernel(n_particles; threaded = false), - maxiters = 500) + @test sol.objective < 1e-4 -@test sol.objective < 1e-4 + sol = solve(prob, + SerialPSO(n_particles), + maxiters = 500) -prob = OptimizationProblem(rosenbrock, x0, p) + @test sol.objective < 1e-4 -n_particles = 2000 + array_prob = remake(array_prob; lb = nothing, ub = nothing) + prob = remake(prob; lb = nothing, ub = nothing) -sol = solve(prob, - ParallelPSOKernel(n_particles; threaded = true), - maxiters = 500) + n_particles = 2000 -@test sol.objective < 1e-4 + sol = solve(array_prob, + ParallelPSOArray(n_particles), + maxiters = 500) -sol = solve(prob, - ParallelPSOKernel(n_particles; threaded = false), - maxiters = 500) + @test sol.objective < 1e-4 -@test sol.objective < 1e-4 + sol = solve(prob, + SerialPSO(n_particles), + maxiters = 500) + + @test sol.objective < 1e-4 +end diff --git a/test/runtests.jl b/test/runtests.jl index 9c840e7..b45e416 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,22 +1,10 @@ -using PSOGPU -using Test, StaticArrays, LinearAlgebra, Random +using SafeTestsets +using Test const GROUP = get(ENV, "GROUP", "CPU") -@testset "Rosenbrock test dimension = $(n)" for n in 2:4 - global N = n - include("./regression.jl") -end +@safetestset "Regression tests" include("./regression.jl") if GROUP != "CPU" - @eval using $(Symbol(GROUP)) - if GROUP == "CUDA" - backend = CUDABackend() - elseif GROUP == "AMDGPU" - backend = ROCBackend() - end - - @testset "Rosenbrock on gpu" begin - include("./gpu.jl") - end + @safetestset "GPU optimizers tests" include("./gpu.jl") end