diff --git a/src/PSOGPU.jl b/src/PSOGPU.jl
index f4110e6..fa1aaef 100644
--- a/src/PSOGPU.jl
+++ b/src/PSOGPU.jl
@@ -6,120 +6,51 @@ import DiffEqGPU: GPUTsit5, vectorized_asolve, make_prob_compatible
 
 ## Use lb and ub either as StaticArray or pass them separately as CuArrays
 ## Passing as CuArrays makes more sense, or maybe SArray? The based on no. of dimension
-struct PSOParticle{T1, T2 <: eltype(T1)}
+struct SPSOParticle{T1, T2 <: eltype(T1)}
     position::T1
     velocity::T1
     cost::T2
     best_position::T1
     best_cost::T2
 end
-
-struct PSOGBest{T1, T2 <: eltype(T1)}
+struct SPSOGBest{T1, T2 <: eltype(T1)}
     position::T1
     cost::T2
 end
 
-struct ParallelPSOKernel{Backend}
-    num_particles::Int
-    async::Bool
-    threaded::Bool
-    backend::Backend
-end
-struct ParallelSyncPSO{Backend}
-    num_particles::Int
-    backend::Backend
-end
-
-function ParallelPSOKernel(num_particles::Int;
-        async = false, threaded = false, backend = CPU())
-    ParallelPSOKernel(num_particles, async, threaded, backend)
-end
-
-SciMLBase.allowsbounds(::ParallelPSOKernel) = true
-SciMLBase.allowsbounds(::ParallelSyncPSO) = true
-# SciMLBase.requiresbounds(::ParallelPSOKernel) = true
-
-include("./pso_cpu.jl")
-include("./pso_gpu.jl")
-include("./pso_async_gpu.jl")
-include("./utils.jl")
-include("./pso_sync_gpu.jl")
-include("./ode_pso.jl")
-
-function SciMLBase.__solve(prob::OptimizationProblem,
-        opt::ParallelPSOKernel,
-        args...;
-        kwargs...)
-    lb = prob.lb === nothing ? fill(eltype(prob.u0)(-Inf), length(prob.u0)) : prob.lb
-    ub = prob.ub === nothing ? fill(eltype(prob.u0)(Inf), length(prob.u0)) : prob.ub
-
-    prob = remake(prob; lb = lb, ub = ub)
-
-    ## TODO: Compare the performance of KA kernels with CPU backend with CPU implementations
-    if opt.backend isa CPU
-        if opt.threaded
-            gbest = PSO(prob; population = opt.num_particles, kwargs...)
-        else
-            init_gbest, particles = init_particles(prob, opt.num_particles)
-            gbest = pso_solve_cpu!(prob, init_gbest, particles; kwargs...)
-        end
-    else
-        backend = opt.backend
-        init_gbest, particles = init_particles(prob, opt.num_particles)
-        # TODO: Do the equivalent of cu()/roc()
-        particles_eltype = eltype(particles) === Float64 ? Float32 : eltype(particles)
-        gpu_particles = KernelAbstractions.allocate(backend,
-            particles_eltype,
-            size(particles))
-        copyto!(gpu_particles, particles)
-        gpu_init_gbest = KernelAbstractions.allocate(backend, typeof(init_gbest), (1,))
-        copyto!(gpu_init_gbest, [init_gbest])
-        if opt.async
-            gbest = pso_solve_async_gpu!(prob, gpu_init_gbest, gpu_particles; kwargs...)
-        else
-            gbest = pso_solve_gpu!(prob, gpu_init_gbest, gpu_particles; kwargs...)
-        end
-    end
-
-    SciMLBase.build_solution(SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt,
-        gbest.position, gbest.cost)
+mutable struct MPSOParticle{T}
+    position::AbstractArray{T}
+    velocity::AbstractArray{T}
+    cost::T
+    best_position::AbstractArray{T}
+    best_cost::T
 end
-
-function SciMLBase.__solve(prob::OptimizationProblem,
-        opt::ParallelSyncPSO,
-        args...;
-        kwargs...)
-    lb = prob.lb === nothing ? fill(eltype(prob.u0)(-Inf), length(prob.u0)) : prob.lb
-    ub = prob.ub === nothing ? fill(eltype(prob.u0)(Inf), length(prob.u0)) : prob.ub
-
-    prob = remake(prob; lb = lb, ub = ub)
-    backend = opt.backend
-    init_gbest, particles = init_particles(prob, opt.num_particles)
-    particles_eltype = eltype(particles) === Float64 ? Float32 : eltype(particles)
-    gpu_particles = KernelAbstractions.allocate(backend, particles_eltype, size(particles))
-    copyto!(gpu_particles, particles)
-    init_gbest = init_gbest
-    gbest = pso_solve_sync_gpu!(prob, init_gbest, gpu_particles; kwargs...)
-
-    SciMLBase.build_solution(SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt,
-        gbest.position, gbest.cost)
+mutable struct MPSOGBest{T}
+    position::AbstractArray{T}
+    cost::T
 end
 
-using Base
-
 ## required overloads for min or max computation on particles
-function Base.isless(a::PSOGPU.PSOParticle{T1, T2},
-        b::PSOGPU.PSOParticle{T1, T2}) where {T1, T2}
+function Base.isless(a::PSOGPU.SPSOParticle{T1, T2},
+        b::PSOGPU.SPSOParticle{T1, T2}) where {T1, T2}
     a.best_cost < b.best_cost
 end
 
-function Base.typemax(::Type{PSOGPU.PSOParticle{T1, T2}}) where {T1, T2}
-    PSOGPU.PSOParticle{T1, T2}(similar(T1),
+function Base.typemax(::Type{PSOGPU.SPSOParticle{T1, T2}}) where {T1, T2}
+    PSOGPU.SPSOParticle{T1, T2}(similar(T1),
         similar(T1),
         typemax(T2),
         similar(T1),
         typemax(T2))
 end
 
-export ParallelPSOKernel, ParallelSyncPSO, OptimizationProblem, solve
+include("./algorithms.jl")
+include("./utils.jl")
+include("./ode_pso.jl")
+include("./kernels.jl")
+include("./lowerlevel_solve.jl")
+include("./solve.jl")
+
+export ParallelPSOKernel,
+    ParallelSyncPSOKernel, ParallelPSOArray, SerialPSO, OptimizationProblem, solve
 end
diff --git a/src/algorithms.jl b/src/algorithms.jl
new file mode 100644
index 0000000..bf12e80
--- /dev/null
+++ b/src/algorithms.jl
@@ -0,0 +1,99 @@
+
+abstract type PSOAlogrithm end
+
+"""
+```julia
+ParallelPSOKernel(num_particles = 100)
+```
+
+Particle Swarm Optimization on a GPU. Creates and launches a kernel which updates the particle states in parallel
+on a GPU. Static Arrays for parameters in the `OptimizationProblem` are required for successful GPU compilation.
+
+## Positional arguments:
+
+- num_particles: Number of particles in the simulation
+- global_update: defaults to `true`. Setting it to false allows particles to evolve completely on their own,
+  i.e. no information is sent about the global best position.
+- backend: defaults to `CPU()`. The KernelAbstractions backend for performing the computation.
+
+## Limitations
+
+Running the optimization with `global_update=true` updates the global best positions with possible thread races.
+This is the price to be paid to fuse all the updates into a single kernel. Techniques such as queue lock and atomic
+updates can be used to fix this.
+
+"""
+struct ParallelPSOKernel{Backend} <: PSOAlogrithm
+    num_particles::Int
+    global_update::Bool
+    backend::Backend
+end
+
+"""
+```julia
+ParallelSyncPSOKernel(num_particles = 100)
+```
+
+Particle Swarm Optimization on a GPU. Creates and launches a kernel which updates the particle states in parallel
+on a GPU. However, it requires a synchronization after each generation to calculate the global best position of the particles.
+
+## Positional arguments:
+
+- num_particles: Number of particles in the simulation
+- backend: defaults to `CPU()`. The KernelAbstractions backend for performing the computation.
+
+"""
+struct ParallelSyncPSOKernel{Backend} <: PSOAlogrithm
+    num_particles::Int
+    backend::Backend
+end
+
+"""
+```julia
+ParallelPSOArray(num_particles = 100)
+```
+Particle Swarm Optimization on a CPU. It keeps the arrays used in particle data structure
+to be Julia's `Array`, which may be better for high-dimensional problems.
+
+## Positional arguments:
+
+- num_particles: Number of particles in the simulation
+
+
+## Limitations
+
+Running the optimization updates the global best positions with possible thread races.
+This is the price to be paid to fuse all the updates into a single kernel. Techniques such as queue lock and atomic
+updates can be used to fix this.
+
+"""
+struct ParallelPSOArray <: PSOAlogrithm
+    num_particles::Int
+end
+
+"""
+```julia
+SerialPSO(num_particles = 100)
+```
+Serial Particle Swarm Optimization on a CPU.
+
+## Positional arguments:
+
+- num_particles: Number of particles in the simulation
+
+"""
+struct SerialPSO <: PSOAlogrithm
+    num_particles::Int
+end
+
+function ParallelPSOKernel(num_particles::Int;
+        global_update = true, backend = CPU())
+    ParallelPSOKernel(num_particles, global_update, backend)
+end
+
+function ParallelSyncPSOKernel(num_particles::Int;
+        backend = CPU())
+    ParallelSyncPSOKernel(num_particles, backend)
+end
+
+SciMLBase.allowsbounds(::PSOAlogrithm) = true
diff --git a/src/kernels.jl b/src/kernels.jl
new file mode 100644
index 0000000..14655d6
--- /dev/null
+++ b/src/kernels.jl
@@ -0,0 +1,84 @@
+@inline function update_particle_state(particle, prob, gbest, w, c1, c2)
+    updated_velocity = w .* particle.velocity .+
+                       c1 .* rand(typeof(particle.velocity)) .*
+                       (particle.best_position -
+                        particle.position) .+
+                       c2 .* rand(typeof(particle.velocity)) .*
+                       (gbest.position - particle.position)
+
+    @set! particle.velocity = updated_velocity
+
+    @set! particle.position = particle.position + particle.velocity
+
+    update_pos = max.(particle.position, prob.lb)
+    update_pos = min.(update_pos, prob.ub)
+    @set! particle.position = update_pos
+
+    @set! particle.cost = prob.f(particle.position, prob.p)
+
+    if particle.cost < particle.best_cost
+        @set! particle.best_position = particle.position
+        @set! particle.best_cost = particle.cost
+    end
+    particle
+end
+
+@kernel function update_particle_states!(prob, gpu_particles, gbest_ref, w,
+        opt::ParallelPSOKernel; c1 = 1.4962f0,
+        c2 = 1.4962f0)
+    i = @index(Global, Linear)
+
+    @inbounds gbest = gbest_ref[1]
+    @inbounds particle = gpu_particles[i]
+
+    particle = update_particle_state(particle, prob, gbest, w, c1, c2)
+
+    ## NOTE: This causes thread races to update global best particle.
+    if particle.best_cost < gbest.cost
+        @set! gbest.position = particle.best_position
+        @set! gbest.cost = particle.best_cost
+    end
+
+    @inbounds gbest_ref[1] = gbest
+
+    @inbounds gpu_particles[i] = particle
+end
+
+@kernel function update_particle_states!(prob, gpu_particles, gbest, w,
+        opt::ParallelSyncPSOKernel; c1 = 1.4962f0,
+        c2 = 1.4962f0)
+    i = @index(Global, Linear)
+
+    @inbounds particle = gpu_particles[i]
+
+    particle = update_particle_state(particle, prob, gbest, w, c1, c2)
+
+    @inbounds gpu_particles[i] = particle
+end
+
+@kernel function update_particle_states_async!(prob,
+        gpu_particles,
+        gbest_ref,
+        w, wdamp, maxiters;
+        c1 = 1.4962f0,
+        c2 = 1.4962f0)
+    i = @index(Global, Linear)
+
+    gbest = gbest_ref[1]
+
+    ## Access the particle
+    @inbounds particle = gpu_particles[i]
+
+    ## Run all generations
+    for i in 1:maxiters
+        particle = update_particle_state(particle, prob, gbest, w, c1, c2)
+        if particle.best_cost < gbest.cost
+            @set! gbest.position = particle.best_position
+            @set! gbest.cost = particle.best_cost
+        end
+        w = w * wdamp
+    end
+
+    @inbounds gpu_particles[i] = particle
+    @inbounds gbest_ref[1] = gbest
+end
diff --git a/src/lowerlevel_solve.jl b/src/lowerlevel_solve.jl
new file mode 100644
index 0000000..f66f145
--- /dev/null
+++ b/src/lowerlevel_solve.jl
@@ -0,0 +1,141 @@
+function vectorized_solve!(prob,
+        gbest,
+        gpu_particles, opt::ParallelSyncPSOKernel;
+        maxiters = 100,
+        w = 0.7298f0,
+        wdamp = 1.0f0,
+        debug = false)
+    backend = get_backend(gpu_particles)
+
+    update_particle_kernel = update_particle_states!(backend)
+
+    for i in 1:maxiters
+        update_particle_kernel(prob,
+            gpu_particles,
+            gbest,
+            w, opt;
+            ndrange = length(gpu_particles))
+        best_particle = minimum(gpu_particles)
+        gbest = SPSOGBest(best_particle.position, best_particle.best_cost)
+        w = w * wdamp
+    end
+
+    return gbest
+end
+
+function vectorized_solve!(prob,
+        gbest,
+        gpu_particles, opt::ParallelPSOKernel, ::Val{true};
+        maxiters = 100,
+        w = 0.7298f0,
+        wdamp = 1.0f0,
+        debug = false)
+
+    ## Initialize stuff
+
+    backend = get_backend(gpu_particles)
+
+    kernel = update_particle_states!(backend)
+
+    for i in 1:maxiters
+        ## Invoke GPU Kernel here
+        kernel(prob, gpu_particles, gbest, w, opt; ndrange = length(gpu_particles))
+        w = w * wdamp
+    end
+
+    return Array(gbest)[1]
+end
+
+function vectorized_solve!(prob,
+        gbest,
+        gpu_particles, opt::ParallelPSOKernel, ::Val{false};
+        maxiters = 100,
+        w = 0.7298f0,
+        wdamp = 1.0f0,
+        debug = false)
+    backend = get_backend(gpu_particles)
+
+    kernel = update_particle_states_async!(backend)
+    kernel(prob, gpu_particles, gbest, w, wdamp, maxiters; ndrange = length(gpu_particles))
+
+    best_particle = minimum(gpu_particles)
+    return SPSOGBest(best_particle.best_position, best_particle.best_cost)
+end
+
+function vectorized_solve!(prob, gbest,
+        particles, opt::ParallelPSOArray;
+        maxiters = 100,
+        w = 0.7298f0,
+        wdamp = 1.0f0,
+        c1 = 1.4962f0,
+        c2 = 1.4962f0,
+        verbose = false)
+    cost_func = prob.f
+    num_particles = length(particles)
+    rand_eltype = eltype(particles[1].velocity)
+    # main loop
+
+    for iter in 1:maxiters
+        Threads.@threads for i in 1:num_particles
+            particles[i].velocity .= w .* particles[i].velocity .+
+                                     c1 .* rand.(rand_eltype) .*
+                                     (particles[i].best_position .-
+                                      particles[i].position) .+
+                                     c2 .* rand.(rand_eltype) .*
+                                     (gbest.position .- particles[i].position)
+
+            particles[i].position .= particles[i].position .+ particles[i].velocity
+            particles[i].position .= max.(particles[i].position, prob.lb)
+            particles[i].position .= min.(particles[i].position, prob.ub)
+
+            particles[i].cost = cost_func(particles[i].position, prob.p)
+
+            if particles[i].cost < particles[i].best_cost
+                copy!(particles[i].best_position, particles[i].position)
+                particles[i].best_cost = particles[i].cost
+
+                ## Possible race condition here
+                if particles[i].best_cost < gbest.cost
+                    copy!(gbest.position, particles[i].best_position)
+                    gbest.cost = particles[i].best_cost
+                end
+            end
+        end
+        w = w * wdamp
+    end
+    gbest
+end
+
+function update_particle_states_cpu!(prob, particles, gbest_ref, w; c1 = 1.4962f0,
+        c2 = 1.4962f0)
+    gbest = gbest_ref[]
+
+    for i in eachindex(particles)
+        @inbounds particle = particles[i]
+        particle = update_particle_state(particle, prob, gbest, w, c1, c2)
+
+        if particle.best_cost < gbest.cost
+            @set! gbest.position = particle.best_position
+            @set! gbest.cost = particle.best_cost
+        end
+
+        particles[i] = particle
+    end
+    gbest_ref[] = gbest
+    return nothing
+end
+
+function vectorized_solve!(prob,
+        gbest,
+        particles, opt::SerialPSO;
+        maxiters = 100,
+        w = 0.7298f0,
+        wdamp = 1.0f0,
+        debug = false)
+    sol_ref = Ref(gbest)
+    for i in 1:maxiters
+        update_particle_states_cpu!(prob, particles, sol_ref, w)
+        w = w * wdamp
+    end
+    return sol_ref[]
+end
diff --git a/src/ode_pso.jl b/src/ode_pso.jl
index 0b779e2..461af5d 100644
--- a/src/ode_pso.jl
+++ b/src/ode_pso.jl
@@ -83,13 +83,13 @@ function parameter_estim_ode!(prob::ODEProblem,
         update_costs!(losses, gpu_particles; ndrange = length(losses))
 
         best_particle = minimum(gpu_particles,
-            init = PSOGPU.PSOParticle(gbest.position,
+            init = PSOGPU.SPSOParticle(gbest.position,
                 gbest.position,
                 gbest.cost,
                 gbest.position,
                 gbest.cost))
 
-        gbest = PSOGPU.PSOGBest(best_particle.best_position, best_particle.best_cost)
+        gbest = PSOGPU.SPSOGBest(best_particle.best_position, best_particle.best_cost)
         w = w * wdamp
     end
     return gbest
diff --git a/src/pso_async_gpu.jl b/src/pso_async_gpu.jl
deleted file mode 100644
index 86b1060..0000000
--- a/src/pso_async_gpu.jl
+++ /dev/null
@@ -1,69 +0,0 @@
-@kernel function update_particle_states_async!(prob,
-        gpu_particles,
-        gbest_ref,
-        w, wdamp, maxiters;
-        c1 = 1.4962f0,
-        c2 = 1.4962f0)
-    i = @index(Global, Linear)
-    if i <= length(gpu_particles)
-        gbest = gbest_ref[1]
-
-        ## Access the particle
-        @inbounds particle = gpu_particles[i]
-
-        ## Run all generations
-        for i in 1:maxiters
-            updated_velocity = w .* particle.velocity .+
-                               c1 .* rand(typeof(particle.velocity)) .*
-                               (particle.best_position -
-                                particle.position) .+
-                               c2 .* rand(typeof(particle.velocity)) .*
-                               (gbest.position - particle.position)
-
-            @set! particle.velocity = updated_velocity
-
-            @set! particle.position = particle.position + particle.velocity
-
-            update_pos = max(particle.position, prob.lb)
-            update_pos = min(update_pos, prob.ub)
-            @set! particle.position = update_pos
-            # @set! particle.position = min(particle.position, ub)
-
-            @set! particle.cost = prob.f(particle.position, prob.p)
-
-            if particle.cost < particle.best_cost
-                @set! particle.best_position = particle.position
-                @set! particle.best_cost = particle.cost
-            end
-
-            if particle.best_cost < gbest.cost
-                @set! gbest.position = particle.best_position
-                @set! gbest.cost = particle.best_cost
-            end
-            w = w * wdamp
-        end
-
-        @inbounds gpu_particles[i] = particle
-
-        @inbounds gbest_ref[1] = gbest
-    end
-end
-
-function pso_solve_async_gpu!(prob,
-        gbest,
-        gpu_particles;
-        maxiters = 100,
-        w = 0.7298f0,
-        wdamp = 1.0f0,
-        debug = false)
-
-    ## Initialize stuff
-
-    backend = get_backend(gpu_particles)
-
-    kernel = update_particle_states_async!(backend)
-    kernel(prob, gpu_particles, gbest, w, wdamp, maxiters; ndrange = length(gpu_particles))
-
-    best_particle = minimum(gpu_particles)
-    return PSOGBest(best_particle.best_position, best_particle.best_cost)
-end
diff --git a/src/pso_cpu.jl b/src/pso_cpu.jl
deleted file mode 100644
index 84ff81b..0000000
--- a/src/pso_cpu.jl
+++ /dev/null
@@ -1,179 +0,0 @@
-# Based on: https://stackoverflow.com/questions/65342388/why-my-code-in-julia-is-getting-slower-for-higher-iteration
-
-mutable struct Particle{T}
-    position::Array{T, 1}
-    velocity::Array{T, 1}
-    cost::T
-    best_position::Array{T, 1}
-    best_cost::T
-end
-mutable struct Gbest{T}
-    position::Array{T, 1}
-    cost::T
-end
-
-function _init_particles(prob, population)
-    dim = length(prob.u0)
-    lb = prob.lb
-    ub = prob.ub
-    cost_func = prob.f
-
-    if lb === nothing || (all(isinf, lb) && all(isinf, ub))
-        gbest_position = Array{eltype(prob.u0), 1}(undef, dim)
-        for i in 1:dim
-            if abs(prob.u0[i]) > 0
-                gbest_position[i] = prob.u0[i] + rand(eltype(prob.u0)) * abs(prob.u0[i])
-            else
-                gbest_position[i] = rand(eltype(prob.u0))
-            end
-        end
-    else
-        gbest_position = uniform(dim, lb, ub)
-    end
-    gbest = Gbest(gbest_position, cost_func(gbest_position, prob.p))
-
-    particles = Particle[]
-    for i in 1:population
-        if lb === nothing || (all(isinf, lb) && all(isinf, ub))
-            position = Array{eltype(prob.u0), 1}(undef, dim)
-            for i in 1:dim
-                if abs(prob.u0[i]) > 0
-                    position[i] = prob.u0[i] + rand(eltype(prob.u0)) * abs(prob.u0[i])
-                else
-                    position[i] = rand(eltype(prob.u0))
-                end
-            end
-        else
-            position = uniform(dim, lb, ub)
-        end
-        velocity = zeros(eltype(position), dim)
-        cost = cost_func(position, prob.p)
-        best_position = copy(position)
-        best_cost = copy(cost)
-        push!(particles, Particle(position, velocity, cost, best_position, best_cost))
-
-        if best_cost < gbest.cost
-            gbest.position = copy(best_position)
-            gbest.cost = copy(best_cost)
-        end
-    end
-    return gbest, particles
-end
-
-function PSO(prob::OptimizationProblem;
-        maxiters = 100,
-        population = 100,
-        c1 = 1.4962,
-        c2 = 1.4962,
-        w = 0.7298,
-        wdamp = 1.0,
-        verbose = false)
-    dim = length(prob.u0)
-    lb = prob.lb === nothing ? fill(eltype(prob.u0)(-Inf), dim) : prob.lb
-    ub = prob.ub === nothing ? fill(eltype(prob.u0)(Inf), dim) : prob.ub
-    cost_func = prob.f
-    p = prob.p
-
-    gbest, particles = _init_particles(prob, population)
-
-    # main loop
-    for iter in 1:maxiters
-        Threads.@threads for i in 1:population
-            particles[i].velocity .= w .* particles[i].velocity .+
-                                     c1 .* rand(dim) .* (particles[i].best_position .-
-                                      particles[i].position) .+
-                                     c2 .* rand(dim) .*
-                                     (gbest.position .- particles[i].position)
-
-            particles[i].position .= particles[i].position .+ particles[i].velocity
-            particles[i].position .= max.(particles[i].position, lb)
-            particles[i].position .= min.(particles[i].position, ub)
-
-            particles[i].cost = cost_func(particles[i].position, prob.p)
-
-            if particles[i].cost < particles[i].best_cost
-                particles[i].best_position = copy(particles[i].position)
-                particles[i].best_cost = copy(particles[i].cost)
-
-                if particles[i].best_cost < gbest.cost
-                    gbest.position = copy(particles[i].best_position)
-                    gbest.cost = copy(particles[i].best_cost)
-                end
-            end
-        end
-        w = w * wdamp
-        if verbose && iter % 50 == 1
-            println("Iteration " * string(iter) * ": Best Cost = " * string(gbest.cost))
-            println("Best Position = " * string(gbest.position))
-            println()
-        end
-    end
-    gbest
-end
-
-function update_particle_states_cpu!(prob, particles, gbest_ref, w, lb, ub; c1 = 1.4962f0,
-        c2 = 1.4962f0)
-    # i = 1
-
-    ## Access the particle
-
-    # gpu_particles = convert(MArray, gpu_particles)
-
-    gbest = gbest_ref[]
-
-    for i in eachindex(particles)
-        @inbounds particle = particles[i]
-        ## Update velocity
-
-        updated_velocity = w * particle.velocity +
-                           c1 .* rand(typeof(particle.velocity)) .*
-                           (particle.best_position -
-                            particle.position) +
-                           c2 * rand(typeof(particle.velocity)) .*
-                           (gbest.position - particle.position)
-
-        @set! particle.velocity = updated_velocity
-
-        @set! particle.position = particle.position + particle.velocity
-
-        update_pos = max(particle.position, lb)
-        update_pos = min(update_pos, ub)
-        @set! particle.position = update_pos
-        # @set! particle.position = min(particle.position, ub)
-
-        @set! particle.cost = prob.f(particle.position, prob.p)
-
-        if particle.cost < particle.best_cost
-            @set! particle.best_position = particle.position
-            @set! particle.best_cost = particle.cost
-        end
-
-        if particle.best_cost < gbest.cost
-            @set! gbest.position = particle.best_position
-            @set! gbest.cost = particle.best_cost
-        end
-
-        particles[i] = particle
-    end
-    gbest_ref[] = gbest
-    return nothing
-end
-
-function pso_solve_cpu!(prob,
-        gbest,
-        cpu_particles;
-        maxiters = 100,
-        w = 0.7298f0,
-        wdamp = 1.0f0,
-        debug = false)
-    sol_ref = Ref(gbest)
-    lb = prob.lb === nothing ? fill(eltype(prob.u0)(-Inf), length(prob.u0)) : prob.lb
-    ub = prob.ub === nothing ? fill(eltype(prob.u0)(Inf), length(prob.u0)) : prob.ub
-    for i in 1:maxiters
-        ## Invoke GPU Kernel here
-        update_particle_states_cpu!(prob, cpu_particles, sol_ref, w, lb, ub)
-        w = w * wdamp
-    end
-
-    return sol_ref[]
-end
diff --git a/src/pso_gpu.jl b/src/pso_gpu.jl
deleted file mode 100644
index d127cdf..0000000
--- a/src/pso_gpu.jl
+++ /dev/null
@@ -1,73 +0,0 @@
-@kernel function update_particle_states!(prob, gpu_particles, gbest_ref, w; c1 = 1.4962f0,
-        c2 = 1.4962f0)
-    i = @index(Global, Linear)
-    if i <= length(gpu_particles)
-        # i = 1
-
-        ## Access the particle
-
-        @inbounds gbest = gbest_ref[1]
-
-        # gpu_particles = convert(MArray, gpu_particles)
-
-        @inbounds particle = gpu_particles[i]
-        ## Update velocity
-
-        updated_velocity = w .* particle.velocity .+
-                           c1 .* rand(typeof(particle.velocity)) .*
-                           (particle.best_position -
-                            particle.position) .+
-                           c2 .* rand(typeof(particle.velocity)) .*
-                           (gbest.position - particle.position)
-
-        @set! particle.velocity = updated_velocity
-
-        @set! particle.position = particle.position + particle.velocity
-
-        update_pos = max(particle.position, prob.lb)
-        update_pos = min(update_pos, prob.ub)
-        @set! particle.position = update_pos
-        # @set! particle.position = min(particle.position, ub)
-
-        @set! particle.cost = prob.f(particle.position, prob.p)
-
-        if particle.cost < particle.best_cost
-            @set! particle.best_position = particle.position
-            @set! particle.best_cost = particle.cost
-        end
-
-        if particle.best_cost < gbest.cost
-            @set! gbest.position = particle.best_position
-            @set! gbest.cost = particle.best_cost
-        end
-
-        @inbounds gpu_particles[i] = particle
-
-        @inbounds gbest_ref[1] = gbest
-
-        # gpu_particles = convert(SArray, gpu_particles)
-    end
-end
-
-function pso_solve_gpu!(prob,
-        gbest,
-        gpu_particles;
-        maxiters = 100,
-        w = 0.7298f0,
-        wdamp = 1.0f0,
-        debug = false)
-
-    ## Initialize stuff
-
-    backend = get_backend(gpu_particles)
-
-    kernel = update_particle_states!(backend)
-
-    for i in 1:maxiters
-        ## Invoke GPU Kernel here
-        kernel(prob, gpu_particles, gbest, w; ndrange = length(gpu_particles))
-        w = w * wdamp
-    end
-
-    return Array(gbest)[1]
-end
diff --git a/src/pso_sync_cpu.jl b/src/pso_sync_cpu.jl
deleted file mode 100644
index db696e0..0000000
--- a/src/pso_sync_cpu.jl
+++ /dev/null
@@ -1,43 +0,0 @@
-function update_particles(prob, gpu_particles, gbest_ref, w; c1 = 1.4962f0,
-        c2 = 1.4962f0)
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
-    i > length(gpu_particles) && return
-
-    # i = 1
-
-    ## Access the particle
-
-    @inbounds gbest = gbest_ref[1]
-
-    # gpu_particles = convert(MArray, gpu_particles)
-
-    @inbounds particle = gpu_particles[i]
-    ## Update velocity
-
-    updated_velocity = w .* particle.velocity .+
-                       c1 .* rand(typeof(particle.velocity)) .* (particle.best_position -
-                        particle.position) .+
-                       c2 .* rand(typeof(particle.velocity)) .*
-                       (gbest.position - particle.position)
-
-    @set! particle.velocity = updated_velocity
-
-    @set! particle.position = particle.position + particle.velocity
-
-    update_pos = max(particle.position, prob.lb)
-    update_pos = min(update_pos, prob.ub)
-    @set! particle.position = update_pos
-    # @set! particle.position = min(particle.position, ub)
-    if particle.cost < particle.best_cost
-        @set! particle.best_position = particle.position
-        @set! particle.best_cost = particle.cost
-    end
-
-    @inbounds gpu_particles[i] = particle
-
-    return nothing
-end
-
-gbest = minimum(gpu_particles)
-
-@set! particle.cost = prob.f(particle.position, prob.p)
diff --git a/src/pso_sync_gpu.jl b/src/pso_sync_gpu.jl
deleted file mode 100644
index d941c17..0000000
--- a/src/pso_sync_gpu.jl
+++ /dev/null
@@ -1,59 +0,0 @@
-@kernel function _update_particle_states!(prob, gpu_particles, gbest, w; c1 = 1.4962f0,
-        c2 = 1.4962f0)
-    i = @index(Global, Linear)
-    if i <= length(gpu_particles)
-        # gpu_particles = convert(MArray, gpu_particles)
-
-        @inbounds particle = gpu_particles[i]
-        ## Update velocity
-
-        updated_velocity = w .* particle.velocity .+
-                           c1 .* rand(typeof(particle.velocity)) .*
-                           (particle.best_position -
-                            particle.position) .+
-                           c2 .* rand(typeof(particle.velocity)) .*
-                           (gbest.position - particle.position)
-
-        @set! particle.velocity = updated_velocity
-
-        @set! particle.position = particle.position + particle.velocity
-
-        update_pos = max(particle.position, prob.lb)
-        update_pos = min(update_pos, prob.ub)
-        @set! particle.position = update_pos
-
-        @set! particle.cost = prob.f(particle.position, prob.p)
-
-        if particle.cost < particle.best_cost
-            @set! particle.best_position = particle.position
-            @set! particle.best_cost = particle.cost
-        end
-
-        @inbounds gpu_particles[i] = particle
-    end
-end
-
-function pso_solve_sync_gpu!(prob,
-        gbest,
-        gpu_particles;
-        maxiters = 100,
-        w = 0.7298f0,
-        wdamp = 1.0f0,
-        debug = false)
-    backend = get_backend(gpu_particles)
-
-    update_particle_kernel = _update_particle_states!(backend)
-
-    for i in 1:maxiters
-        update_particle_kernel(prob,
-            gpu_particles,
-            gbest,
-            w;
-            ndrange = length(gpu_particles))
-        best_particle = minimum(gpu_particles)
-        gbest = PSOGBest(best_particle.position, best_particle.best_cost)
-        w = w * wdamp
-    end
-
-    return gbest
-end
diff --git a/src/solve.jl b/src/solve.jl
new file mode 100644
index 0000000..9854ee9
--- /dev/null
+++ b/src/solve.jl
@@ -0,0 +1,65 @@
+function SciMLBase.__solve(prob::OptimizationProblem, opt::PSOAlogrithm, args...; kwargs...)
+    lb, ub = check_init_bounds(prob)
+    prob = remake(prob; lb = lb, ub = ub)
+
+    gbest = pso_solve(prob, opt, args...; kwargs...)
+
+    SciMLBase.build_solution(SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt,
+        gbest.position, gbest.cost)
+end
+
+function pso_solve(prob::OptimizationProblem,
+        opt::ParallelPSOKernel,
+        args...;
+        kwargs...)
+    backend = opt.backend
+    @assert prob.u0 isa SArray
+    init_gbest, particles = init_particles(prob, opt.num_particles, typeof(prob.u0))
+    # TODO: Do the equivalent of cu()/roc()
+    particles_eltype = eltype(particles) === Float64 ? Float32 : eltype(particles)
+    gpu_particles = KernelAbstractions.allocate(backend,
+        particles_eltype,
+        size(particles))
+    copyto!(gpu_particles, particles)
+    gpu_init_gbest = KernelAbstractions.allocate(backend, typeof(init_gbest), (1,))
+    copyto!(gpu_init_gbest, [init_gbest])
+
+    gbest = vectorized_solve!(prob,
+        gpu_init_gbest,
+        gpu_particles,
+        opt,
+        Val(opt.global_update),
+        args...;
+        kwargs...)
+    gbest
+end
+
+function pso_solve(prob::OptimizationProblem,
+        opt::ParallelPSOArray,
+        args...;
+        kwargs...)
+    init_gbest, particles = init_particles(prob, opt.num_particles, typeof(prob.u0))
+    gbest = vectorized_solve!(prob, init_gbest, particles, opt, args...; kwargs...)
+    gbest
+end
+
+function pso_solve(prob::OptimizationProblem, opt::SerialPSO, args...; kwargs...)
+    init_gbest, particles = init_particles(prob, opt.num_particles, typeof(prob.u0))
+    gbest = vectorized_solve!(prob, init_gbest, particles, opt; kwargs...)
+    gbest
+end
+
+function pso_solve(prob::OptimizationProblem,
+        opt::ParallelSyncPSOKernel,
+        args...;
+        kwargs...)
+    backend = opt.backend
+    init_gbest, particles = init_particles(prob, opt.num_particles, typeof(prob.u0))
+    particles_eltype = eltype(particles) === Float64 ? Float32 : eltype(particles)
+    gpu_particles = KernelAbstractions.allocate(backend, particles_eltype, size(particles))
+    copyto!(gpu_particles, particles)
+    init_gbest = init_gbest
+
+    gbest = vectorized_solve!(prob, init_gbest, gpu_particles, opt; kwargs...)
+    gbest
+end
diff --git a/src/utils.jl b/src/utils.jl
index c7a0d12..0f9d1d2 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -6,7 +6,7 @@ function uniform(dim::Int, lb::AbstractArray{T}, ub::AbstractArray{T}) where {T}
     return arr
 end
 
-function init_particles(prob, n_particles)
+function init_particles(prob, n_particles, ::Type{T}) where {T <: SArray}
     dim = length(prob.u0)
     lb = prob.lb
     ub = prob.ub
@@ -28,7 +28,7 @@ function init_particles(prob, n_particles)
 
     gbest_position = SVector{length(gbest_position), eltype(gbest_position)}(gbest_position)
     gbest_cost = cost_func(gbest_position, p)
-    particles = PSOParticle[]
+    particles = SPSOParticle[]
     for i in 1:n_particles
         if lb === nothing || (all(isinf, lb) && all(isinf, ub))
             position = Array{eltype(prob.u0), 1}(undef, dim)
@@ -45,20 +45,20 @@ function init_particles(prob, n_particles)
         position = SVector{length(position), eltype(position)}(position)
         velocity = @SArray zeros(eltype(position), dim)
         cost = cost_func(position, p)
-        best_position = copy(position)
-        best_cost = copy(cost)
-        push!(particles, PSOParticle(position, velocity, cost, best_position, best_cost))
+        best_position = position
+        best_cost = cost
+        push!(particles, SPSOParticle(position, velocity, cost, best_position, best_cost))
 
         if best_cost < gbest_cost
-            gbest_position = copy(best_position)
-            gbest_cost = copy(best_cost)
+            gbest_position = best_position
+            gbest_cost = best_cost
         end
     end
-    gbest = PSOGBest(gbest_position, gbest_cost)
+    gbest = SPSOGBest(gbest_position, gbest_cost)
     return gbest, convert(Vector{typeof(particles[1])}, particles)
 end
 
-function init_particles(prob, population, ::CPU)
+function init_particles(prob, n_particles, ::Type{T}) where {T <: AbstractArray}
     dim = length(prob.u0)
     lb = prob.lb
     ub = prob.ub
@@ -76,11 +76,10 @@ function init_particles(prob, population, ::CPU)
     else
         gbest_position = uniform(dim, lb, ub)
     end
+    gbest = MPSOGBest(gbest_position, cost_func(gbest_position, prob.p))
 
-    gbest = Gbest(gbest_position, cost_func(gbest_position, data_dict))
-
-    particles = Particle[]
-    for i in 1:population
+    particles = MPSOParticle[]
+    for i in 1:n_particles
         if lb === nothing || (all(isinf, lb) && all(isinf, ub))
             position = Array{eltype(prob.u0), 1}(undef, dim)
             for i in 1:dim
@@ -93,16 +92,26 @@ function init_particles(prob, population, ::CPU)
         else
             position = uniform(dim, lb, ub)
         end
-        velocity = zeros(dim)
-        cost = cost_func(position, data_dict)
+        velocity = zeros(eltype(position), dim)
+        cost = cost_func(position, prob.p)
         best_position = copy(position)
         best_cost = copy(cost)
-        push!(particles, Particle(position, velocity, cost, best_position, best_cost))
+        push!(particles, MPSOParticle(position, velocity, cost, best_position, best_cost))
 
         if best_cost < gbest.cost
             gbest.position = copy(best_position)
             gbest.cost = copy(best_cost)
         end
     end
-    return gbest, particles
+    return gbest, convert(Vector{typeof(particles[1])}, particles)
+end
+
+function check_init_bounds(prob)
+    lb = prob.lb === nothing ? fill(eltype(prob.u0)(-Inf), length(prob.u0)) : prob.lb
+    ub = prob.ub === nothing ? fill(eltype(prob.u0)(Inf), length(prob.u0)) : prob.ub
+    if prob.u0 isa SArray
+        lb = SVector{length(lb), eltype(lb)}(lb)
+        ub = SVector{length(ub), eltype(ub)}(ub)
+    end
+    lb, ub
 end
diff --git a/test/Project.toml b/test/Project.toml
index dadaeac..7b206a3 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -1,5 +1,8 @@
 [deps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
+SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
diff --git a/test/gpu.jl b/test/gpu.jl
index fe97f68..f4aa32c 100644
--- a/test/gpu.jl
+++ b/test/gpu.jl
@@ -1,37 +1,47 @@
-Random.seed!(1234)
+using PSOGPU, StaticArrays, SciMLBase, Test, LinearAlgebra, Random
 
-## Solving the rosenbrock problem
-lb = @SArray ones(Float32, N)
-lb = -1 * lb
-ub = @SArray fill(Float32(10.0), N)
+DEVICE = get(ENV, "GROUP", "CUDA")
 
-function rosenbrock(x, p)
-    sum(p[2] * (x[i + 1] - x[i]^2)^2 + (p[1] - x[i])^2 for i in 1:(length(x) - 1))
+@eval using $(Symbol(DEVICE))
+
+if DEVICE == "CUDA"
+    backend = CUDABackend()
+elseif DEVICE == "AMDGPU"
+    backend = ROCBackend()
 end
 
-x0 = @SArray zeros(Float32, N)
-p = @SArray Float32[1.0, 100.0]
+@testset "Rosenbrock GPU tests $(N)" for N in 2:4
+    Random.seed!(1234)
+
+    ## Solving the rosenbrock problem
+    lb = @SArray ones(Float32, N)
+    lb = -1 * lb
+    ub = @SArray fill(Float32(10.0), N)
 
-prob = OptimizationProblem(rosenbrock, x0, p; lb = lb, ub = ub)
+    function rosenbrock(x, p)
+        sum(p[2] * (x[i + 1] - x[i]^2)^2 + (p[1] - x[i])^2 for i in 1:(length(x) - 1))
+    end
 
-n_particles = 1000
+    x0 = @SArray zeros(Float32, N)
+    p = @SArray Float32[1.0, 100.0]
 
-sol = solve(prob, ParallelPSOKernel(n_particles; backend), maxiters = 500)
+    prob = OptimizationProblem(rosenbrock, x0, p; lb = lb, ub = ub)
 
-@test sol.objective < 1e-4
+    n_particles = 100
 
-sol = solve(prob,
-    ParallelSyncPSO(n_particles, backend),
-    maxiters = 500)
+    sol = solve(prob, ParallelPSOKernel(n_particles; backend), maxiters = 500)
 
-@test sol.objective < 1e-4
+    @test sol.retcode == ReturnCode.Default
 
-prob = OptimizationProblem(rosenbrock, x0, p)
+    sol = solve(prob,
+        ParallelPSOKernel(n_particles; backend, global_update = false),
+        maxiters = 500)
 
-n_particles = 2000
+    @test sol.retcode == ReturnCode.Default
 
-sol = solve(prob,
-    ParallelPSOKernel(n_particles; threaded = true),
-    maxiters = 500)
+    sol = solve(prob,
+        ParallelSyncPSOKernel(n_particles, backend),
+        maxiters = 500)
 
-@test sol.objective < 1e-4
+    @test sol.objective < 6e-4
+end
diff --git a/test/regression.jl b/test/regression.jl
index 17bc486..9a5b2af 100644
--- a/test/regression.jl
+++ b/test/regression.jl
@@ -1,63 +1,76 @@
-## Solving the rosenbrock problem
-Random.seed!(1)
+using PSOGPU, StaticArrays, SciMLBase, Test, LinearAlgebra, Random
 
-lb = @SArray ones(Float32, N)
-lb = -1 * lb
-ub = @SArray fill(Float32(10.0), N)
+@testset "Rosenbrock test dimension = $(N)" for N in 2:4
 
-function rosenbrock(x, p)
-    sum(p[2] * (x[i + 1] - x[i]^2)^2 + (p[1] - x[i])^2 for i in 1:(length(x) - 1))
-end
+    ## Solving the rosenbrock problem
+    Random.seed!(1234)
+    lb = @SArray ones(Float32, N)
+    lb = -1 * lb
+    ub = @SArray fill(Float32(10.0), N)
+
+    function rosenbrock(x, p)
+        sum(p[2] * (x[i + 1] - x[i]^2)^2 + (p[1] - x[i])^2 for i in 1:(length(x) - 1))
+    end
+
+    x0 = @SArray zeros(Float32, N)
+    p = @SArray Float32[1.0, 100.0]
 
-x0 = @SArray zeros(Float32, N)
-p = @SArray Float32[1.0, 100.0]
+    array_prob = OptimizationProblem(rosenbrock,
+        zeros(Float32, N),
+        Float32[1.0, 100.0];
+        lb = lb,
+        ub = ub)
 
-prob = OptimizationProblem(rosenbrock, x0, p; lb = lb, ub = ub)
+    prob = OptimizationProblem(rosenbrock, x0, p; lb = lb, ub = ub)
 
-n_particles = 1000
+    n_particles = 1000
 
-sol = solve(prob,
-    ParallelPSOKernel(n_particles; threaded = true),
-    maxiters = 500)
+    sol = solve(array_prob,
+        ParallelPSOArray(n_particles),
+        maxiters = 500)
 
-@test sol.objective < 1e-4
+    @test sol.objective < 3e-4
 
-sol = solve(prob,
-    ParallelPSOKernel(n_particles; threaded = false),
-    maxiters = 500)
+    sol = solve(prob,
+        SerialPSO(n_particles),
+        maxiters = 600)
 
-@test sol.objective < 1e-4
+    @test sol.objective < 1e-4
 
-lb = @SVector fill(Float32(-Inf), N)
-ub = @SVector fill(Float32(Inf), N)
-prob = OptimizationProblem(rosenbrock, x0, p; lb = lb, ub = ub, N)
+    lb = @SVector fill(Float32(-Inf), N)
+    ub = @SVector fill(Float32(Inf), N)
 
-n_particles = 2000
+    array_prob = remake(array_prob; lb = lb, ub = ub)
+    prob = remake(prob; lb = lb, ub = ub)
 
-sol = solve(prob,
-    ParallelPSOKernel(n_particles; threaded = true),
-    maxiters = 500)
+    n_particles = 2000
 
-@test sol.objective < 1e-4
+    sol = solve(array_prob,
+        ParallelPSOArray(n_particles),
+        maxiters = 500)
 
-sol = solve(prob,
-    ParallelPSOKernel(n_particles; threaded = false),
-    maxiters = 500)
+    @test sol.objective < 1e-4
 
-@test sol.objective < 1e-4
+    sol = solve(prob,
+        SerialPSO(n_particles),
+        maxiters = 500)
 
-prob = OptimizationProblem(rosenbrock, x0, p)
+    @test sol.objective < 1e-4
 
-n_particles = 2000
+    array_prob = remake(array_prob; lb = nothing, ub = nothing)
+    prob = remake(prob; lb = nothing, ub = nothing)
 
-sol = solve(prob,
-    ParallelPSOKernel(n_particles; threaded = true),
-    maxiters = 500)
+    n_particles = 2000
 
-@test sol.objective < 1e-4
+    sol = solve(array_prob,
+        ParallelPSOArray(n_particles),
+        maxiters = 500)
 
-sol = solve(prob,
-    ParallelPSOKernel(n_particles; threaded = false),
-    maxiters = 500)
+    @test sol.objective < 1e-4
 
-@test sol.objective < 1e-4
+    sol = solve(prob,
+        SerialPSO(n_particles),
+        maxiters = 500)
+
+    @test sol.objective < 1e-4
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 9c840e7..b45e416 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,22 +1,10 @@
-using PSOGPU
-using Test, StaticArrays, LinearAlgebra, Random
+using SafeTestsets
+using Test
 
 const GROUP = get(ENV, "GROUP", "CPU")
 
-@testset "Rosenbrock test dimension = $(n)" for n in 2:4
-    global N = n
-    include("./regression.jl")
-end
+@safetestset "Regression tests" include("./regression.jl")
 
 if GROUP != "CPU"
-    @eval using $(Symbol(GROUP))
-    if GROUP == "CUDA"
-        backend = CUDABackend()
-    elseif GROUP == "AMDGPU"
-        backend = ROCBackend()
-    end
-
-    @testset "Rosenbrock on gpu" begin
-        include("./gpu.jl")
-    end
+    @safetestset "GPU optimizers tests" include("./gpu.jl")
 end