Update algorithms to select a KernelAbstractions backend instead of a `gpu` flag

utkarsh530 · utkarsh530 · commit 14698414db2c · 2023-11-28T00:10:03.000-05:00
diff --git a/examples/neural_network/nn.jl b/examples/neural_network/nn.jl
@@ -2,7 +2,7 @@ using SimpleChains
 using IterTools
 using MLDatasets
 using Random
-dataset =  MLDatasets.Iris().dataframe
+dataset = MLDatasets.Iris().dataframe
 
 data = Array(dataset)
 data = data[shuffle(1:end), :]
@@ -17,11 +17,9 @@ function mapstrtoclass(flower)
     end
 end
 ytrain = map(mapstrtoclass, data[:, 5])
-lenet = SimpleChain(
-    static(4),
+lenet = SimpleChain(static(4),
     TurboDense{true}(tanh, 20),
-    TurboDense{true}(identity, 3),
-)
+    TurboDense{true}(identity, 3))
 lenetloss = SimpleChains.add_loss(lenet, LogitCrossEntropyLoss(ytrain))
 
 p = SimpleChains.init_params(lenet);
@@ -30,18 +28,16 @@ G = SimpleChains.alloc_threaded_grad(lenet);
 
 lenetloss(xtrain, p)
 
-report = let mlpdloss = lenetloss, X=xtrain
-p -> begin
-      let train = mlpdloss(X, p)
-        @info "Loss:" train
-      end
+report = let mlpdloss = lenetloss, X = xtrain
+    p -> begin
+        let train = mlpdloss(X, p)
+            @info "Loss:" train
+        end
     end
 end
 
 for _ in 1:3
-    @time SimpleChains.train_unbatched!(
-      G, p, lenetloss, xtrain, SimpleChains.ADAM(), 5000
-    );
+    @time SimpleChains.train_unbatched!(G, p, lenetloss, xtrain, SimpleChains.ADAM(), 5000)
     report(p)
 end
 
@@ -53,10 +49,10 @@ using Optimization, PSOGPU
 
 lb = -ones(length(p)) .* 10
 ub = ones(length(p)) .* 10
-prob = OptimizationProblem((u,data) -> lenetloss(data, u), p, xtrain; lb = lb, ub = ub)
+prob = OptimizationProblem((u, data) -> lenetloss(data, u), p, xtrain; lb = lb, ub = ub)
 
 n_particles = 1000
 
 sol = solve(prob,
-    ParallelPSOKernel(n_particles; gpu = false, threaded = true),
-    maxiters = 1000)
+    ParallelPSOKernel(n_particles; threaded = true),
+    maxiters = 1000)
diff --git a/src/PSOGPU.jl b/src/PSOGPU.jl
@@ -19,20 +19,20 @@ struct PSOGBest{T1, T2 <: eltype(T1)}
     cost::T2
 end
 
-struct ParallelPSOKernel
+struct ParallelPSOKernel{Backend}
     num_particles::Int
     async::Bool
-    gpu::Bool
     threaded::Bool
+    backend::Backend
 end
-struct ParallelSyncPSO
+struct ParallelSyncPSO{Backend}
     num_particles::Int
+    backend::Backend
 end
 
 function ParallelPSOKernel(num_particles::Int;
-        async = false,
-        gpu = false, threaded = false)
-    ParallelPSOKernel(num_particles, async, gpu, threaded)
+        async = false, threaded = false, backend = CPU())
+    ParallelPSOKernel(num_particles, async, threaded, backend)
 end
 
 SciMLBase.allowsbounds(::ParallelPSOKernel) = true
@@ -55,7 +55,8 @@ function SciMLBase.__solve(prob::OptimizationProblem,
 
     prob = remake(prob; lb = lb, ub = ub)
 
-    if !(opt.gpu)
+    ## TODO: Benchmark the KA kernels on the CPU backend against the CPU implementations
+    if opt.backend isa CPU
         if opt.threaded
             gbest = PSO(prob; population = opt.num_particles, kwargs...)
         else
@@ -67,7 +68,9 @@ function SciMLBase.__solve(prob::OptimizationProblem,
         init_gbest, particles = init_particles(prob, opt.num_particles)
         # TODO: Do the equivalent of cu()/roc()
         particles_eltype = eltype(particles) === Float64 ? Float32 : eltype(particles)
-        gpu_particles = KernelAbstractions.allocate(backend, particles_eltype, size(particles))
+        gpu_particles = KernelAbstractions.allocate(backend,
+            particles_eltype,
+            size(particles))
         copyto!(gpu_particles, particles)
         gpu_init_gbest = KernelAbstractions.allocate(backend, typeof(init_gbest), (1,))
         copyto!(gpu_init_gbest, [init_gbest])
@@ -90,9 +93,11 @@ function SciMLBase.__solve(prob::OptimizationProblem,
     ub = prob.ub === nothing ? fill(eltype(prob.u0)(Inf), length(prob.u0)) : prob.ub
 
     prob = remake(prob; lb = lb, ub = ub)
-
+    backend = opt.backend
     init_gbest, particles = init_particles(prob, opt.num_particles)
-    gpu_particles = cu(particles)
+    particles_eltype = eltype(particles) === Float64 ? Float32 : eltype(particles)
+    gpu_particles = KernelAbstractions.allocate(backend, particles_eltype, size(particles))
+    copyto!(gpu_particles, particles)
     init_gbest = init_gbest
     gbest = pso_solve_sync_gpu!(prob, init_gbest, gpu_particles; kwargs...)
 
diff --git a/src/ode_pso.jl b/src/ode_pso.jl
@@ -5,7 +5,8 @@
         @inbounds particle = gpu_particles[i]
 
         updated_velocity = w .* particle.velocity .+
-                           c1 .* rand(typeof(particle.velocity)) .* (particle.best_position -
+                           c1 .* rand(typeof(particle.velocity)) .*
+                           (particle.best_position -
                             particle.position) .+
                            c2 .* rand(typeof(particle.velocity)) .*
                            (gbest.position - particle.position)
@@ -54,8 +55,8 @@ function parameter_estim_ode!(prob::ODEProblem,
         prob_func = default_prob_func,
         w = 0.72980f0,
         wdamp = 1.0f0,
-        maxiters = 100,
-        backend = CPU(), kwargs...)
+        maxiters = 100, kwargs...)
+    backend = get_backend(gpu_particles)
     update_states! = PSOGPU._update_particle_states!(backend)
 
     losses = KernelAbstractions.ones(backend, 1, length(gpu_particles))
@@ -69,7 +70,7 @@ function parameter_estim_ode!(prob::ODEProblem,
             ub,
             gbest,
             w;
-            ndrange=length(gpu_particles))
+            ndrange = length(gpu_particles))
 
         probs = prob_func.(Ref(improb), gpu_particles)
 
@@ -79,7 +80,7 @@ function parameter_estim_ode!(prob::ODEProblem,
 
         sum!(losses, (map(x -> sum(x .^ 2), data .- us)))
 
-        update_costs!(losses, gpu_particles; ndrange=length(losses))
+        update_costs!(losses, gpu_particles; ndrange = length(losses))
 
         best_particle = minimum(gpu_particles,
             init = PSOGPU.PSOParticle(gbest.position,
diff --git a/src/pso_async_gpu.jl b/src/pso_async_gpu.jl
@@ -55,13 +55,14 @@ function pso_solve_async_gpu!(prob,
         maxiters = 100,
         w = 0.7298f0,
         wdamp = 1.0f0,
-        debug = false,
-        backend = CPU())
+        debug = false)
 
     ## Initialize stuff
 
+    backend = get_backend(gpu_particles)
+
     kernel = update_particle_states_async!(backend)
-    kernel(prob, gpu_particles, gbest, w, wdamp, maxiters; ndrange=length(gpu_particles))
+    kernel(prob, gpu_particles, gbest, w, wdamp, maxiters; ndrange = length(gpu_particles))
 
     best_particle = minimum(gpu_particles)
     return PSOGBest(best_particle.best_position, best_particle.best_cost)
diff --git a/src/pso_gpu.jl b/src/pso_gpu.jl
@@ -14,7 +14,8 @@
         ## Update velocity
 
         updated_velocity = w .* particle.velocity .+
-                           c1 .* rand(typeof(particle.velocity)) .* (particle.best_position -
+                           c1 .* rand(typeof(particle.velocity)) .*
+                           (particle.best_position -
                             particle.position) .+
                            c2 .* rand(typeof(particle.velocity)) .*
                            (gbest.position - particle.position)
@@ -54,16 +55,17 @@ function pso_solve_gpu!(prob,
         maxiters = 100,
         w = 0.7298f0,
         wdamp = 1.0f0,
-        debug = false,
-        backend = CPU())
+        debug = false)
 
     ## Initialize stuff
 
+    backend = get_backend(gpu_particles)
+
     kernel = update_particle_states!(backend)
 
     for i in 1:maxiters
         ## Invoke GPU Kernel here
-        kernel(prob, gpu_particles, gbest, w; ndrange=length(gpu_particles))
+        kernel(prob, gpu_particles, gbest, w; ndrange = length(gpu_particles))
         w = w * wdamp
     end
 
diff --git a/src/pso_sync_gpu.jl b/src/pso_sync_gpu.jl
@@ -8,7 +8,8 @@
         ## Update velocity
 
         updated_velocity = w .* particle.velocity .+
-                           c1 .* rand(typeof(particle.velocity)) .* (particle.best_position -
+                           c1 .* rand(typeof(particle.velocity)) .*
+                           (particle.best_position -
                             particle.position) .+
                            c2 .* rand(typeof(particle.velocity)) .*
                            (gbest.position - particle.position)
@@ -38,14 +39,17 @@ function pso_solve_sync_gpu!(prob,
         maxiters = 100,
         w = 0.7298f0,
         wdamp = 1.0f0,
-        debug = false,
-        backend = CPU())
-    @show minimum(gpu_particles)
+        debug = false)
+    backend = get_backend(gpu_particles)
 
     update_particle_kernel = _update_particle_states!(backend)
 
     for i in 1:maxiters
-        update_particle_kernel(prob, gpu_particles, gbest, w; ndrange=length(gpu_particles))
+        update_particle_kernel(prob,
+            gpu_particles,
+            gbest,
+            w;
+            ndrange = length(gpu_particles))
         best_particle = minimum(gpu_particles)
         gbest = PSOGBest(best_particle.position, best_particle.best_cost)
         w = w * wdamp
diff --git a/test/gpu.jl b/test/gpu.jl
@@ -16,7 +16,7 @@ prob = OptimizationProblem(rosenbrock, x0, p; lb = lb, ub = ub)
 
 n_particles = 1000
 
-sol = solve(prob, ParallelPSOKernel(n_particles; gpu = true), maxiters = 500)
+sol = solve(prob, ParallelPSOKernel(n_particles; backend = CUDABackend()), maxiters = 500)
 
 @test sol.objective < 1e-4
 
@@ -25,7 +25,7 @@ prob = OptimizationProblem(rosenbrock, x0, p)
 n_particles = 2000
 
 sol = solve(prob,
-    ParallelPSOKernel(n_particles; gpu = false, threaded = true),
+    ParallelPSOKernel(n_particles; threaded = true),
     maxiters = 500)
 
 @test sol.objective < 1e-4
diff --git a/test/regression.jl b/test/regression.jl
@@ -17,13 +17,13 @@ prob = OptimizationProblem(rosenbrock, x0, p; lb = lb, ub = ub)
 n_particles = 1000
 
 sol = solve(prob,
-    ParallelPSOKernel(n_particles; gpu = false, threaded = true),
+    ParallelPSOKernel(n_particles; threaded = true),
     maxiters = 500)
 
 @test sol.objective < 1e-4
 
 sol = solve(prob,
-    ParallelPSOKernel(n_particles; gpu = false, threaded = false),
+    ParallelPSOKernel(n_particles; threaded = false),
     maxiters = 500)
 
 @test sol.objective < 1e-4
@@ -35,13 +35,13 @@ prob = OptimizationProblem(rosenbrock, x0, p; lb = lb, ub = ub, N)
 n_particles = 2000
 
 sol = solve(prob,
-    ParallelPSOKernel(n_particles; gpu = false, threaded = true),
+    ParallelPSOKernel(n_particles; threaded = true),
     maxiters = 500)
 
 @test sol.objective < 1e-4
 
 sol = solve(prob,
-    ParallelPSOKernel(n_particles; gpu = false, threaded = false),
+    ParallelPSOKernel(n_particles; threaded = false),
     maxiters = 500)
 
 @test sol.objective < 1e-4
@@ -51,13 +51,13 @@ prob = OptimizationProblem(rosenbrock, x0, p)
 n_particles = 2000
 
 sol = solve(prob,
-    ParallelPSOKernel(n_particles; gpu = false, threaded = true),
+    ParallelPSOKernel(n_particles; threaded = true),
     maxiters = 500)
 
 @test sol.objective < 1e-4
 
 sol = solve(prob,
-    ParallelPSOKernel(n_particles; gpu = false, threaded = false),
+    ParallelPSOKernel(n_particles; threaded = false),
     maxiters = 500)
 
 @test sol.objective < 1e-4