Merge pull request #43 from SciML/u/queuelockfix

utkarsh530 · web-flow · commit 20976b3ae1d1 · 2024-01-20T14:50:48.000-05:00
Update solvers to run on CPUs as well
diff --git a/benchmarks/CPU_vs_GPU/benchmark.jl b/benchmarks/CPU_vs_GPU/benchmark.jl
@@ -3,7 +3,7 @@ using CUDA
 
 device!(2)
 
-N = 10
+N = 3
 function rosenbrock(x, p)
     sum(p[2] * (x[i + 1] - x[i]^2)^2 + (p[1] - x[i])^2 for i in 1:(length(x) - 1))
 end
diff --git a/src/kernels.jl b/src/kernels.jl
@@ -48,19 +48,18 @@ end
     particle = @private SPSOParticle{T1, T2} 1
 
     @inbounds particle[1] = gpu_particles[i]
-    @inbounds gbest = gbest_ref[1]
-
     # Initialize cost to be Inf
     if tidx == 1
-        fill!(best_queue, SPSOGBest(gbest.position, convert(typeof(gbest.cost), Inf)))
+        fill!(best_queue,
+            SPSOGBest(particle[1].position, convert(typeof(particle[1].cost), Inf)))
         queue_num[1] = UInt32(0)
     end
 
     @synchronize
 
     @inbounds particle[1] = update_particle_state(particle[1],
         prob,
-        gbest,
+        gbest_ref[1],
         w,
         c1,
         c2,
@@ -69,8 +68,7 @@ end
 
     @synchronize
 
-    gbest = @inbounds gbest_ref[1]
-    if particle[1].best_cost < gbest.cost
+    @inbounds if particle[1].best_cost < gbest_ref[1].cost
         queue_idx = @atomic queue_num[1] += UInt32(1)
         @inbounds best_queue[queue_idx] = SPSOGBest(particle[1].best_position,
             particle[1].best_cost)
@@ -96,8 +94,7 @@ end
             end
 
             # Update global best fit
-            gbest = @inbounds gbest_ref[1]
-            @inbounds if best_queue[1].cost < gbest.cost
+            @inbounds if best_queue[1].cost < gbest_ref[1].cost
                 gbest_ref[1] = best_queue[1]
             end
 
@@ -153,6 +150,21 @@ end
     @inbounds gpu_particles[i] = particle
 end
 
+# Why does the sync version need separate code for CPUs? It turns out that
+# you cannot do a reduction within a kernel due to some bugs in KA.jl:
+# https://github.com/JuliaGPU/KernelAbstractions.jl/issues/330
+@kernel function update_particle_states!(prob, gpu_particles, gbest, w,
+        opt::ParallelSyncPSOKernel{Backend, T, G, H}; c1 = 1.4962f0,
+        c2 = 1.4962f0) where {Backend <: CPU, T, G, H}
+    # Global linear index of this work-item; selects the particle to advance.
+    idx = @index(Global, Linear)
+    # Load the particle, advance it against the supplied `gbest`, store it back.
+    @inbounds current = gpu_particles[idx]
+    advanced = update_particle_state(current, prob, gbest, w, c1, c2, idx, opt)
+    @inbounds gpu_particles[idx] = advanced
+    # `gbest` is only read here; no global-best reduction happens in this kernel.
+end
+
 @kernel function update_particle_states_async!(prob,
         gpu_particles,
         gbest_ref,
diff --git a/src/lowerlevel_solve.jl b/src/lowerlevel_solve.jl
@@ -28,6 +28,31 @@ function vectorized_solve!(prob,
     return gbest, gpu_particles
 end
 
+# CPU method of the synchronous PSO solve. Each of the `maxiters` iterations launches
+# `update_particle_states!` over all particles with the current `gbest` and `w`, then
+# takes `minimum(gpu_particles)` outside the kernel as the new global best (particle
+# ordering is defined outside this hunk — presumably by best cost; confirm).
+# Returns `(gbest, gpu_particles)`. `debug` is accepted but not used by this method.
+function vectorized_solve!(prob, gbest, gpu_particles,
+        opt::ParallelSyncPSOKernel{Backend, T, G, H};
+        maxiters = 100, w = 0.7298f0, wdamp = 1.0f0,
+        debug = false) where {Backend <: CPU, T, G, H}
+    backend = get_backend(gpu_particles)
+    update_particle_kernel = update_particle_states!(backend)
+
+    for _ in 1:maxiters
+        update_particle_kernel(prob, gpu_particles, gbest, w, opt;
+            ndrange = length(gpu_particles))
+        best_particle = minimum(gpu_particles)
+        # Pair `best_cost` with `best_position`, the point that attained it; the
+        # current `position` need not be where that cost was found.
+        gbest = SPSOGBest(best_particle.best_position, best_particle.best_cost)
+        w = w * wdamp  # damp the `w` handed to the kernel on the next iteration
+    end
+
+    return gbest, gpu_particles
+end
+
 function vectorized_solve!(prob,
         gbest,
         gpu_particles, opt::ParallelPSOKernel, ::Val{true};
diff --git a/test/Project.toml b/test/Project.toml
@@ -1,5 +1,6 @@
 [deps]
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
diff --git a/test/regression.jl b/test/regression.jl
@@ -1,4 +1,4 @@
-using PSOGPU, StaticArrays, SciMLBase, Test, LinearAlgebra, Random
+using PSOGPU, StaticArrays, SciMLBase, Test, LinearAlgebra, Random, KernelAbstractions
 
 @testset "Rosenbrock test dimension = $(N)" for N in 2:3
 
@@ -36,6 +36,18 @@ using PSOGPU, StaticArrays, SciMLBase, Test, LinearAlgebra, Random
 
     @test sol.objective < 1e-4
 
+    sol = solve(prob,
+        ParallelPSOKernel(n_particles; backend = CPU()),
+        maxiters = 500)
+
+    @test sol.objective < 1e-4
+
+    sol = solve(prob,
+        ParallelSyncPSOKernel(n_particles; backend = CPU()),
+        maxiters = 500)
+
+    @test sol.objective < 1e-4
+
     lb = @SVector fill(Float32(-Inf), N)
     ub = @SVector fill(Float32(Inf), N)
 
@@ -54,6 +66,18 @@ using PSOGPU, StaticArrays, SciMLBase, Test, LinearAlgebra, Random
 
     @test sol.objective < 1e-4
 
+    sol = solve(prob,
+        ParallelPSOKernel(n_particles; backend = CPU()),
+        maxiters = 500)
+
+    @test sol.objective < 1e-4
+
+    sol = solve(prob,
+        ParallelSyncPSOKernel(n_particles; backend = CPU()),
+        maxiters = 500)
+
+    @test sol.objective < 1e-4
+
     array_prob = remake(array_prob; lb = nothing, ub = nothing)
     prob = remake(prob; lb = nothing, ub = nothing)
 
@@ -68,6 +92,18 @@ using PSOGPU, StaticArrays, SciMLBase, Test, LinearAlgebra, Random
         maxiters = 500)
 
     @test sol.objective < 1e-4
+
+    sol = solve(prob,
+        ParallelPSOKernel(n_particles; backend = CPU()),
+        maxiters = 500)
+
+    @test sol.objective < 1e-4
+
+    sol = solve(prob,
+        ParallelSyncPSOKernel(n_particles; backend = CPU()),
+        maxiters = 500)
+
+    @test sol.objective < 1e-4
 end
 
 ## Separate tests for N = 4 as the problem becomes non-convex and requires more iterations to converge
@@ -102,6 +138,18 @@ end
 
     @test sol.objective < 2e-3
 
+    sol = solve(prob,
+        ParallelPSOKernel(n_particles; backend = CPU()),
+        maxiters = 1000)
+
+    @test sol.objective < 2e-3
+
+    sol = solve(prob,
+        ParallelSyncPSOKernel(n_particles; backend = CPU()),
+        maxiters = 1000)
+
+    @test sol.objective < 2e-3
+
     lb = @SVector fill(Float32(-Inf), N)
     ub = @SVector fill(Float32(Inf), N)
 
@@ -122,4 +170,16 @@ end
         maxiters = 1000)
 
     @test sol.objective < 2e-3
+
+    sol = solve(prob,
+        ParallelPSOKernel(n_particles; backend = CPU()),
+        maxiters = 1000)
+
+    @test sol.objective < 2e-3
+
+    sol = solve(prob,
+        ParallelSyncPSOKernel(n_particles; backend = CPU()),
+        maxiters = 1000)
+
+    @test sol.objective < 2e-3
 end