Merge pull request #37 from SciML/jps/locking

utkarsh530 · web-flow · commit 9307b5c12713 · 2024-01-19T12:02:59.000-05:00
ParallelPSOKernel: Do proper locking
diff --git a/src/PSOGPU.jl b/src/PSOGPU.jl
@@ -4,6 +4,9 @@ using SciMLBase, StaticArrays, Setfield, KernelAbstractions
 using QuasiMonteCarlo, Optimization, SimpleNonlinearSolve, ForwardDiff
 import Adapt
 import Enzyme: autodiff_deferred, Active, Reverse
+import KernelAbstractions: @atomic, @atomicreplace, @atomicswap
+using QuasiMonteCarlo
+
 import DiffEqGPU: GPUTsit5, vectorized_asolve, make_prob_compatible
 
 ## Use lb and ub either as StaticArray or pass them separately as CuArrays
diff --git a/src/kernels.jl b/src/kernels.jl
@@ -33,25 +33,71 @@ end
     particle
 end
 
-@kernel function update_particle_states!(prob, gpu_particles, gbest_ref, w,
-        opt::ParallelPSOKernel; c1 = 1.4962f0,
-        c2 = 1.4962f0)
+# Update every particle in place, then merge each workgroup's best improving candidate
+# into gbest_ref[1] under lock[1], a UInt32 spin lock (0 = free, 1 = held).
+@kernel function update_particle_states!(prob,
+        gpu_particles::AbstractArray{SPSOParticle{T1, T2}}, gbest_ref, w,
+        opt::ParallelPSOKernel, lock; c1 = 1.4962f0,
+        c2 = 1.4962f0) where {T1, T2}
     i = @index(Global, Linear)
+    tidx = @index(Local, Linear)
 
-    @inbounds gbest = gbest_ref[1]
-    @inbounds particle = gpu_particles[i]
+    @uniform gs = @groupsize()[1]
 
-    particle = update_particle_state(particle, prob, gbest, w, c1, c2, i, opt)
+    best_queue = @localmem SPSOGBest{T1, T2} (gs) # per-workgroup candidate queue
+    queue_num = @localmem UInt32 1 # number of candidates queued so far
+    particle = @private SPSOParticle{T1, T2} 1
+    # Initialize cost to be Inf (placeholder until a candidate is queued)
+    @inbounds particle[1] = gpu_particles[i]
+    best_queue[tidx] = SPSOGBest(particle[1].best_position,
+                                 convert(T2, Inf))
+    queue_num[1] = UInt32(0)
+
+    @synchronize
+
+    gbest = @inbounds gbest_ref[1]
+    @inbounds particle[1] = update_particle_state(particle[1], prob, gbest, w, c1, c2, i, opt)
+    # Store the updated particle; without this write the step above is discarded
+    @inbounds gpu_particles[i] = particle[1]
+
+    @synchronize
 
-    ## NOTE: This causes thread races to update global best particle.
-    if particle.best_cost < gbest.cost
-        @set! gbest.position = particle.best_position
-        @set! gbest.cost = particle.best_cost
+    gbest = @inbounds gbest_ref[1] # re-read; this pre-filter runs without the lock
+    if particle[1].best_cost < gbest.cost
+        queue_idx = @atomic queue_num[1] += UInt32(1) # result = this thread's slot
+        @inbounds best_queue[queue_idx] = SPSOGBest(particle[1].best_position,
+                                                    particle[1].best_cost)
     end
 
-    @inbounds gbest_ref[1] = gbest
+    @synchronize
 
-    @inbounds gpu_particles[i] = particle
+    if tidx == 1 # a single thread per workgroup reduces the queue and publishes
+        if queue_num[1] > 0 # also publish when exactly one candidate was queued
+            # Find best fit in block
+            for j in 2:queue_num[1]
+                @inbounds if best_queue[j].cost < best_queue[1].cost
+                    best_queue[1] = best_queue[j]
+                end
+            end
+
+            # Take lock: spin until the 0 => 1 replace succeeds
+            while true
+                res = @atomicreplace lock[1] UInt32(0)=>UInt32(1)
+                if res.success
+                    break
+                end
+            end
+
+            # Update global best fit (re-checked while holding the lock)
+            gbest = @inbounds gbest_ref[1]
+            @inbounds if best_queue[1].cost < gbest.cost
+                gbest_ref[1] = best_queue[1]
+            end
+
+            # Release lock
+            @atomicswap lock[1] = UInt32(0)
+        end
+    end
 end
 
 @kernel function update_particle_states!(prob,
diff --git a/src/lowerlevel_solve.jl b/src/lowerlevel_solve.jl
@@ -40,11 +40,13 @@ function vectorized_solve!(prob,
 
     backend = get_backend(gpu_particles)
 
-    kernel = update_particle_states!(backend)
+    kernel = update_particle_states!(backend, 1024)
 
+    lock = KernelAbstractions.allocate(backend, UInt32, 1)
+    fill!(lock, UInt32(0))
     for i in 1:maxiters
         ## Invoke GPU Kernel here
-        kernel(prob, gpu_particles, gbest, w, opt; ndrange = length(gpu_particles))
+        kernel(prob, gpu_particles, gbest, w, opt, lock; ndrange = length(gpu_particles))
         w = w * wdamp
     end
 
diff --git a/test/Project.toml b/test/Project.toml
@@ -1,5 +1,4 @@
 [deps]
-CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"

Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,4 @@`
`1`	`1`	`[deps]`
`2`		`-CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"`
`3`	`2`	`LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"`
`4`	`3`	`Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba"`
`5`	`4`	`Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"`