Skip to content

Commit 9307b5c

Browse files
authored
Merge pull request #37 from SciML/jps/locking
ParallelPSOKernel: Do proper locking
2 parents f07ee56 + e545e1d commit 9307b5c

File tree

4 files changed

+65
-15
lines changed

4 files changed

+65
-15
lines changed

src/PSOGPU.jl

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@ using SciMLBase, StaticArrays, Setfield, KernelAbstractions
44
using QuasiMonteCarlo, Optimization, SimpleNonlinearSolve, ForwardDiff
55
import Adapt
66
import Enzyme: autodiff_deferred, Active, Reverse
7+
import KernelAbstractions: @atomic, @atomicreplace, @atomicswap
8+
using QuasiMonteCarlo
9+
710
import DiffEqGPU: GPUTsit5, vectorized_asolve, make_prob_compatible
811

912
## Use lb and ub either as StaticArray or pass them separately as CuArrays

src/kernels.jl

Lines changed: 58 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -33,25 +33,71 @@ end
3333
particle
3434
end
3535

# Advance each particle by one PSO step and publish any improved best position
# to `gbest_ref[1]`.
#
# Arguments, as used by the body:
# - `prob`, `w`, `opt`, `c1`, `c2`: forwarded unchanged to `update_particle_state`.
# - `gpu_particles`: array of `SPSOParticle{T1, T2}`; element `i` (the global linear
#   index of the work-item) is read, advanced, and written back.
# - `gbest_ref`: array whose first element holds the current global best
#   (`SPSOGBest`); it is only overwritten while `lock` is held.
# - `lock`: array whose first element is used as a spin lock
#   (`UInt32(0)` = free, `UInt32(1)` = held).
#
# Each work-group collects its improving particles in a local-memory queue, the
# first work-item reduces the queue to the group's best candidate, and only that
# work-item contends for the global lock.
@kernel function update_particle_states!(prob,
        gpu_particles::AbstractArray{SPSOParticle{T1, T2}}, gbest_ref, w,
        opt::ParallelPSOKernel, lock; c1 = 1.4962f0,
        c2 = 1.4962f0) where {T1, T2}
    i = @index(Global, Linear)
    tidx = @index(Local, Linear)

    @uniform gs = @groupsize()[1]

    # Per-group queue of candidates that beat the global best, and its length.
    best_queue = @localmem SPSOGBest{T1, T2} (gs)
    queue_num = @localmem UInt32 1

    # Private storage so the particle survives the barrier below.
    particle = @private SPSOParticle{T1, T2} 1

    # Initialise every queue slot with an infinite cost and reset the counter.
    @inbounds particle[1] = gpu_particles[i]
    best_queue[tidx] = SPSOGBest(particle[1].best_position,
        convert(T2, Inf))
    queue_num[1] = UInt32(0)

    # The queue and its counter must be initialised before anyone enqueues.
    @synchronize

    gbest = @inbounds gbest_ref[1]
    @inbounds particle[1] = update_particle_state(particle[1], prob, gbest, w, c1, c2,
        i, opt)

    # Persist the advanced particle; without this store the step is lost.
    @inbounds gpu_particles[i] = particle[1]

    # Enqueue the freshly updated particle if it improves on the global best.
    if particle[1].best_cost < gbest.cost
        # `@atomic ... +=` yields the incremented value, i.e. a unique 1-based slot.
        queue_idx = @atomic queue_num[1] += UInt32(1)
        @inbounds best_queue[queue_idx] = SPSOGBest(particle[1].best_position,
            particle[1].best_cost)
    end

    # All candidates must be queued before the reduction starts.
    @synchronize

    # A single queued candidate must be published too, hence `> 0` rather than `> 1`.
    if tidx == 1 && queue_num[1] > 0
        # Find best fit in block (the range is empty when only one candidate exists)
        for j in 2:queue_num[1]
            @inbounds if best_queue[j].cost < best_queue[1].cost
                best_queue[1] = best_queue[j]
            end
        end

        # Take lock
        while true
            res = @atomicreplace lock[1] UInt32(0)=>UInt32(1)
            if res.success
                break
            end
        end

        # Update global best fit; re-read it because another group may have
        # published a better value since this group last looked.
        gbest = @inbounds gbest_ref[1]
        @inbounds if best_queue[1].cost < gbest.cost
            gbest_ref[1] = best_queue[1]
        end

        # Release lock, using the same element type it was acquired with
        @atomicswap lock[1] = UInt32(0)
    end
end
56102

57103
@kernel function update_particle_states!(prob,

src/lowerlevel_solve.jl

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,11 +40,13 @@ function vectorized_solve!(prob,
4040

4141
backend = get_backend(gpu_particles)
4242

43-
kernel = update_particle_states!(backend)
43+
kernel = update_particle_states!(backend, 1024)
4444

45+
lock = KernelAbstractions.allocate(backend, UInt32, 1)
46+
fill!(lock, UInt32(0))
4547
for i in 1:maxiters
4648
## Invoke GPU Kernel here
47-
kernel(prob, gpu_particles, gbest, w, opt; ndrange = length(gpu_particles))
49+
kernel(prob, gpu_particles, gbest, w, opt, lock; ndrange = length(gpu_particles))
4850
w = w * wdamp
4951
end
5052

test/Project.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
[deps]
2-
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
32
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
43
Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba"
54
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"

0 commit comments

Comments
 (0)