Skip to content

Commit 20976b3

Browse files
authored
Merge pull request #43 from SciML/u/queuelockfix
Update solvers to run CPUs as well
2 parents ec34060 + e1e6e63 commit 20976b3

File tree

5 files changed

+108
-10
lines changed

5 files changed

+108
-10
lines changed

benchmarks/CPU_vs_GPU/benchmark.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ using CUDA
33

44
device!(2)
55

6-
N = 10
6+
N = 3
77
function rosenbrock(x, p)
88
sum(p[2] * (x[i + 1] - x[i]^2)^2 + (p[1] - x[i])^2 for i in 1:(length(x) - 1))
99
end

src/kernels.jl

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -48,19 +48,18 @@ end
4848
particle = @private SPSOParticle{T1, T2} 1
4949

5050
@inbounds particle[1] = gpu_particles[i]
51-
@inbounds gbest = gbest_ref[1]
52-
5351
# Initialize cost to be Inf
5452
if tidx == 1
55-
fill!(best_queue, SPSOGBest(gbest.position, convert(typeof(gbest.cost), Inf)))
53+
fill!(best_queue,
54+
SPSOGBest(particle[1].position, convert(typeof(particle[1].cost), Inf)))
5655
queue_num[1] = UInt32(0)
5756
end
5857

5958
@synchronize
6059

6160
@inbounds particle[1] = update_particle_state(particle[1],
6261
prob,
63-
gbest,
62+
gbest_ref[1],
6463
w,
6564
c1,
6665
c2,
@@ -69,8 +68,7 @@ end
6968

7069
@synchronize
7170

72-
gbest = @inbounds gbest_ref[1]
73-
if particle[1].best_cost < gbest.cost
71+
@inbounds if particle[1].best_cost < gbest_ref[1].cost
7472
queue_idx = @atomic queue_num[1] += UInt32(1)
7573
@inbounds best_queue[queue_idx] = SPSOGBest(particle[1].best_position,
7674
particle[1].best_cost)
@@ -96,8 +94,7 @@ end
9694
end
9795

9896
# Update global best fit
99-
gbest = @inbounds gbest_ref[1]
100-
@inbounds if best_queue[1].cost < gbest.cost
97+
@inbounds if best_queue[1].cost < gbest_ref[1].cost
10198
gbest_ref[1] = best_queue[1]
10299
end
103100

@@ -153,6 +150,21 @@ end
153150
@inbounds gpu_particles[i] = particle
154151
end
155152

153+
# Why you say we need a different code for CPUs for sync version? Turns out
154+
# that you cannot do reduction within a kernel due to some bugs in KA.jl
155+
# https://github.com/JuliaGPU/KernelAbstractions.jl/issues/330
156+
@kernel function update_particle_states!(prob, gpu_particles, gbest, w,
157+
opt::ParallelSyncPSOKernel{Backend, T, G, H}; c1 = 1.4962f0,
158+
c2 = 1.4962f0) where {Backend <: CPU, T, G, H}
159+
i = @index(Global, Linear)
160+
161+
@inbounds particle = gpu_particles[i]
162+
163+
particle = update_particle_state(particle, prob, gbest, w, c1, c2, i, opt)
164+
165+
@inbounds gpu_particles[i] = particle
166+
end
167+
156168
@kernel function update_particle_states_async!(prob,
157169
gpu_particles,
158170
gbest_ref,

src/lowerlevel_solve.jl

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,31 @@ function vectorized_solve!(prob,
2828
return gbest, gpu_particles
2929
end
3030

31+
function vectorized_solve!(prob,
32+
gbest,
33+
gpu_particles, opt::ParallelSyncPSOKernel{Backend, T, G, H};
34+
maxiters = 100,
35+
w = 0.7298f0,
36+
wdamp = 1.0f0,
37+
debug = false) where {Backend <: CPU, T, G, H}
38+
backend = get_backend(gpu_particles)
39+
40+
update_particle_kernel = update_particle_states!(backend)
41+
42+
for i in 1:maxiters
43+
update_particle_kernel(prob,
44+
gpu_particles,
45+
gbest,
46+
w, opt;
47+
ndrange = length(gpu_particles))
48+
best_particle = minimum(gpu_particles)
49+
gbest = SPSOGBest(best_particle.position, best_particle.best_cost)
50+
w = w * wdamp
51+
end
52+
53+
return gbest, gpu_particles
54+
end
55+
3156
function vectorized_solve!(prob,
3257
gbest,
3358
gpu_particles, opt::ParallelPSOKernel, ::Val{true};

test/Project.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
[deps]
22
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
3+
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
34
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
45
Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba"
56
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"

test/regression.jl

Lines changed: 61 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
using PSOGPU, StaticArrays, SciMLBase, Test, LinearAlgebra, Random
1+
using PSOGPU, StaticArrays, SciMLBase, Test, LinearAlgebra, Random, KernelAbstractions
22

33
@testset "Rosenbrock test dimension = $(N)" for N in 2:3
44

@@ -36,6 +36,18 @@ using PSOGPU, StaticArrays, SciMLBase, Test, LinearAlgebra, Random
3636

3737
@test sol.objective < 1e-4
3838

39+
sol = solve(prob,
40+
ParallelPSOKernel(n_particles; backend = CPU()),
41+
maxiters = 500)
42+
43+
@test sol.objective < 1e-4
44+
45+
sol = solve(prob,
46+
ParallelSyncPSOKernel(n_particles; backend = CPU()),
47+
maxiters = 500)
48+
49+
@test sol.objective < 1e-4
50+
3951
lb = @SVector fill(Float32(-Inf), N)
4052
ub = @SVector fill(Float32(Inf), N)
4153

@@ -54,6 +66,18 @@ using PSOGPU, StaticArrays, SciMLBase, Test, LinearAlgebra, Random
5466

5567
@test sol.objective < 1e-4
5668

69+
sol = solve(prob,
70+
ParallelPSOKernel(n_particles; backend = CPU()),
71+
maxiters = 500)
72+
73+
@test sol.objective < 1e-4
74+
75+
sol = solve(prob,
76+
ParallelSyncPSOKernel(n_particles; backend = CPU()),
77+
maxiters = 500)
78+
79+
@test sol.objective < 1e-4
80+
5781
array_prob = remake(array_prob; lb = nothing, ub = nothing)
5882
prob = remake(prob; lb = nothing, ub = nothing)
5983

@@ -68,6 +92,18 @@ using PSOGPU, StaticArrays, SciMLBase, Test, LinearAlgebra, Random
6892
maxiters = 500)
6993

7094
@test sol.objective < 1e-4
95+
96+
sol = solve(prob,
97+
ParallelPSOKernel(n_particles; backend = CPU()),
98+
maxiters = 500)
99+
100+
@test sol.objective < 1e-4
101+
102+
sol = solve(prob,
103+
ParallelSyncPSOKernel(n_particles; backend = CPU()),
104+
maxiters = 500)
105+
106+
@test sol.objective < 1e-4
71107
end
72108

73109
## Separate tests for N = 4 as the problem becomes non-convex and requires more iterations to converge
@@ -102,6 +138,18 @@ end
102138

103139
@test sol.objective < 2e-3
104140

141+
sol = solve(prob,
142+
ParallelPSOKernel(n_particles; backend = CPU()),
143+
maxiters = 1000)
144+
145+
@test sol.objective < 2e-3
146+
147+
sol = solve(prob,
148+
ParallelSyncPSOKernel(n_particles; backend = CPU()),
149+
maxiters = 1000)
150+
151+
@test sol.objective < 2e-3
152+
105153
lb = @SVector fill(Float32(-Inf), N)
106154
ub = @SVector fill(Float32(Inf), N)
107155

@@ -122,4 +170,16 @@ end
122170
maxiters = 1000)
123171

124172
@test sol.objective < 2e-3
173+
174+
sol = solve(prob,
175+
ParallelPSOKernel(n_particles; backend = CPU()),
176+
maxiters = 1000)
177+
178+
@test sol.objective < 2e-3
179+
180+
sol = solve(prob,
181+
ParallelSyncPSOKernel(n_particles; backend = CPU()),
182+
maxiters = 1000)
183+
184+
@test sol.objective < 2e-3
125185
end

0 commit comments

Comments
 (0)