Skip to content

Commit f07ee56

Browse files
authored
Merge pull request #39 from SciML/u/shmem
Update ParallelSyncPSO to use Shared Memory
2 parents 9960c48 + d8cb83b commit f07ee56

File tree

3 files changed

+55
-7
lines changed

3 files changed

+55
-7
lines changed

src/PSOGPU.jl

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,11 @@ function Base.isless(a::PSOGPU.SPSOParticle{T1, T2},
3838
a.best_cost < b.best_cost
3939
end
4040

41+
"""
    isless(a::SPSOGBest, b::SPSOGBest)

Order two global-best records by their `cost` field.

The comparison is delegated to `isless` on the costs rather than `<`, so the ordering is
total: a `NaN` cost sorts after every regular cost. With `<`, any comparison involving
`NaN` is `false`, which lets a `NaN` record survive an `isless`-based `min`/`minimum`
depending on the order in which operands are combined.
"""
function Base.isless(a::PSOGPU.SPSOGBest{T1, T2},
        b::PSOGPU.SPSOGBest{T1, T2}) where {T1, T2}
    return isless(a.cost, b.cost)
end
45+
4146
function Base.typemax(::Type{PSOGPU.SPSOParticle{T1, T2}}) where {T1, T2}
4247
PSOGPU.SPSOParticle{T1, T2}(similar(T1),
4348
similar(T1),
@@ -46,6 +51,11 @@ function Base.typemax(::Type{PSOGPU.SPSOParticle{T1, T2}}) where {T1, T2}
4651
typemax(T2))
4752
end
4853

54+
"""
    typemax(::Type{SPSOGBest{T1, T2}})

Build an `SPSOGBest{T1, T2}` whose cost is `typemax(T2)`.

The position is obtained from `similar(T1)`, so its contents are unspecified and must not
be relied upon; only the cost of the returned record is meaningful.
"""
function Base.typemax(::Type{PSOGPU.SPSOGBest{T1, T2}}) where {T1, T2}
    placeholder_position = similar(T1)
    worst_cost = typemax(T2)
    return PSOGPU.SPSOGBest{T1, T2}(placeholder_position, worst_cost)
end
58+
4959
include("./algorithms.jl")
5060
include("./utils.jl")
5161
include("./ode_pso.jl")

src/kernels.jl

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,15 +54,48 @@ end
5454
@inbounds gpu_particles[i] = particle
5555
end
5656

57-
# Advance every particle by one PSO step and reduce each workgroup's best result.
#
# Each work-item updates `gpu_particles[i]` through `update_particle_state` and publishes
# the particle's best position/cost to workgroup-local memory. The work-items of a group
# then run a pairwise min-reduction on `cost`, and the first work-item of the group stores
# the winner in `block_particles[gidx]`, so the caller only has to reduce one entry per
# workgroup.
#
# The reduction halves the active range with a rounded-up midpoint, so it is correct for
# any workgroup size, not only for powers of two.
@kernel function update_particle_states!(prob,
        gpu_particles::AbstractArray{SPSOParticle{T1, T2}}, block_particles, gbest, w,
        opt::ParallelSyncPSOKernel; c1 = 1.4962f0,
        c2 = 1.4962f0) where {T1, T2}
    i = @index(Global, Linear)
    tidx = @index(Local, Linear)
    gidx = @index(Group, Linear)

    @uniform gs = @groupsize()[1]

    # One candidate slot per work-item of the group.
    group_particles = @localmem SPSOGBest{T1, T2} (gs)

    # Seed every slot with an infinite cost so that a slot no work-item writes to cannot
    # win the reduction (presumably relevant when the last group is only partially
    # filled -- confirm against the backend's handling of `ndrange`).
    if tidx == 1
        fill!(group_particles, SPSOGBest(gbest.position, convert(typeof(gbest.cost), Inf)))
    end

    @synchronize

    @inbounds particle = gpu_particles[i]

    particle = update_particle_state(particle, prob, gbest, w, c1, c2, i, opt)

    @inbounds group_particles[tidx] = SPSOGBest(particle.best_position, particle.best_cost)

    # Number of leading slots that still hold reduction candidates.
    active = gs

    while active > 1
        # Rounded-up half: for an odd `active` the middle slot has no partner and is
        # carried over unchanged into the next round instead of being dropped.
        half = (active + 1) ÷ 2
        @synchronize
        if tidx + half <= active
            @inbounds if group_particles[tidx].cost > group_particles[tidx + half].cost
                group_particles[tidx] = group_particles[tidx + half]
            end
        end
        active = half
    end

    @synchronize

    # Slot 1 now holds the lowest-cost candidate of this workgroup.
    if tidx == 1
        @inbounds block_particles[gidx] = group_particles[tidx]
    end

    @inbounds gpu_particles[i] = particle
end
68101

src/lowerlevel_solve.jl

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,16 +7,21 @@ function vectorized_solve!(prob,
77
debug = false)
88
backend = get_backend(gpu_particles)
99

10-
update_particle_kernel = update_particle_states!(backend)
10+
## TODO: Determine the workgroup size dynamically instead of hard-coding the 1024 cap
11+
workgroupsize = (min(length(gpu_particles), 1024),)
1112

13+
update_particle_kernel = update_particle_states!(backend, workgroupsize)
14+
15+
block_particles = KernelAbstractions.allocate(backend,
16+
typeof(gbest),
17+
cld(length(gpu_particles), workgroupsize[1]))
1218
for i in 1:maxiters
1319
update_particle_kernel(prob,
14-
gpu_particles,
20+
gpu_particles, block_particles,
1521
gbest,
1622
w, opt;
17-
ndrange = length(gpu_particles))
18-
best_particle = minimum(gpu_particles)
19-
gbest = SPSOGBest(best_particle.position, best_particle.best_cost)
23+
ndrange = length(gpu_particles), workgroupsize)
24+
gbest = minimum(block_particles)
2025
w = w * wdamp
2126
end
2227

0 commit comments

Comments
 (0)