Merge pull request #39 from SciML/u/shmem

utkarsh530 · web-flow · commit f07ee568e9cf · 2024-01-18T17:51:22.000-05:00
Update ParallelSyncPSO to use Shared Memory
diff --git a/src/PSOGPU.jl b/src/PSOGPU.jl
@@ -38,6 +38,11 @@ function Base.isless(a::PSOGPU.SPSOParticle{T1, T2},
     a.best_cost < b.best_cost
 end
 
+function Base.isless(a::PSOGPU.SPSOGBest{T1, T2},
+        b::PSOGPU.SPSOGBest{T1, T2}) where {T1, T2}
+    a.cost < b.cost
+end
+
 function Base.typemax(::Type{PSOGPU.SPSOParticle{T1, T2}}) where {T1, T2}
     PSOGPU.SPSOParticle{T1, T2}(similar(T1),
         similar(T1),
@@ -46,6 +51,11 @@ function Base.typemax(::Type{PSOGPU.SPSOParticle{T1, T2}}) where {T1, T2}
         typemax(T2))
 end
 
+function Base.typemax(::Type{PSOGPU.SPSOGBest{T1, T2}}) where {T1, T2}
+    PSOGPU.SPSOGBest{T1, T2}(similar(T1),
+        typemax(T2))
+end
+
 include("./algorithms.jl")
 include("./utils.jl")
 include("./ode_pso.jl")
diff --git a/src/kernels.jl b/src/kernels.jl
@@ -54,15 +54,48 @@ end
     @inbounds gpu_particles[i] = particle
 end
 
-@kernel function update_particle_states!(prob, gpu_particles, gbest, w,
+@kernel function update_particle_states!(prob,
+        gpu_particles::AbstractArray{SPSOParticle{T1, T2}}, block_particles, gbest, w,
         opt::ParallelSyncPSOKernel; c1 = 1.4962f0,
-        c2 = 1.4962f0)
+        c2 = 1.4962f0) where {T1, T2}
     i = @index(Global, Linear)
+    tidx = @index(Local, Linear)
+    gidx = @index(Group, Linear)
+
+    @uniform gs = @groupsize()[1]
+
+    group_particles = @localmem SPSOGBest{T1, T2} (gs)
+
+    if tidx == 1
+        fill!(group_particles, SPSOGBest(gbest.position, convert(typeof(gbest.cost), Inf)))
+    end
+
+    @synchronize
 
     @inbounds particle = gpu_particles[i]
 
     particle = update_particle_state(particle, prob, gbest, w, c1, c2, i, opt)
 
+    @inbounds group_particles[tidx] = SPSOGBest(particle.best_position, particle.best_cost)
+
+    stride = gs ÷ 2
+
+    while stride >= 1
+        @synchronize
+        if tidx <= stride
+            @inbounds if group_particles[tidx].cost > group_particles[tidx + stride].cost
+                group_particles[tidx] = group_particles[tidx + stride]
+            end
+        end
+        stride = stride ÷ 2
+    end
+
+    @synchronize
+
+    if tidx == 1
+        @inbounds block_particles[gidx] = group_particles[tidx]
+    end
+
     @inbounds gpu_particles[i] = particle
 end
 
diff --git a/src/lowerlevel_solve.jl b/src/lowerlevel_solve.jl
@@ -7,16 +7,21 @@ function vectorized_solve!(prob,
         debug = false)
     backend = get_backend(gpu_particles)
 
-    update_particle_kernel = update_particle_states!(backend)
+    ## TODO: Get workgroupsize dynamically; the kernel reduction assumes a power-of-two size
+    workgroupsize = (min(length(gpu_particles), 1024),)
 
+    update_particle_kernel = update_particle_states!(backend, workgroupsize)
+
+    block_particles = KernelAbstractions.allocate(backend,
+        typeof(gbest),
+        cld(length(gpu_particles), workgroupsize[1]))
     for i in 1:maxiters
         update_particle_kernel(prob,
-            gpu_particles,
+            gpu_particles, block_particles,
             gbest,
             w, opt;
-            ndrange = length(gpu_particles))
-        best_particle = minimum(gpu_particles)
-        gbest = SPSOGBest(best_particle.position, best_particle.best_cost)
+            ndrange = length(gpu_particles), workgroupsize)
+        gbest = minimum(block_particles)
         w = w * wdamp
     end