JuliaGPU · brabreda · Aug 14, 2023 · Aug 14, 2023 · Sep 9, 2023 · Aug 28, 2023
diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
@@ -139,10 +139,10 @@ function unsafe_free! end
 # - @groupsize
 # - @ndrange
 ###
-
 function groupsize end
 function ndrange end
 
+
 """
     @groupsize()
 
@@ -657,6 +657,7 @@ function __synchronize()
     error("@synchronize used outside kernel or not captured")
 end
 
+
 @generated function __print(items...)
     str = ""
     args = []
@@ -700,6 +701,7 @@ end
     @inbounds A[I] = B[I]
 end
 
+
 # CPU backend
 
 include("cpu.jl")
@@ -726,4 +728,7 @@ end
     end
 end
 
+# groupreduce
+include("reduce.jl")
+
 end #module
diff --git a/src/reduce.jl b/src/reduce.jl
@@ -0,0 +1,49 @@
+export @groupreduce
+
+"""
+    @groupreduce(op, val, neutral)
+
+Reduce `val` across a workgroup with `op`; the first work-item receives the result.
+- `op`: the binary operator of the reduction
+- `val`: the value that each work-item contributes to the reduction
+- `neutral`: neutral element of `op`, so that `op(neutral, x) == x`
+"""
+macro groupreduce(op, val, neutral)
+    quote
+        let v = $(esc(val))  # evaluate `val` once; `typeof(v)` is its runtime type
+            $__groupreduce($(esc(:__ctx__)), $(esc(op)), v, $(esc(neutral)), typeof(v))
+        end
+    end
+end
+
+# Tree reduction through workgroup-local memory; backs the `@groupreduce` macro.
+# - `__ctx__`: the caller's kernel context, forwarded by `@groupreduce`
+# - `op`, `val`, `neutral`: operator, this work-item's contribution, neutral element of `op`
+# - `T`: element type of the local scratch buffer
+# Returns the reduction on the first work-item; the others get their own `val` back.
+# NOTE(review): `@synchronize` runs outside an `@kernel` body here; confirm backends allow it.
+@inline function __groupreduce(__ctx__, op, val, neutral, ::Type{T}) where {T}
+    idx_in_group = @index(Local)
+    # The local index is linear, so size the buffer by the whole (possibly N-d) workgroup.
+    groupsize = prod(@groupsize())
+    localmem = @localmem(T, groupsize)
+    @inbounds localmem[idx_in_group] = val
+    # After the pass with stride `d`, slot `2d*k + 1` holds the reduction of `2d` slots.
+    d = 1
+    while d < groupsize
+        @synchronize()  # publish the previous pass's writes before they are read
+        index = 2 * d * (idx_in_group - 1) + 1
+        @inbounds if index <= groupsize
+            # A missing partner (size not a power of two) contributes `neutral`.
+            other_val = index + d <= groupsize ? localmem[index + d] : neutral
+            localmem[index] = op(localmem[index], other_val)
+        end
+        d *= 2
+    end
+
+    # The first work-item wrote slot 1 itself in every pass, so no extra barrier is needed.
+    if idx_in_group == 1
+        val = @inbounds localmem[idx_in_group]
+    end
+    return val
+end
diff --git a/test/reduce.jl b/test/reduce.jl
@@ -0,0 +1,39 @@
+using KernelAbstractions, Test
+
+
+
+
+# Test kernel: reduce `a` over one workgroup with `op` and store the result in `b[1]`.
+@kernel function reduce(a, b, op, neutral)
+    idx_in_group = @index(Local)
+    val = @groupreduce(op, a[idx_in_group], neutral)
+    # Only the first work-item holds the group-wide result, so only it may write.
+    if idx_in_group == 1
+        b[1] = val
+    end
+end
+
+# Suite entry point: run the one-group check for each (op, neutral-function) pair and type.
+function reduce_testsuite(backend, ArrayT)
+    @testset "groupreduce one group" begin
+        ops = ((+, zero), (*, one), (max, typemin), (min, typemax))
+        @testset for (op, neutral) in ops, type in (Int32, Float32, Float64)
+            @test test_1group_groupreduce(backend, ArrayT, op, type, neutral(type))
+        end
+    end
+end
+
+# Check that a one-workgroup `@groupreduce` over 32 random `type` values matches the host.
+function test_1group_groupreduce(backend, ArrayT, op, type, neutral)
+    a = rand(type, 32)
+    b = ArrayT(a)
+    c = similar(b, 1)
+    # Take the backend from the device array so the launch always matches `ArrayT`.
+    dev = KernelAbstractions.get_backend(b)
+    reduce(dev, 32)(b, c, op, neutral; ndrange = 32)  # one workgroup of 32 work-items
+    KernelAbstractions.synchronize(dev)
+    return Array(c)[1] ≈ Base.reduce(op, a)  # `≈`: the tree reorders float operations
+end
+
+
+