Skip to content

Conversation

simeonschaub
Copy link
Contributor

ref #352

This matches the speed of the C implementation, so there seems to be no inherent overhead compared to OpenCL C:

BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
 Range (min … max):  221.027 μs …  2.753 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     295.657 μs              ┊ GC (median):    0.00%
 Time  (mean ± σ):   309.097 μs ± 74.218 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

          ▁▄▅▇▇█▇▆▄▄▂▁▁                                         
  ▂▂▁▂▃▃▄▆██████████████▆▆▆▆▆▅▄▄▄▄▃▃▃▃▃▃▂▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂ ▄
  221 μs          Histogram: frequency by time          510 μs <

 Memory estimate: 2.42 KiB, allocs estimate: 56.

cc @maleadt

@simeonschaub simeonschaub marked this pull request as draft August 7, 2025 11:49
Copy link
Contributor

github-actions bot commented Aug 7, 2025

Your PR requires formatting changes to meet the project's style guidelines.
Please consider running Runic (`git runic master`) to apply these changes.

Click here to view the suggested changes.
diff --git a/fast_sum.jl b/fast_sum.jl
index e7cba52..fec6952 100644
--- a/fast_sum.jl
+++ b/fast_sum.jl
@@ -12,7 +12,7 @@ function sum_columns_subgroup(X, result, M, N)
     end
 
     partial = 0.0f0
-    for row = row_thread:row_stride:M
+    for row in row_thread:row_stride:M
         idx = (col - 1) * M + row  # column-major layout
         partial += X[idx]
     end
@@ -32,9 +32,9 @@ function sum_columns_subgroup(X, result, M, N)
 
     # Only one thread writes result
     if lane == 1
-       Atomix.@atomic result[col] += partial
+        Atomix.@atomic result[col] += partial
     end
-    nothing
+    return nothing
 end
 
 
diff --git a/lib/intrinsics/src/atomic.jl b/lib/intrinsics/src/atomic.jl
index 08e71e8..6d46c2e 100644
--- a/lib/intrinsics/src/atomic.jl
+++ b/lib/intrinsics/src/atomic.jl
@@ -58,12 +58,14 @@ end
 end
 
 for gentype in [Float32, Float64], as in atomic_memory_types
-@eval begin
+    @eval begin
 
-@device_function atomic_add!(p::LLVMPtr{$gentype,$as}, val::$gentype) =
-    @builtin_ccall("atomic_add", $gentype,
-                   (LLVMPtr{$gentype,$as}, $gentype), p, val)
-end
+        @device_function atomic_add!(p::LLVMPtr{$gentype, $as}, val::$gentype) =
+            @builtin_ccall(
+            "atomic_add", $gentype,
+            (LLVMPtr{$gentype, $as}, $gentype), p, val
+        )
+    end
 end
 
 
diff --git a/lib/intrinsics/src/work_item.jl b/lib/intrinsics/src/work_item.jl
index d9919f2..3f6446b 100644
--- a/lib/intrinsics/src/work_item.jl
+++ b/lib/intrinsics/src/work_item.jl
@@ -39,41 +39,47 @@ end
 export sub_group_shuffle, sub_group_shuffle_xor
 
 for (jltype, llvmtype, julia_type_str) in [
-        (Int8,    "i8",    :Int8),
-        (UInt8,   "i8",    :UInt8),
-        (Int16,   "i16",   :Int16),
-        (UInt16,  "i16",   :UInt16),
-        (Int32,   "i32",   :Int32),
-        (UInt32,  "i32",   :UInt32),
-        (Int64,   "i64",   :Int64),
-        (UInt64,  "i64",   :UInt64),
-        (Float16, "half",  :Float16),
+        (Int8, "i8", :Int8),
+        (UInt8, "i8", :UInt8),
+        (Int16, "i16", :Int16),
+        (UInt16, "i16", :UInt16),
+        (Int32, "i32", :Int32),
+        (UInt32, "i32", :UInt32),
+        (Int64, "i64", :Int64),
+        (UInt64, "i64", :UInt64),
+        (Float16, "half", :Float16),
         (Float32, "float", :Float32),
-        (Float64, "double",:Float64)
+        (Float64, "double", :Float64),
     ]
     @eval begin
         export sub_group_shuffle, sub_group_shuffle_xor
         function sub_group_shuffle(x::$jltype, idx::Integer)
-            Base.llvmcall(
-                $("""
-                declare $llvmtype @__spirv_GroupNonUniformShuffle(i32, $llvmtype, i32)
-                define $llvmtype @entry($llvmtype %val, i32 %idx) #0 {
-                    %res = call $llvmtype @__spirv_GroupNonUniformShuffle(i32 3, $llvmtype %val, i32 %idx)
-                    ret $llvmtype %res
-                }
-                attributes #0 = { alwaysinline }
-                """, "entry"), $julia_type_str, Tuple{$julia_type_str, Int32}, x, Int32(idx))
+            return Base.llvmcall(
+                $(
+                    """
+                    declare $llvmtype @__spirv_GroupNonUniformShuffle(i32, $llvmtype, i32)
+                    define $llvmtype @entry($llvmtype %val, i32 %idx) #0 {
+                        %res = call $llvmtype @__spirv_GroupNonUniformShuffle(i32 3, $llvmtype %val, i32 %idx)
+                        ret $llvmtype %res
+                    }
+                    attributes #0 = { alwaysinline }
+                    """, "entry",
+                ), $julia_type_str, Tuple{$julia_type_str, Int32}, x, Int32(idx)
+            )
         end
         function sub_group_shuffle_xor(x::$jltype, mask::Integer)
-            Base.llvmcall(
-                $("""
-                declare $llvmtype @__spirv_GroupNonUniformShuffleXor(i32, $llvmtype, i32)
-                define $llvmtype @entry($llvmtype %val, i32 %mask) #0 {
-                    %res = call $llvmtype @__spirv_GroupNonUniformShuffleXor(i32 3, $llvmtype %val, i32 %mask)
-                    ret $llvmtype %res
-                }
-                attributes #0 = { alwaysinline }
-                """, "entry"), $julia_type_str, Tuple{$julia_type_str, Int32}, x, Int32(mask))
+            return Base.llvmcall(
+                $(
+                    """
+                    declare $llvmtype @__spirv_GroupNonUniformShuffleXor(i32, $llvmtype, i32)
+                    define $llvmtype @entry($llvmtype %val, i32 %mask) #0 {
+                        %res = call $llvmtype @__spirv_GroupNonUniformShuffleXor(i32 3, $llvmtype %val, i32 %mask)
+                        ret $llvmtype %res
+                    }
+                    attributes #0 = { alwaysinline }
+                    """, "entry",
+                ), $julia_type_str, Tuple{$julia_type_str, Int32}, x, Int32(mask)
+            )
         end
     end
 end

Copy link

codecov bot commented Aug 7, 2025

Codecov Report

✅ All modified and coverable lines are covered by tests.
✅ Project coverage is 78.86%. Comparing base (a387890) to head (6eac710).

Additional details and impacted files
@@           Coverage Diff           @@
##           master     #356   +/-   ##
=======================================
  Coverage   78.86%   78.86%           
=======================================
  Files          12       12           
  Lines         672      672           
=======================================
  Hits          530      530           
  Misses        142      142           

☔ View full report in Codecov by Sentry.
📢 Have feedback on the report? Share it here.

🚀 New features to boost your workflow:
  • ❄️ Test Analytics: Detect flaky tests, report on failures, and find test suite problems.

@maleadt
Copy link
Member

maleadt commented Aug 7, 2025

This matches the speed of the C implementation, so there seems to be no inherent overhead compared to OpenCL C:

That's great to hear, at least. I presume that the cartesian indexing introduced by the more complicated `mapreducedim!` kernel (or by KA.jl for all kernels) is going to incur a performance hit.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

2 participants