Skip to content

Conversation

simeonschaub
Copy link
Contributor

ref #352

This matches the speed of the C implementation, so there seems to be no inherent overhead compared to OpenCL C:

BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
 Range (min … max):  221.027 μs …  2.753 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     295.657 μs              ┊ GC (median):    0.00%
 Time  (mean ± σ):   309.097 μs ± 74.218 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

          ▁▄▅▇▇█▇▆▄▄▂▁▁                                         
  ▂▂▁▂▃▃▄▆██████████████▆▆▆▆▆▅▄▄▄▄▃▃▃▃▃▃▂▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂ ▄
  221 μs          Histogram: frequency by time          510 μs <

 Memory estimate: 2.42 KiB, allocs estimate: 56.

cc @maleadt

@simeonschaub simeonschaub marked this pull request as draft August 7, 2025 11:49
Copy link
Contributor

github-actions bot commented Aug 7, 2025

Your PR requires formatting changes to meet the project's style guidelines.
Please consider running Runic (`git runic master`) to apply these changes.

Click here to view the suggested changes.
diff --git a/fast_sum.jl b/fast_sum.jl
index e7cba52..fec6952 100644
--- a/fast_sum.jl
+++ b/fast_sum.jl
@@ -12,7 +12,7 @@ function sum_columns_subgroup(X, result, M, N)
     end
 
     partial = 0.0f0
-    for row = row_thread:row_stride:M
+    for row in row_thread:row_stride:M
         idx = (col - 1) * M + row  # column-major layout
         partial += X[idx]
     end
@@ -32,9 +32,9 @@ function sum_columns_subgroup(X, result, M, N)
 
     # Only one thread writes result
     if lane == 1
-       Atomix.@atomic result[col] += partial
+        Atomix.@atomic result[col] += partial
     end
-    nothing
+    return nothing
 end
 
 
diff --git a/lib/intrinsics/src/atomic.jl b/lib/intrinsics/src/atomic.jl
index 08e71e8..6d46c2e 100644
--- a/lib/intrinsics/src/atomic.jl
+++ b/lib/intrinsics/src/atomic.jl
@@ -58,12 +58,14 @@ end
 end
 
 for gentype in [Float32, Float64], as in atomic_memory_types
-@eval begin
+    @eval begin
 
-@device_function atomic_add!(p::LLVMPtr{$gentype,$as}, val::$gentype) =
-    @builtin_ccall("atomic_add", $gentype,
-                   (LLVMPtr{$gentype,$as}, $gentype), p, val)
-end
+        @device_function atomic_add!(p::LLVMPtr{$gentype, $as}, val::$gentype) =
+            @builtin_ccall(
+            "atomic_add", $gentype,
+            (LLVMPtr{$gentype, $as}, $gentype), p, val
+        )
+    end
 end
 
 
diff --git a/lib/intrinsics/src/work_item.jl b/lib/intrinsics/src/work_item.jl
index d9919f2..3f6446b 100644
--- a/lib/intrinsics/src/work_item.jl
+++ b/lib/intrinsics/src/work_item.jl
@@ -39,41 +39,47 @@ end
 export sub_group_shuffle, sub_group_shuffle_xor
 
 for (jltype, llvmtype, julia_type_str) in [
-        (Int8,    "i8",    :Int8),
-        (UInt8,   "i8",    :UInt8),
-        (Int16,   "i16",   :Int16),
-        (UInt16,  "i16",   :UInt16),
-        (Int32,   "i32",   :Int32),
-        (UInt32,  "i32",   :UInt32),
-        (Int64,   "i64",   :Int64),
-        (UInt64,  "i64",   :UInt64),
-        (Float16, "half",  :Float16),
+        (Int8, "i8", :Int8),
+        (UInt8, "i8", :UInt8),
+        (Int16, "i16", :Int16),
+        (UInt16, "i16", :UInt16),
+        (Int32, "i32", :Int32),
+        (UInt32, "i32", :UInt32),
+        (Int64, "i64", :Int64),
+        (UInt64, "i64", :UInt64),
+        (Float16, "half", :Float16),
         (Float32, "float", :Float32),
-        (Float64, "double",:Float64)
+        (Float64, "double", :Float64),
     ]
     @eval begin
         export sub_group_shuffle, sub_group_shuffle_xor
         function sub_group_shuffle(x::$jltype, idx::Integer)
-            Base.llvmcall(
-                $("""
-                declare $llvmtype @__spirv_GroupNonUniformShuffle(i32, $llvmtype, i32)
-                define $llvmtype @entry($llvmtype %val, i32 %idx) #0 {
-                    %res = call $llvmtype @__spirv_GroupNonUniformShuffle(i32 3, $llvmtype %val, i32 %idx)
-                    ret $llvmtype %res
-                }
-                attributes #0 = { alwaysinline }
-                """, "entry"), $julia_type_str, Tuple{$julia_type_str, Int32}, x, Int32(idx))
+            return Base.llvmcall(
+                $(
+                    """
+                    declare $llvmtype @__spirv_GroupNonUniformShuffle(i32, $llvmtype, i32)
+                    define $llvmtype @entry($llvmtype %val, i32 %idx) #0 {
+                        %res = call $llvmtype @__spirv_GroupNonUniformShuffle(i32 3, $llvmtype %val, i32 %idx)
+                        ret $llvmtype %res
+                    }
+                    attributes #0 = { alwaysinline }
+                    """, "entry",
+                ), $julia_type_str, Tuple{$julia_type_str, Int32}, x, Int32(idx)
+            )
         end
         function sub_group_shuffle_xor(x::$jltype, mask::Integer)
-            Base.llvmcall(
-                $("""
-                declare $llvmtype @__spirv_GroupNonUniformShuffleXor(i32, $llvmtype, i32)
-                define $llvmtype @entry($llvmtype %val, i32 %mask) #0 {
-                    %res = call $llvmtype @__spirv_GroupNonUniformShuffleXor(i32 3, $llvmtype %val, i32 %mask)
-                    ret $llvmtype %res
-                }
-                attributes #0 = { alwaysinline }
-                """, "entry"), $julia_type_str, Tuple{$julia_type_str, Int32}, x, Int32(mask))
+            return Base.llvmcall(
+                $(
+                    """
+                    declare $llvmtype @__spirv_GroupNonUniformShuffleXor(i32, $llvmtype, i32)
+                    define $llvmtype @entry($llvmtype %val, i32 %mask) #0 {
+                        %res = call $llvmtype @__spirv_GroupNonUniformShuffleXor(i32 3, $llvmtype %val, i32 %mask)
+                        ret $llvmtype %res
+                    }
+                    attributes #0 = { alwaysinline }
+                    """, "entry",
+                ), $julia_type_str, Tuple{$julia_type_str, Int32}, x, Int32(mask)
+            )
         end
     end
 end

Copy link

codecov bot commented Aug 7, 2025

Codecov Report

✅ All modified and coverable lines are covered by tests.
✅ Project coverage is 78.86%. Comparing base (a387890) to head (6eac710).

Additional details and impacted files
@@           Coverage Diff           @@
##           master     #356   +/-   ##
=======================================
  Coverage   78.86%   78.86%           
=======================================
  Files          12       12           
  Lines         672      672           
=======================================
  Hits          530      530           
  Misses        142      142           

☔ View full report in Codecov by Sentry.
📢 Have feedback on the report? Share it here.

🚀 New features to boost your workflow:
  • ❄️ Test Analytics: Detect flaky tests, report on failures, and find test suite problems.

@maleadt
Copy link
Member

maleadt commented Aug 7, 2025

This matches the speed of the C implementation, so there seems to be no inherent overhead compared to OpenCL C:

That's great to hear, at least. I presume that the cartesian indexing introduced by the more complicated `mapreducedim!` kernel (or by KA.jl for all kernels) is going to incur a performance hit.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

2 participants