[Do not merge] Test KernelIntrinsics #2944
Open

christiangnrd wants to merge 18 commits into JuliaGPU:master from christiangnrd:intrinsics
+539 −486
Conversation
Your PR requires formatting changes to meet the project's style guidelines. The suggested changes are shown in the diff below.

diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl
index bf4c26902..1f58b822e 100644
--- a/src/CUDAKernels.jl
+++ b/src/CUDAKernels.jl
@@ -160,28 +160,30 @@ end
function KI.KIKernel(::CUDABackend, f, args...; kwargs...)
- kern = eval(quote
- @cuda launch=false $(kwargs...) $(f)($(args...))
- end)
- KI.KIKernel{CUDABackend, typeof(kern)}(CUDABackend(), kern)
+ kern = eval(
+ quote
+ @cuda launch = false $(kwargs...) $(f)($(args...))
+ end
+ )
+ return KI.KIKernel{CUDABackend, typeof(kern)}(CUDABackend(), kern)
end
-function (obj::KI.KIKernel{CUDABackend})(args...; numworkgroups=nothing, workgroupsize=nothing, kwargs...)
+function (obj::KI.KIKernel{CUDABackend})(args...; numworkgroups = nothing, workgroupsize = nothing, kwargs...)
threadsPerThreadgroup = isnothing(workgroupsize) ? 1 : workgroupsize
threadgroupsPerGrid = isnothing(numworkgroups) ? 1 : numworkgroups
- obj.kern(args...; threads=threadsPerThreadgroup, blocks=threadgroupsPerGrid, kwargs...)
+ return obj.kern(args...; threads = threadsPerThreadgroup, blocks = threadgroupsPerGrid, kwargs...)
end
-function KI.kernel_max_work_group_size(::CUDABackend, kikern::KI.KIKernel{<:CUDABackend}; max_work_items::Int=typemax(Int))::Int
- Int(min(kikern.kern.pipeline.maxTotalThreadsPerThreadgroup, max_work_items))
+function KI.kernel_max_work_group_size(::CUDABackend, kikern::KI.KIKernel{<:CUDABackend}; max_work_items::Int = typemax(Int))::Int
+ return Int(min(kikern.kern.pipeline.maxTotalThreadsPerThreadgroup, max_work_items))
end
function KI.max_work_group_size(::CUDABackend)::Int
- Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK))
+ return Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK))
end
function KI.multiprocessor_count(::CUDABackend)::Int
- Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
+ return Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
end
## indexing
@@ -197,7 +199,7 @@ end
end
@device_override @inline function KI.get_global_id()
- return (; x = Int((blockIdx().x-1)*blockDim().x + threadIdx().x), y = Int((blockIdx().y-1)*blockDim().y + threadIdx().y), z = Int((blockIdx().z-1)*blockDim().z + threadIdx().z))
+ return (; x = Int((blockIdx().x - 1) * blockDim().x + threadIdx().x), y = Int((blockIdx().y - 1) * blockDim().y + threadIdx().y), z = Int((blockIdx().z - 1) * blockDim().z + threadIdx().z))
end
@device_override @inline function KI.get_local_size()
diff --git a/src/accumulate.jl b/src/accumulate.jl
index 051ecc11e..d238bb8a2 100644
--- a/src/accumulate.jl
+++ b/src/accumulate.jl
@@ -22,9 +22,9 @@ function partial_scan(op::Function, output::AbstractArray{T}, input::AbstractArr
temp = CuDynamicSharedArray(T, (2*threads,))
# iterate the main dimension using threads and the first block dimension
- i = (KI.get_group_id().x-1i32) * KI.get_local_size().x + KI.get_local_id().x
+ i = (KI.get_group_id().x - 1i32) * KI.get_local_size().x + KI.get_local_id().x
# iterate the other dimensions using the remaining block dimensions
- j = (KI.get_group_id().z-1i32) * KI.get_num_groups().y + KI.get_group_id().y
+ j = (KI.get_group_id().z - 1i32) * KI.get_num_groups().y + KI.get_group_id().y
if j > length(Rother)
return
@@ -105,9 +105,9 @@ function aggregate_partial_scan(op::Function, output::AbstractArray,
block = KI.get_group_id().x
# iterate the main dimension using threads and the first block dimension
- i = (KI.get_group_id().x-1i32) * KI.get_local_size().x + KI.get_local_id().x
+ i = (KI.get_group_id().x - 1i32) * KI.get_local_size().x + KI.get_local_id().x
# iterate the other dimensions using the remaining block dimensions
- j = (KI.get_group_id().z-1i32) * KI.get_num_groups().y + KI.get_group_id().y
+ j = (KI.get_group_id().z - 1i32) * KI.get_num_groups().y + KI.get_group_id().y
@inbounds if i <= length(Rdim) && j <= length(Rother)
I = Rother[j]
diff --git a/src/device/random.jl b/src/device/random.jl
index 7d72d90a1..063c736ed 100644
--- a/src/device/random.jl
+++ b/src/device/random.jl
@@ -73,8 +73,8 @@ end
@inbounds global_random_counters()[warpId]
elseif field === :ctr2
globalId = KI.get_global_id().x +
- (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
- (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
+ (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
+ (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
globalId%UInt32
end::UInt32
end
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 97a4176b4..6fccff91e 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -294,8 +294,9 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
end
partial_kernel(f, op, init, Rreduce, Rother, Val(shuffle), partial, A;
- threads=partial_threads, blocks=partial_blocks, shmem=partial_shmem)
- # workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem)
+ threads = partial_threads, blocks = partial_blocks, shmem = partial_shmem
+ )
+ # workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem)
GPUArrays.mapreducedim!(identity, op, R, partial; init)
end
diff --git a/test/base/kernelabstractions.jl b/test/base/kernelabstractions.jl
index 2f2c4300b..1e674d3be 100644
--- a/test/base/kernelabstractions.jl
+++ b/test/base/kernelabstractions.jl
@@ -4,9 +4,14 @@ using SparseArrays
include(joinpath(dirname(pathof(KernelAbstractions)), "..", "test", "testsuite.jl"))
-Testsuite.testsuite(()->CUDABackend(false, false), "CUDA", CUDA, CuArray, CuDeviceArray; skip_tests=Set([
- "CPU synchronization",
- "fallback test: callable types",]))
+Testsuite.testsuite(
+ () -> CUDABackend(false, false), "CUDA", CUDA, CuArray, CuDeviceArray; skip_tests = Set(
+ [
+ "CPU synchronization",
+ "fallback test: callable types",
+ ]
+ )
+)
for (PreferBlocks, AlwaysInline) in Iterators.product((true, false), (true, false))
Testsuite.unittest_testsuite(()->CUDABackend(PreferBlocks, AlwaysInline), "CUDA", CUDA, CuDeviceArray)
end
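
For context, here is a minimal usage sketch of the KI.KIKernel path this PR adds to CUDAKernels.jl, as shown in the diff above. The kernel `vadd!`, the array sizes, and the workgroup size of 256 are illustrative assumptions, and the exact module providing `KI` depends on the KernelIntrinsics branch this PR builds against.

```julia
using CUDA
# `KI` is assumed to be the KernelIntrinsics module imported by src/CUDAKernels.jl;
# the exact import path depends on the branch this PR targets.

# Illustrative kernel: 1-based global index via the KI.get_global_id() override above.
function vadd!(c, a, b)
    i = KI.get_global_id().x
    if i <= length(c)
        @inbounds c[i] = a[i] + b[i]
    end
    return
end

N = 1024
a, b, c = CUDA.rand(N), CUDA.rand(N), CUDA.zeros(N)

# Compile without launching (wraps `@cuda launch=false`), then launch with
# KI-style keywords, which the CUDABackend overload maps to `threads`/`blocks`.
kern = KI.KIKernel(CUDABackend(), vadd!, c, a, b)
kern(c, a, b; workgroupsize = 256, numworkgroups = cld(N, 256))
```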
CUDA.jl Benchmarks
| Benchmark suite | Current: 59ae406 | Previous: f4c05e0 | Ratio |
| --- | --- | --- | --- |
| latency/precompile | 64867251992 ns | 56743162658.5 ns | 1.14 |
| latency/ttfp | 8488265280 ns | 8292887489.5 ns | 1.02 |
| latency/import | 4665098033 ns | 4493784612 ns | 1.04 |
| integration/volumerhs | 9614512.5 ns | 9612835.5 ns | 1.00 |
| integration/byval/slices=1 | 147155 ns | 146961 ns | 1.00 |
| integration/byval/slices=3 | 426101 ns | 425977 ns | 1.00 |
| integration/byval/reference | 145147 ns | 145162 ns | 1.00 |
| integration/byval/slices=2 | 286469 ns | 286531 ns | 1.00 |
| integration/cudadevrt | 103573 ns | 103664 ns | 1.00 |
| kernel/indexing | 14394 ns | 14225 ns | 1.01 |
| kernel/indexing_checked | 15187 ns | 14963.5 ns | 1.01 |
| kernel/occupancy | 689.5761589403974 ns | 712.5909090909091 ns | 0.97 |
| kernel/launch | 2201.277777777778 ns | 2140.1111111111113 ns | 1.03 |
| kernel/rand | 15651 ns | 17014 ns | 0.92 |
| array/reverse/1d | 20362 ns | 19857 ns | 1.03 |
| array/reverse/2dL_inplace | 66820 ns | 66720 ns | 1.00 |
| array/reverse/1dL | 70606 ns | 70068 ns | 1.01 |
| array/reverse/2d | 22290 ns | 21721 ns | 1.03 |
| array/reverse/1d_inplace | 9653 ns | 11535 ns | 0.84 |
| array/reverse/2d_inplace | 13308 ns | 13153 ns | 1.01 |
| array/reverse/2dL | 74188 ns | 73755 ns | 1.01 |
| array/reverse/1dL_inplace | 66833 ns | 66862 ns | 1.00 |
| array/copy | 21027 ns | 20647 ns | 1.02 |
| array/iteration/findall/int | 161830.5 ns | 158235 ns | 1.02 |
| array/iteration/findall/bool | 143572.5 ns | 139770.5 ns | 1.03 |
| array/iteration/findfirst/int | 162231.5 ns | 161047 ns | 1.01 |
| array/iteration/findfirst/bool | 162714 ns | 162113 ns | 1.00 |
| array/iteration/scalar | 71911.5 ns | 73378 ns | 0.98 |
| array/iteration/logical | 221153.5 ns | 216537 ns | 1.02 |
| array/iteration/findmin/1d | 51301 ns | 50322 ns | 1.02 |
| array/iteration/findmin/2d | 99173 ns | 96281.5 ns | 1.03 |
| array/reductions/reduce/Int64/1d | 49212.5 ns | 43275 ns | 1.14 |
| array/reductions/reduce/Int64/dims=1 | 49611 ns | 44878 ns | 1.11 |
| array/reductions/reduce/Int64/dims=2 | 69591 ns | 61376 ns | 1.13 |
| array/reductions/reduce/Int64/dims=1L | 89561 ns | 89018 ns | 1.01 |
| array/reductions/reduce/Int64/dims=2L | 90935 ns | 87717 ns | 1.04 |
| array/reductions/reduce/Float32/1d | 38238 ns | 36706 ns | 1.04 |
| array/reductions/reduce/Float32/dims=1 | 46015 ns | 41841.5 ns | 1.10 |
| array/reductions/reduce/Float32/dims=2 | 64996 ns | 59890 ns | 1.09 |
| array/reductions/reduce/Float32/dims=1L | 54055 ns | 52369 ns | 1.03 |
| array/reductions/reduce/Float32/dims=2L | 74319 ns | 71845 ns | 1.03 |
| array/reductions/mapreduce/Int64/1d | 48754.5 ns | 43034 ns | 1.13 |
| array/reductions/mapreduce/Int64/dims=1 | 50284 ns | 44568 ns | 1.13 |
| array/reductions/mapreduce/Int64/dims=2 | 69357 ns | 61598 ns | 1.13 |
| array/reductions/mapreduce/Int64/dims=1L | 89505 ns | 88831 ns | 1.01 |
| array/reductions/mapreduce/Int64/dims=2L | 90953 ns | 88197 ns | 1.03 |
| array/reductions/mapreduce/Float32/1d | 37840 ns | 36550 ns | 1.04 |
| array/reductions/mapreduce/Float32/dims=1 | 52225 ns | 51845 ns | 1.01 |
| array/reductions/mapreduce/Float32/dims=2 | 63376 ns | 60046 ns | 1.06 |
| array/reductions/mapreduce/Float32/dims=1L | 54126 ns | 52895 ns | 1.02 |
| array/reductions/mapreduce/Float32/dims=2L | 73676 ns | 72274 ns | 1.02 |
| array/broadcast | 54517 ns | 20228 ns | 2.70 |
| array/copyto!/gpu_to_gpu | 11511 ns | 12997 ns | 0.89 |
| array/copyto!/cpu_to_gpu | 216507 ns | 214588 ns | 1.01 |
| array/copyto!/gpu_to_cpu | 283882.5 ns | 283061 ns | 1.00 |
| array/accumulate/Int64/1d | 128326.5 ns | 124766 ns | 1.03 |
| array/accumulate/Int64/dims=1 | 85990 ns | 83121 ns | 1.03 |
| array/accumulate/Int64/dims=2 | 159734.5 ns | 157489 ns | 1.01 |
| array/accumulate/Int64/dims=1L | 1784941 ns | 1708744 ns | 1.04 |
| array/accumulate/Int64/dims=2L | 973898.5 ns | 966369 ns | 1.01 |
| array/accumulate/Float32/1d | 112758 ns | 109029 ns | 1.03 |
| array/accumulate/Float32/dims=1 | 82440 ns | 80115 ns | 1.03 |
| array/accumulate/Float32/dims=2 | 150339 ns | 147066 ns | 1.02 |
| array/accumulate/Float32/dims=1L | 1704737 ns | 1617852.5 ns | 1.05 |
| array/accumulate/Float32/dims=2L | 713158 ns | 697700.5 ns | 1.02 |
| array/construct | 1308.3 ns | 1284.9 ns | 1.02 |
| array/random/randn/Float32 | 45664.5 ns | 44088.5 ns | 1.04 |
| array/random/randn!/Float32 | 25506 ns | 24724 ns | 1.03 |
| array/random/rand!/Int64 | 27329 ns | 27197 ns | 1.00 |
| array/random/rand!/Float32 | 8754.333333333334 ns | 8847.666666666666 ns | 0.99 |
| array/random/rand/Int64 | 31225 ns | 29769 ns | 1.05 |
| array/random/rand/Float32 | 13551 ns | 13169 ns | 1.03 |
| array/permutedims/4d | 63126 ns | 60066.5 ns | 1.05 |
| array/permutedims/2d | 54627.5 ns | 53803 ns | 1.02 |
| array/permutedims/3d | 57848 ns | 54690 ns | 1.06 |
| array/sorting/1d | 2758695 ns | 2756717 ns | 1.00 |
| array/sorting/by | 3345282 ns | 3343987 ns | 1.00 |
| array/sorting/2d | 1081541 ns | 1080056.5 ns | 1.00 |
| cuda/synchronization/stream/auto | 1028.5 ns | 1028.4 ns | 1.00 |
| cuda/synchronization/stream/nonblocking | 8041 ns | 7619.4 ns | 1.06 |
| cuda/synchronization/stream/blocking | 815.3011363636363 ns | 806.3333333333334 ns | 1.01 |
| cuda/synchronization/context/auto | 1172 ns | 1172.7 ns | 1.00 |
| cuda/synchronization/context/nonblocking | 7083.6 ns | 7177 ns | 0.99 |
| cuda/synchronization/context/blocking | 911.0350877192982 ns | 911.1923076923077 ns | 1.00 |
This comment was automatically generated by workflow using github-action-benchmark.
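
As a quick sanity check on reading the table, the Ratio column is presumably the current time divided by the previous time; for example, for the first row (this calculation is not part of the workflow output):

```julia
64867251992 / 56743162658.5   # ≈ 1.14 (latency/precompile)
```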
[only tests]
[only benchmarks]