[Do not merge] Test KernelIntrinsics #2944
Open

christiangnrd wants to merge 18 commits into JuliaGPU:master from christiangnrd:intrinsics
+539 −486
Conversation
Your PR requires formatting changes to meet the project's style guidelines. The suggested changes are shown in the diff below.

diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl
index bf4c26902..1f58b822e 100644
--- a/src/CUDAKernels.jl
+++ b/src/CUDAKernels.jl
@@ -160,28 +160,30 @@ end
function KI.KIKernel(::CUDABackend, f, args...; kwargs...)
- kern = eval(quote
- @cuda launch=false $(kwargs...) $(f)($(args...))
- end)
- KI.KIKernel{CUDABackend, typeof(kern)}(CUDABackend(), kern)
+ kern = eval(
+ quote
+ @cuda launch = false $(kwargs...) $(f)($(args...))
+ end
+ )
+ return KI.KIKernel{CUDABackend, typeof(kern)}(CUDABackend(), kern)
end
-function (obj::KI.KIKernel{CUDABackend})(args...; numworkgroups=nothing, workgroupsize=nothing, kwargs...)
+function (obj::KI.KIKernel{CUDABackend})(args...; numworkgroups = nothing, workgroupsize = nothing, kwargs...)
threadsPerThreadgroup = isnothing(workgroupsize) ? 1 : workgroupsize
threadgroupsPerGrid = isnothing(numworkgroups) ? 1 : numworkgroups
- obj.kern(args...; threads=threadsPerThreadgroup, blocks=threadgroupsPerGrid, kwargs...)
+ return obj.kern(args...; threads = threadsPerThreadgroup, blocks = threadgroupsPerGrid, kwargs...)
end
-function KI.kernel_max_work_group_size(::CUDABackend, kikern::KI.KIKernel{<:CUDABackend}; max_work_items::Int=typemax(Int))::Int
- Int(min(kikern.kern.pipeline.maxTotalThreadsPerThreadgroup, max_work_items))
+function KI.kernel_max_work_group_size(::CUDABackend, kikern::KI.KIKernel{<:CUDABackend}; max_work_items::Int = typemax(Int))::Int
+ return Int(min(kikern.kern.pipeline.maxTotalThreadsPerThreadgroup, max_work_items))
end
function KI.max_work_group_size(::CUDABackend)::Int
- Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK))
+ return Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK))
end
function KI.multiprocessor_count(::CUDABackend)::Int
- Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
+ return Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
end
## indexing
@@ -197,7 +199,7 @@ end
end
@device_override @inline function KI.get_global_id()
- return (; x = Int((blockIdx().x-1)*blockDim().x + threadIdx().x), y = Int((blockIdx().y-1)*blockDim().y + threadIdx().y), z = Int((blockIdx().z-1)*blockDim().z + threadIdx().z))
+ return (; x = Int((blockIdx().x - 1) * blockDim().x + threadIdx().x), y = Int((blockIdx().y - 1) * blockDim().y + threadIdx().y), z = Int((blockIdx().z - 1) * blockDim().z + threadIdx().z))
end
@device_override @inline function KI.get_local_size()
diff --git a/src/accumulate.jl b/src/accumulate.jl
index 051ecc11e..d238bb8a2 100644
--- a/src/accumulate.jl
+++ b/src/accumulate.jl
@@ -22,9 +22,9 @@ function partial_scan(op::Function, output::AbstractArray{T}, input::AbstractArr
temp = CuDynamicSharedArray(T, (2*threads,))
# iterate the main dimension using threads and the first block dimension
- i = (KI.get_group_id().x-1i32) * KI.get_local_size().x + KI.get_local_id().x
+ i = (KI.get_group_id().x - 1i32) * KI.get_local_size().x + KI.get_local_id().x
# iterate the other dimensions using the remaining block dimensions
- j = (KI.get_group_id().z-1i32) * KI.get_num_groups().y + KI.get_group_id().y
+ j = (KI.get_group_id().z - 1i32) * KI.get_num_groups().y + KI.get_group_id().y
if j > length(Rother)
return
@@ -105,9 +105,9 @@ function aggregate_partial_scan(op::Function, output::AbstractArray,
block = KI.get_group_id().x
# iterate the main dimension using threads and the first block dimension
- i = (KI.get_group_id().x-1i32) * KI.get_local_size().x + KI.get_local_id().x
+ i = (KI.get_group_id().x - 1i32) * KI.get_local_size().x + KI.get_local_id().x
# iterate the other dimensions using the remaining block dimensions
- j = (KI.get_group_id().z-1i32) * KI.get_num_groups().y + KI.get_group_id().y
+ j = (KI.get_group_id().z - 1i32) * KI.get_num_groups().y + KI.get_group_id().y
@inbounds if i <= length(Rdim) && j <= length(Rother)
I = Rother[j]
diff --git a/src/device/random.jl b/src/device/random.jl
index 7d72d90a1..063c736ed 100644
--- a/src/device/random.jl
+++ b/src/device/random.jl
@@ -73,8 +73,8 @@ end
@inbounds global_random_counters()[warpId]
elseif field === :ctr2
globalId = KI.get_global_id().x +
- (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
- (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
+ (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
+ (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
globalId%UInt32
end::UInt32
end
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 97a4176b4..6fccff91e 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -294,8 +294,9 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
end
partial_kernel(f, op, init, Rreduce, Rother, Val(shuffle), partial, A;
- threads=partial_threads, blocks=partial_blocks, shmem=partial_shmem)
- # workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem)
+ threads = partial_threads, blocks = partial_blocks, shmem = partial_shmem
+ )
+ # workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem)
GPUArrays.mapreducedim!(identity, op, R, partial; init)
end
diff --git a/test/base/kernelabstractions.jl b/test/base/kernelabstractions.jl
index 2f2c4300b..1e674d3be 100644
--- a/test/base/kernelabstractions.jl
+++ b/test/base/kernelabstractions.jl
@@ -4,9 +4,14 @@ using SparseArrays
include(joinpath(dirname(pathof(KernelAbstractions)), "..", "test", "testsuite.jl"))
-Testsuite.testsuite(()->CUDABackend(false, false), "CUDA", CUDA, CuArray, CuDeviceArray; skip_tests=Set([
- "CPU synchronization",
- "fallback test: callable types",]))
+Testsuite.testsuite(
+ () -> CUDABackend(false, false), "CUDA", CUDA, CuArray, CuDeviceArray; skip_tests = Set(
+ [
+ "CPU synchronization",
+ "fallback test: callable types",
+ ]
+ )
+)
for (PreferBlocks, AlwaysInline) in Iterators.product((true, false), (true, false))
Testsuite.unittest_testsuite(()->CUDABackend(PreferBlocks, AlwaysInline), "CUDA", CUDA, CuDeviceArray)
end
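
For context, here is a minimal usage sketch of the KI.KIKernel path this PR adds to CUDAKernels.jl, as shown in the diff above. The kernel `vadd!`, the array sizes, and the workgroup size of 256 are illustrative assumptions, and the exact module providing `KI` depends on the KernelIntrinsics branch this PR builds against.

```julia
using CUDA
# `KI` is assumed to be the KernelIntrinsics module imported by src/CUDAKernels.jl;
# the exact import path depends on the branch this PR targets.

# Illustrative kernel: 1-based global index via the KI.get_global_id() override above.
function vadd!(c, a, b)
    i = KI.get_global_id().x
    if i <= length(c)
        @inbounds c[i] = a[i] + b[i]
    end
    return
end

N = 1024
a, b, c = CUDA.rand(N), CUDA.rand(N), CUDA.zeros(N)

# Compile without launching (wraps `@cuda launch=false`), then launch with
# KI-style keywords, which the CUDABackend overload maps to `threads`/`blocks`.
kern = KI.KIKernel(CUDABackend(), vadd!, c, a, b)
kern(c, a, b; workgroupsize = 256, numworkgroups = cld(N, 256))
```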
CUDA.jl Benchmarks
| Benchmark suite | Current: 59ae406 | Previous: f4c05e0 | Ratio |
| --- | --- | --- | --- |
| latency/precompile | 64867251992 ns | 56743162658.5 ns | 1.14 |
| latency/ttfp | 8488265280 ns | 8292887489.5 ns | 1.02 |
| latency/import | 4665098033 ns | 4493784612 ns | 1.04 |
| integration/volumerhs | 9614512.5 ns | 9612835.5 ns | 1.00 |
| integration/byval/slices=1 | 147155 ns | 146961 ns | 1.00 |
| integration/byval/slices=3 | 426101 ns | 425977 ns | 1.00 |
| integration/byval/reference | 145147 ns | 145162 ns | 1.00 |
| integration/byval/slices=2 | 286469 ns | 286531 ns | 1.00 |
| integration/cudadevrt | 103573 ns | 103664 ns | 1.00 |
| kernel/indexing | 14394 ns | 14225 ns | 1.01 |
| kernel/indexing_checked | 15187 ns | 14963.5 ns | 1.01 |
| kernel/occupancy | 689.5761589403974 ns | 712.5909090909091 ns | 0.97 |
| kernel/launch | 2201.277777777778 ns | 2140.1111111111113 ns | 1.03 |
| kernel/rand | 15651 ns | 17014 ns | 0.92 |
| array/reverse/1d | 20362 ns | 19857 ns | 1.03 |
| array/reverse/2dL_inplace | 66820 ns | 66720 ns | 1.00 |
| array/reverse/1dL | 70606 ns | 70068 ns | 1.01 |
| array/reverse/2d | 22290 ns | 21721 ns | 1.03 |
| array/reverse/1d_inplace | 9653 ns | 11535 ns | 0.84 |
| array/reverse/2d_inplace | 13308 ns | 13153 ns | 1.01 |
| array/reverse/2dL | 74188 ns | 73755 ns | 1.01 |
| array/reverse/1dL_inplace | 66833 ns | 66862 ns | 1.00 |
| array/copy | 21027 ns | 20647 ns | 1.02 |
| array/iteration/findall/int | 161830.5 ns | 158235 ns | 1.02 |
| array/iteration/findall/bool | 143572.5 ns | 139770.5 ns | 1.03 |
| array/iteration/findfirst/int | 162231.5 ns | 161047 ns | 1.01 |
| array/iteration/findfirst/bool | 162714 ns | 162113 ns | 1.00 |
| array/iteration/scalar | 71911.5 ns | 73378 ns | 0.98 |
| array/iteration/logical | 221153.5 ns | 216537 ns | 1.02 |
| array/iteration/findmin/1d | 51301 ns | 50322 ns | 1.02 |
| array/iteration/findmin/2d | 99173 ns | 96281.5 ns | 1.03 |
| array/reductions/reduce/Int64/1d | 49212.5 ns | 43275 ns | 1.14 |
| array/reductions/reduce/Int64/dims=1 | 49611 ns | 44878 ns | 1.11 |
| array/reductions/reduce/Int64/dims=2 | 69591 ns | 61376 ns | 1.13 |
| array/reductions/reduce/Int64/dims=1L | 89561 ns | 89018 ns | 1.01 |
| array/reductions/reduce/Int64/dims=2L | 90935 ns | 87717 ns | 1.04 |
| array/reductions/reduce/Float32/1d | 38238 ns | 36706 ns | 1.04 |
| array/reductions/reduce/Float32/dims=1 | 46015 ns | 41841.5 ns | 1.10 |
| array/reductions/reduce/Float32/dims=2 | 64996 ns | 59890 ns | 1.09 |
| array/reductions/reduce/Float32/dims=1L | 54055 ns | 52369 ns | 1.03 |
| array/reductions/reduce/Float32/dims=2L | 74319 ns | 71845 ns | 1.03 |
| array/reductions/mapreduce/Int64/1d | 48754.5 ns | 43034 ns | 1.13 |
| array/reductions/mapreduce/Int64/dims=1 | 50284 ns | 44568 ns | 1.13 |
| array/reductions/mapreduce/Int64/dims=2 | 69357 ns | 61598 ns | 1.13 |
| array/reductions/mapreduce/Int64/dims=1L | 89505 ns | 88831 ns | 1.01 |
| array/reductions/mapreduce/Int64/dims=2L | 90953 ns | 88197 ns | 1.03 |
| array/reductions/mapreduce/Float32/1d | 37840 ns | 36550 ns | 1.04 |
| array/reductions/mapreduce/Float32/dims=1 | 52225 ns | 51845 ns | 1.01 |
| array/reductions/mapreduce/Float32/dims=2 | 63376 ns | 60046 ns | 1.06 |
| array/reductions/mapreduce/Float32/dims=1L | 54126 ns | 52895 ns | 1.02 |
| array/reductions/mapreduce/Float32/dims=2L | 73676 ns | 72274 ns | 1.02 |
| array/broadcast | 54517 ns | 20228 ns | 2.70 |
| array/copyto!/gpu_to_gpu | 11511 ns | 12997 ns | 0.89 |
| array/copyto!/cpu_to_gpu | 216507 ns | 214588 ns | 1.01 |
| array/copyto!/gpu_to_cpu | 283882.5 ns | 283061 ns | 1.00 |
| array/accumulate/Int64/1d | 128326.5 ns | 124766 ns | 1.03 |
| array/accumulate/Int64/dims=1 | 85990 ns | 83121 ns | 1.03 |
| array/accumulate/Int64/dims=2 | 159734.5 ns | 157489 ns | 1.01 |
| array/accumulate/Int64/dims=1L | 1784941 ns | 1708744 ns | 1.04 |
| array/accumulate/Int64/dims=2L | 973898.5 ns | 966369 ns | 1.01 |
| array/accumulate/Float32/1d | 112758 ns | 109029 ns | 1.03 |
| array/accumulate/Float32/dims=1 | 82440 ns | 80115 ns | 1.03 |
| array/accumulate/Float32/dims=2 | 150339 ns | 147066 ns | 1.02 |
| array/accumulate/Float32/dims=1L | 1704737 ns | 1617852.5 ns | 1.05 |
| array/accumulate/Float32/dims=2L | 713158 ns | 697700.5 ns | 1.02 |
| array/construct | 1308.3 ns | 1284.9 ns | 1.02 |
| array/random/randn/Float32 | 45664.5 ns | 44088.5 ns | 1.04 |
| array/random/randn!/Float32 | 25506 ns | 24724 ns | 1.03 |
| array/random/rand!/Int64 | 27329 ns | 27197 ns | 1.00 |
| array/random/rand!/Float32 | 8754.333333333334 ns | 8847.666666666666 ns | 0.99 |
| array/random/rand/Int64 | 31225 ns | 29769 ns | 1.05 |
| array/random/rand/Float32 | 13551 ns | 13169 ns | 1.03 |
| array/permutedims/4d | 63126 ns | 60066.5 ns | 1.05 |
| array/permutedims/2d | 54627.5 ns | 53803 ns | 1.02 |
| array/permutedims/3d | 57848 ns | 54690 ns | 1.06 |
| array/sorting/1d | 2758695 ns | 2756717 ns | 1.00 |
| array/sorting/by | 3345282 ns | 3343987 ns | 1.00 |
| array/sorting/2d | 1081541 ns | 1080056.5 ns | 1.00 |
| cuda/synchronization/stream/auto | 1028.5 ns | 1028.4 ns | 1.00 |
| cuda/synchronization/stream/nonblocking | 8041 ns | 7619.4 ns | 1.06 |
| cuda/synchronization/stream/blocking | 815.3011363636363 ns | 806.3333333333334 ns | 1.01 |
| cuda/synchronization/context/auto | 1172 ns | 1172.7 ns | 1.00 |
| cuda/synchronization/context/nonblocking | 7083.6 ns | 7177 ns | 0.99 |
| cuda/synchronization/context/blocking | 911.0350877192982 ns | 911.1923076923077 ns | 1.00 |
This comment was automatically generated by workflow using github-action-benchmark.
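
As a quick sanity check on reading the table, the Ratio column is presumably the current time divided by the previous time; for example, for the first row (this calculation is not part of the workflow output):

```julia
64867251992 / 56743162658.5   # ≈ 1.14 (latency/precompile)
```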
[only tests]
[only benchmarks]