Skip to content

Commit 84b736f

Browse files
authored
Merge branch 'main' into vc/printf
2 parents 3356f64 + 242adcf commit 84b736f

File tree

15 files changed

+134
-82
lines changed

15 files changed

+134
-82
lines changed

.buildkite/pipeline.yml

Lines changed: 65 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,15 @@ steps:
1111
- JuliaCI/julia-coverage#v1:
1212
codecov: true
1313
command: |
14+
julia -e 'println("--- :julia: Developing CUDA")
15+
using Pkg
16+
Pkg.develop(; name="CUDA")
17+
Pkg.develop(; name="GPUArrays")'
18+
sed -i 's/^KernelAbstractions = "0\.9.*"/KernelAbstractions = "0.10"/' \${JULIA_DEPOT_PATH}/dev/CUDA/Project.toml
19+
sed -i 's/^KernelAbstractions = "0\.9.*"/KernelAbstractions = "0.10"/' \${JULIA_DEPOT_PATH}/dev/GPUArrays/Project.toml
1420
julia -e 'println("--- :julia: Instantiating project")
1521
using Pkg
16-
Pkg.develop(; path=pwd())
17-
Pkg.develop(; name="CUDA")' || exit 3
22+
Pkg.develop(; path=pwd())' || exit 3
1823
1924
julia -e 'println("+++ :julia: Running tests")
2025
using Pkg
@@ -25,38 +30,40 @@ steps:
2530
timeout_in_minutes: 120
2631
soft_fail:
2732
- exit_status: 3
33+
env:
34+
JULIA_PROJECT: "@cuda"
2835

29-
- label: "CUDA Enzyme Julia {{matrix.version}}"
30-
matrix:
31-
setup:
32-
version:
33-
- "1.10"
34-
- "1.11"
35-
plugins:
36-
- JuliaCI/julia#v1:
37-
version: "{{matrix.version}}"
38-
- JuliaCI/julia-coverage#v1:
39-
codecov: true
40-
command: |
41-
julia -e 'println("--- :julia: Instantiating project")
42-
using Pkg
43-
try
44-
Pkg.develop([PackageSpec(; path=pwd()), PackageSpec("Enzyme"), PackageSpec("EnzymeCore"), PackageSpec("CUDA")])
45-
catch err
46-
Pkg.develop(; path=pwd())
47-
Pkg.add(["CUDA", "Enzyme"])
48-
end' || exit 3
36+
# - label: "CUDA Enzyme Julia {{matrix.version}}"
37+
# matrix:
38+
# setup:
39+
# version:
40+
# - "1.10"
41+
# - "1.11"
42+
# plugins:
43+
# - JuliaCI/julia#v1:
44+
# version: "{{matrix.version}}"
45+
# - JuliaCI/julia-coverage#v1:
46+
# codecov: true
47+
# command: |
48+
# julia -e 'println("--- :julia: Instantiating project")
49+
# using Pkg
50+
# try
51+
# Pkg.develop([PackageSpec(; path=pwd()), PackageSpec("Enzyme"), PackageSpec("EnzymeCore"), PackageSpec("CUDA")])
52+
# catch err
53+
# Pkg.develop(; path=pwd())
54+
# Pkg.add(["CUDA", "Enzyme"])
55+
# end' || exit 3
4956

50-
julia -e 'println("+++ :julia: Running tests")
51-
using CUDA
52-
include("test/extensions/enzyme.jl")
53-
enzyme_testsuite(CUDABackend, CuArray, true)'
54-
agents:
55-
queue: "juliagpu"
56-
cuda: "*"
57-
timeout_in_minutes: 120
58-
soft_fail:
59-
- exit_status: 3
57+
# julia -e 'println("+++ :julia: Running tests")
58+
# using CUDA
59+
# include("test/extensions/enzyme.jl")
60+
# enzyme_testsuite(CUDABackend, CuArray, true)'
61+
# agents:
62+
# queue: "juliagpu"
63+
# cuda: "*"
64+
# timeout_in_minutes: 120
65+
# soft_fail:
66+
# - exit_status: 3
6067

6168
- label: "Metal Julia {{matrix.version}}"
6269
matrix:
@@ -70,10 +77,15 @@ steps:
7077
- JuliaCI/julia-coverage#v1:
7178
codecov: true
7279
command: |
80+
julia -e 'println("--- :julia: Developing Metal")
81+
using Pkg
82+
Pkg.develop(; name="Metal")
83+
Pkg.develop(; name="GPUArrays")'
84+
sed -i '' 's/^KernelAbstractions = "0\.9.*"/KernelAbstractions = "0.10"/' \${JULIA_DEPOT_PATH}/dev/Metal/Project.toml
85+
sed -i '' 's/^KernelAbstractions = "0\.9.*"/KernelAbstractions = "0.10"/' \${JULIA_DEPOT_PATH}/dev/GPUArrays/Project.toml
7386
julia -e 'println("--- :julia: Instantiating project")
7487
using Pkg
75-
Pkg.develop(; path=pwd())
76-
Pkg.develop(; name="Metal")' || exit 3
88+
Pkg.develop(; path=pwd())' || exit 3
7789
7890
julia -e 'println("+++ :julia: Running tests")
7991
using Pkg
@@ -85,6 +97,8 @@ steps:
8597
timeout_in_minutes: 120
8698
soft_fail:
8799
- exit_status: 3
100+
env:
101+
JULIA_PROJECT: "@metal"
88102

89103
- label: "oneAPI Julia {{matrix.version}}"
90104
matrix:
@@ -98,10 +112,15 @@ steps:
98112
- JuliaCI/julia-coverage#v1:
99113
codecov: true
100114
command: |
115+
julia -e 'println("--- :julia: Developing oneAPI")
116+
using Pkg
117+
Pkg.develop(; name="oneAPI")
118+
Pkg.develop(; name="GPUArrays")'
119+
sed -i 's/^KernelAbstractions = "0\.9.*"/KernelAbstractions = "0.10"/' \${JULIA_DEPOT_PATH}/dev/oneAPI/Project.toml
120+
sed -i 's/^KernelAbstractions = "0\.9.*"/KernelAbstractions = "0.10"/' \${JULIA_DEPOT_PATH}/dev/GPUArrays/Project.toml
101121
julia -e 'println("--- :julia: Instantiating project")
102122
using Pkg
103-
Pkg.develop(; path=pwd())
104-
Pkg.develop(; name="oneAPI")' || exit 3
123+
Pkg.develop(; path=pwd())' || exit 3
105124
106125
julia -e 'println("+++ :julia: Running tests")
107126
using Pkg
@@ -112,6 +131,8 @@ steps:
112131
timeout_in_minutes: 120
113132
soft_fail:
114133
- exit_status: 3
134+
env:
135+
JULIA_PROJECT: "@oneAPI"
115136

116137
- label: "AMDGPU Julia {{matrix.version}}"
117138
matrix:
@@ -125,10 +146,15 @@ steps:
125146
- JuliaCI/julia-coverage#v1:
126147
codecov: true
127148
command: |
149+
julia -e 'println("--- :julia: Developing AMDGPU")
150+
using Pkg
151+
Pkg.develop(; name="AMDGPU")
152+
Pkg.develop(; name="GPUArrays")'
153+
sed -i 's/^KernelAbstractions = "0\.9.*"/KernelAbstractions = "0.10"/' \${JULIA_DEPOT_PATH}/dev/AMDGPU/Project.toml
154+
sed -i 's/^KernelAbstractions = "0\.9.*"/KernelAbstractions = "0.10"/' \${JULIA_DEPOT_PATH}/dev/GPUArrays/Project.toml
128155
julia -e 'println("--- :julia: Instantiating project")
129156
using Pkg
130-
Pkg.develop(; path=pwd())
131-
Pkg.develop(; name="AMDGPU")' || exit 3
157+
Pkg.develop(; path=pwd())' || exit 3
132158
133159
julia -e 'println("+++ :julia: Running tests")
134160
using Pkg
@@ -140,6 +166,7 @@ steps:
140166
soft_fail:
141167
- exit_status: 3
142168
env:
169+
JULIA_PROJECT: "@amdgpu"
143170
JULIA_NUM_THREADS: 4
144171

145172
env:

Project.toml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ OpenCL_jll = "6cb37087-e8b6-5417-8430-1f242f1e46e4"
1414
PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
1515
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
1616
SPIRVIntrinsics = "71d1d633-e7e8-4a92-83a1-de8814b09ba8"
17+
SPIRV_LLVM_Backend_jll = "4376b9bf-cff8-51b6-bb48-39421dff0d0c"
18+
SPIRV_Tools_jll = "6ac6d60f-d740-5983-97d7-a4482c0689f4"
1719
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
1820
UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
1921
pocl_jll = "627d6b7a-bbe6-5189-83e7-98cc0a5aeadd"
@@ -32,13 +34,15 @@ SparseArraysExt = "SparseArrays"
3234
Adapt = "0.4, 1.0, 2.0, 3.0, 4"
3335
Atomix = "0.1, 1"
3436
EnzymeCore = "0.7, 0.8.1"
35-
GPUCompiler = "1"
37+
GPUCompiler = "1.2"
3638
InteractiveUtils = "1.6"
3739
LLVM = "9"
3840
LinearAlgebra = "1.6"
3941
MacroTools = "0.5"
4042
PrecompileTools = "1"
41-
SPIRVIntrinsics = "0.2.1"
43+
SPIRVIntrinsics = "0.3"
44+
SPIRV_LLVM_Backend_jll = "20"
45+
SPIRV_Tools_jll = "2024.4, 2025.1"
4246
SparseArrays = "<0.0.1, 1.6"
4347
StaticArrays = "0.12, 1.0"
4448
UUIDs = "<0.0.1, 1.6"

benchmark/benchmarks.jl

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,16 @@ using Random
99

1010
if !haskey(ENV, "KA_BACKEND")
1111
const BACKEND = CPU()
12+
const Ts = (Float32, Float64)
1213
else
1314
backend = ENV["KA_BACKEND"]
1415
if backend == "CPU"
1516
const BACKEND = CPU()
17+
const Ts = (Float32, Float64)
1618
elseif backend == "CUDA"
1719
using CUDA
1820
const BACKEND = CUDABackend()
21+
const Ts = (Float16, Float32, Float64)
1922
else
2023
error("Backend $backend not recognized")
2124
end
@@ -31,7 +34,7 @@ end
3134
SUITE["saxpy"] = BenchmarkGroup()
3235

3336
let static = BenchmarkGroup()
34-
for T in (Float16, Float32, Float64)
37+
for T in Ts
3538
dtype = BenchmarkGroup()
3639
for N in (64, 256, 512, 1024, 2048, 4096, 16384, 32768, 65536, 262144, 1048576)
3740
dtype[N] = @benchmarkable begin
@@ -49,7 +52,7 @@ let static = BenchmarkGroup()
4952
end
5053

5154
let default = BenchmarkGroup()
52-
for T in (Float16, Float32, Float64)
55+
for T in Ts
5356
dtype = BenchmarkGroup()
5457
for N in (64, 256, 512, 1024, 2048, 4096, 16384, 32768, 65536, 262144, 1048576)
5558
dtype[N] = @benchmarkable begin

examples/histogram.jl

Lines changed: 18 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -5,31 +5,29 @@ include(joinpath(dirname(pathof(KernelAbstractions)), "../examples/utils.jl")) #
55

66
# Function to use as a baseline for CPU metrics
77
function create_histogram(input)
8-
histogram_output = zeros(Int, maximum(input))
8+
histogram_output = zeros(eltype(input), maximum(input))
99
for i in input
1010
histogram_output[i] += 1
1111
end
1212
return histogram_output
1313
end
1414

1515
# This is a 1D histogram kernel where the histogramming happens on shmem
16-
@kernel function histogram_kernel!(histogram_output, input)
17-
tid = @index(Global, Linear)
16+
@kernel unsafe_indices = true function histogram_kernel!(histogram_output, input)
17+
gid = @index(Group, Linear)
1818
lid = @index(Local, Linear)
1919

20-
@uniform warpsize = Int(32)
21-
22-
@uniform gs = @groupsize()[1]
20+
@uniform gs = prod(@groupsize())
21+
tid = (gid - 1) * gs + lid
2322
@uniform N = length(histogram_output)
2423

25-
shared_histogram = @localmem Int (gs)
24+
shared_histogram = @localmem eltype(input) (gs)
2625

2726
# This will go through all input elements and assign them to a location in
2827
# shmem. Note that if there is not enough shmem, we create different shmem
2928
# blocks to write to. For example, if shmem is of size 256, but it's
3029
# possible to get a value of 312, then we will have 2 separate shmem blocks,
3130
# one from 1->256, and another from 257->512
32-
@uniform max_element = 1
3331
for min_element in 1:gs:N
3432

3533
# Setting shared_histogram to 0
@@ -42,7 +40,7 @@ end
4240
end
4341

4442
# Defining bin on shared memory and writing to it if possible
45-
bin = input[tid]
43+
bin = tid <= length(input) ? input[tid] : 0
4644
if bin >= min_element && bin < max_element
4745
bin -= min_element - 1
4846
@atomic shared_histogram[bin] += 1
@@ -58,10 +56,10 @@ end
5856

5957
end
6058

61-
function histogram!(histogram_output, input)
59+
function histogram!(histogram_output, input, groupsize = 256)
6260
backend = get_backend(histogram_output)
6361
# Need static block size
64-
kernel! = histogram_kernel!(backend, (256,))
62+
kernel! = histogram_kernel!(backend, (groupsize,))
6563
kernel!(histogram_output, input, ndrange = size(input))
6664
return
6765
end
@@ -74,9 +72,10 @@ function move(backend, input)
7472
end
7573

7674
@testset "histogram tests" begin
77-
rand_input = [rand(1:128) for i in 1:1000]
78-
linear_input = [i for i in 1:1024]
79-
all_two = [2 for i in 1:512]
75+
# Use Int32 as some backends don't support 64-bit atomics
76+
rand_input = Int32.(rand(1:128, 1000))
77+
linear_input = Int32.(1:1024)
78+
all_two = fill(Int32(2), 512)
8079

8180
histogram_rand_baseline = create_histogram(rand_input)
8281
histogram_linear_baseline = create_histogram(linear_input)
@@ -86,14 +85,14 @@ end
8685
linear_input = move(backend, linear_input)
8786
all_two = move(backend, all_two)
8887

89-
rand_histogram = KernelAbstractions.zeros(backend, Int, 128)
90-
linear_histogram = KernelAbstractions.zeros(backend, Int, 1024)
91-
two_histogram = KernelAbstractions.zeros(backend, Int, 2)
88+
rand_histogram = KernelAbstractions.zeros(backend, eltype(rand_input), Int(maximum(rand_input)))
89+
linear_histogram = KernelAbstractions.zeros(backend, eltype(linear_input), Int(maximum(linear_input)))
90+
two_histogram = KernelAbstractions.zeros(backend, eltype(all_two), Int(maximum(all_two)))
9291

93-
histogram!(rand_histogram, rand_input)
92+
histogram!(rand_histogram, rand_input, 6)
9493
histogram!(linear_histogram, linear_input)
9594
histogram!(two_histogram, all_two)
96-
KernelAbstractions.synchronize(CPU())
95+
KernelAbstractions.synchronize(backend)
9796

9897
@test isapprox(Array(rand_histogram), histogram_rand_baseline)
9998
@test isapprox(Array(linear_histogram), histogram_linear_baseline)

examples/memcopy.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@ function mycopy!(A, B)
1616
return
1717
end
1818

19-
A = KernelAbstractions.zeros(backend, Float64, 128, 128)
20-
B = KernelAbstractions.ones(backend, Float64, 128, 128)
19+
A = KernelAbstractions.zeros(backend, f_type, 128, 128)
20+
B = KernelAbstractions.ones(backend, f_type, 128, 128)
2121
mycopy!(A, B)
2222
KernelAbstractions.synchronize(backend)
2323
@test A == B

examples/memcopy_static.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@ function mycopy_static!(A, B)
1616
return
1717
end
1818

19-
A = KernelAbstractions.zeros(backend, Float64, 128, 128)
20-
B = KernelAbstractions.ones(backend, Float64, 128, 128)
19+
A = KernelAbstractions.zeros(backend, f_type, 128, 128)
20+
B = KernelAbstractions.ones(backend, f_type, 128, 128)
2121
mycopy_static!(A, B)
2222
KernelAbstractions.synchronize(backend)
2323
@test A == B

examples/performance.jl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ end
2626

2727
# Local memory variants
2828

29-
@kernel function lmem_copy_kernel!(
29+
@kernel unsafe_indices = true function lmem_copy_kernel!(
3030
output, @Const(input),
3131
::Val{BANK} = Val(1),
3232
) where {BANK}
@@ -46,7 +46,7 @@ end
4646
@inbounds output[I, J] = tile[i, j]
4747
end
4848

49-
@kernel function lmem_transpose_kernel!(
49+
@kernel unsafe_indices = true function lmem_transpose_kernel!(
5050
output, @Const(input),
5151
::Val{BANK} = Val(1),
5252
) where {BANK}
@@ -77,7 +77,7 @@ end
7777

7878
# Local Memory + process multiple elements per lane
7979

80-
@kernel function coalesced_copy_kernel!(
80+
@kernel unsafe_indices = true function coalesced_copy_kernel!(
8181
output, @Const(input),
8282
::Val{BANK} = Val(1),
8383
) where {BANK}
@@ -105,7 +105,7 @@ end
105105
end
106106
end
107107

108-
@kernel function coalesced_transpose_kernel!(
108+
@kernel unsafe_indices = true function coalesced_transpose_kernel!(
109109
output, @Const(input),
110110
::Val{BANK} = Val(1),
111111
) where {BANK}

examples/performant_matmul.jl

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,12 @@ using Test
44
using Random
55
include(joinpath(dirname(pathof(KernelAbstractions)), "../examples/utils.jl")) # Load backend
66

7-
const TILE_DIM = 32
7+
# We use a TILE_DIM of 16 as a safe value since while
8+
# most backends support up to 1024 threads per group,
9+
# Metal sometimes supports fewer.
10+
const TILE_DIM = 16
811

9-
@kernel function coalesced_matmul_kernel!(
12+
@kernel unsafe_indices = true function coalesced_matmul_kernel!(
1013
output, @Const(input1), @Const(input2), N, R, M,
1114
::Val{BANK} = Val(1),
1215
) where {BANK}

0 commit comments

Comments
 (0)