Skip to content

Commit 38ba567

Browse files
Always use 3d indexing intrinsics (#672)
Co-authored-by: Tim Besard <[email protected]>
1 parent e49045c commit 38ba567

26 files changed

+169
-172
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ are supported, etc):
111111

112112
```julia-repl
113113
julia> function vadd(a, b, c)
114-
i = thread_position_in_grid_1d()
114+
i = thread_position_in_grid().x
115115
c[i] = a[i] + b[i]
116116
return
117117
end

docs/src/api/kernel.md

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,19 +13,19 @@ This is made possible by interfacing with the Metal libraries by wrapping a subs
1313
thread_index_in_quadgroup
1414
thread_index_in_simdgroup
1515
thread_index_in_threadgroup
16-
thread_position_in_grid_1d
17-
thread_position_in_threadgroup_1d
18-
threadgroup_position_in_grid_1d
19-
threadgroups_per_grid_1d
20-
threads_per_grid_1d
16+
thread_position_in_grid
17+
thread_position_in_threadgroup
18+
threadgroup_position_in_grid
19+
threadgroups_per_grid
20+
threads_per_grid
2121
threads_per_simdgroup
22-
threads_per_threadgroup_1d
22+
threads_per_threadgroup
2323
simdgroups_per_threadgroup
2424
simdgroup_index_in_threadgroup
2525
quadgroup_index_in_threadgroup
2626
quadgroups_per_threadgroup
27-
grid_size_1d
28-
grid_origin_1d
27+
grid_size
28+
grid_origin
2929
thread_execution_width
3030
```
3131

docs/src/profiling.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ The resulting trace can be opened with the Instruments app, part of Xcode.
3434
julia> using Metal
3535

3636
julia> function vadd(a, b, c)
37-
i = thread_position_in_grid_1d()
37+
i = thread_position_in_grid().x
3838
c[i] = a[i] + b[i]
3939
return
4040
end
@@ -78,7 +78,7 @@ $ METAL_CAPTURE_ENABLED=1 julia
7878
julia> using Metal
7979

8080
julia> function vadd(a, b, c)
81-
i = thread_position_in_grid_1d()
81+
i = thread_position_in_grid().x
8282
c[i] = a[i] + b[i]
8383
return
8484
end

docs/src/usage/kernel.md

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -43,21 +43,26 @@ also query what the grid and threadgroup sizes are as well.
4343
For Metal.jl, these values are accessed via the following functions:
4444

4545
- `thread_index_in_threadgroup()`
46-
- `grid_size_Xd()`
47-
- `thread_position_in_grid_Xd()`
48-
- `thread_position_in_threadgroup_Xd()`
49-
- `threadgroup_position_in_grid_Xd()`
50-
- `threadgroups_per_grid_Xd()`
51-
- `threads_per_grid_Xd()`
52-
- `threads_per_threadgroup_Xd()`
46+
- `grid_size()`
47+
- `thread_position_in_grid()`
48+
- `thread_position_in_threadgroup()`
49+
- `threadgroup_position_in_grid()`
50+
- `threadgroups_per_grid()`
51+
- `threads_per_grid()`
52+
- `threads_per_threadgroup()`
5353

54-
*Where 'X' is 1, 2, or 3 according to the number of dimensions requested.*
54+
!!! note
55+
Prior to Metal.jl v1.9, the aforementioned indexing intrinsics had a `_Xd` suffix,
56+
where 'X' was 1, 2, or 3 according to the number of dimensions requested.
57+
58+
These suffixed methods are deprecated; use the versions without a suffix instead,
59+
which behave like the `_3d` versions.
5560

5661
Using these in a kernel (taken directly from the [vadd example](https://github.com/JuliaGPU/Metal.jl/blob/main/examples/vadd.jl)):
5762

5863
```julia
5964
function vadd(a, b, c)
60-
i = thread_position_in_grid_1d()
65+
i = thread_position_in_grid().x
6166
c[i] = a[i] + b[i]
6267
return
6368
end

examples/gtk.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ end
1919

2020
function generate(img, pos)
2121
r, c = Int32.(size(img))
22-
i,j = thread_position_in_grid_2d()
22+
i,j,_ = thread_position_in_grid()
2323
@inbounds if i <= r && j <= c
2424
img[i,j] = pos < j < pos + 10 ? colorant"red" : colorant"thistle"
2525
end

examples/peakflops.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ using Metal
22
using BenchmarkTools
33

44
function kernel_fma(a, b, c, out)
5-
i = thread_position_in_grid_1d()
5+
i = thread_position_in_grid().x
66
a_val = a[i]
77
b_val = b[i]
88
c_val = c[i]

examples/unified_memory.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ using LinearAlgebra
1515
# This document is meant to showcase potential use cases allowed by unified memory.
1616

1717
function simple_kernel(arr)
18-
idx = thread_position_in_grid_1d()
18+
idx = thread_position_in_grid().x
1919
arr[idx] = cos(arr[idx])
2020
return
2121
end
@@ -60,7 +60,7 @@ svd(arr_cpu)
6060
# TODO: Come up with simultaneous launch of GPU and CPU work that results in undesired behavior
6161

6262
function long_kernel(arr, dummy)
63-
idx = thread_position_in_grid_1d()
63+
idx = thread_position_in_grid().x
6464
for i in 1:100000
6565
dummy[1] += Float32(0.3)
6666
end

examples/vadd.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ using Test
22
using Metal
33

44
function vadd(a, b, c)
5-
i = thread_position_in_grid_1d()
5+
i = thread_position_in_grid().x
66
c[i] = a[i] + b[i]
77
return
88
end

perf/byval.jl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ const threads = 256
66

77
# simple add matrices kernel
88
function kernel_add_mat(n, x1, x2, y)
9-
i = thread_position_in_grid_1d()
9+
i = thread_position_in_grid().x
1010
if i <= n
1111
@inbounds y[i] = x1[i] + x2[i]
1212
end
@@ -19,8 +19,8 @@ end
1919

2020
# add arrays of matrices kernel
2121
function kernel_add_mat_z_slices(n, vararg...)
22-
x1, x2, y = get_inputs3(threadgroup_position_in_grid_2d().y, vararg...)
23-
i = thread_position_in_grid_1d()
22+
x1, x2, y = get_inputs3(threadgroup_position_in_grid().y, vararg...)
23+
i = thread_position_in_grid().x
2424
if i <= n
2525
@inbounds y[i] = x1[i] + x2[i]
2626
end

perf/kernel.jl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,22 +13,22 @@ group["launch"] = @benchmarkable @metal identity(nothing)
1313
src = Metal.rand(Float32, 512, 1000)
1414
dest = similar(src)
1515
function indexing_kernel(dest, src)
16-
i = thread_position_in_grid_1d()
16+
i = thread_position_in_grid().x
1717
@inbounds dest[i] = src[i]
1818
return
1919
end
2020
group["indexing"] = @async_benchmarkable @metal threads=size(src,1) groups=size(src,2) $indexing_kernel($dest, $src)
2121

2222
function checked_indexing_kernel(dest, src)
23-
i = thread_position_in_grid_1d()
23+
i = thread_position_in_grid().x
2424
dest[i] = src[i]
2525
return
2626
end
2727
group["indexing_checked"] = @async_benchmarkable @metal threads=size(src,1) groups=size(src,2) $checked_indexing_kernel($dest, $src)
2828

2929
## DELETE
3030
# function rand_kernel(dest::AbstractArray{T}) where {T}
31-
# i = thread_position_in_grid_1d()
31+
# i = thread_position_in_grid().x
3232
# dest[i] = Metal.rand(T)
3333
# return
3434
# end

0 commit comments

Comments
 (0)