JuliaGPU
diff --git a/README.md b/README.md
Lines changed: 1 addition & 1 deletion
diff --git a/docs/src/api/kernel.md b/docs/src/api/kernel.md
Lines changed: 8 additions & 8 deletions
diff --git a/docs/src/profiling.md b/docs/src/profiling.md
Lines changed: 2 additions & 2 deletions
diff --git a/docs/src/usage/kernel.md b/docs/src/usage/kernel.md
Lines changed: 14 additions & 9 deletions
diff --git a/examples/gtk.jl b/examples/gtk.jl
Lines changed: 1 addition & 1 deletion
diff --git a/examples/peakflops.jl b/examples/peakflops.jl
Lines changed: 1 addition & 1 deletion
diff --git a/examples/unified_memory.jl b/examples/unified_memory.jl
Lines changed: 2 additions & 2 deletions
diff --git a/examples/vadd.jl b/examples/vadd.jl
Lines changed: 1 addition & 1 deletion
diff --git a/perf/byval.jl b/perf/byval.jl
Lines changed: 3 additions & 3 deletions
diff --git a/perf/kernel.jl b/perf/kernel.jl
Lines changed: 3 additions & 3 deletions
@@ -111,7 +111,7 @@ are supported, etc):
 
 ```julia-repl
 julia> function vadd(a, b, c)
-           i = thread_position_in_grid_1d()
+           i = thread_position_in_grid().x
            c[i] = a[i] + b[i]
            return
        end
 
@@ -13,19 +13,19 @@ This is made possible by interfacing with the Metal libraries by wrapping a subs
 thread_index_in_quadgroup
 thread_index_in_simdgroup
 thread_index_in_threadgroup
-thread_position_in_grid_1d
-thread_position_in_threadgroup_1d
-threadgroup_position_in_grid_1d
-threadgroups_per_grid_1d
-threads_per_grid_1d
+thread_position_in_grid
+thread_position_in_threadgroup
+threadgroup_position_in_grid
+threadgroups_per_grid
+threads_per_grid
 threads_per_simdgroup
-threads_per_threadgroup_1d
+threads_per_threadgroup
 simdgroups_per_threadgroup
 simdgroup_index_in_threadgroup
 quadgroup_index_in_threadgroup
 quadgroups_per_threadgroup
-grid_size_1d
-grid_origin_1d
+grid_size
+grid_origin
 thread_execution_width
 ```
 
 
@@ -34,7 +34,7 @@ The resulting trace can be opened with the Instruments app, part of Xcode.
 julia> using Metal
 
 julia> function vadd(a, b, c)
-           i = thread_position_in_grid_1d()
+           i = thread_position_in_grid().x
            c[i] = a[i] + b[i]
            return
        end
@@ -78,7 +78,7 @@ $ METAL_CAPTURE_ENABLED=1 julia
 julia> using Metal
 
 julia> function vadd(a, b, c)
-           i = thread_position_in_grid_1d()
+           i = thread_position_in_grid().x
            c[i] = a[i] + b[i]
            return
        end
 
@@ -43,21 +43,26 @@ also query what the grid and threadgroup sizes are as well.
 For Metal.jl, these values are accessed via the following functions:
 
 - `thread_index_in_threadgroup()`
-- `grid_size_Xd()`
-- `thread_position_in_grid_Xd()`
-- `thread_position_in_threadgroup_Xd()`
-- `threadgroup_position_in_grid_Xd()`
-- `threadgroups_per_grid_Xd()`
-- `threads_per_grid_Xd()`
-- `threads_per_threadgroup_Xd()`
+- `grid_size()`
+- `thread_position_in_grid()`
+- `thread_position_in_threadgroup()`
+- `threadgroup_position_in_grid()`
+- `threadgroups_per_grid()`
+- `threads_per_grid()`
+- `threads_per_threadgroup()`
 
-*Where 'X' is 1, 2, or 3 according to the number of dimensions requested.*
+!!! note
+    Prior to Metal.jl v1.9, the aforementioned indexing intrinsics had a `_Xd` suffix,
+    where 'X' was 1, 2, or 3 according to the number of dimensions requested.
+
+    These suffixed functions are deprecated; use the unsuffixed versions instead, which
+    behave like the `_3d` variants (e.g. take `.x` of the result for a 1D index).
 
 Using these in a kernel (taken directly from the [vadd example](https://github.com/JuliaGPU/Metal.jl/blob/main/examples/vadd.jl)):
 
 ```julia
 function vadd(a, b, c)
-    i = thread_position_in_grid_1d()
+    i = thread_position_in_grid().x
     c[i] = a[i] + b[i]
     return
 end
 
@@ -19,7 +19,7 @@ end
 
 function generate(img, pos)
     r, c = Int32.(size(img))
-    i,j = thread_position_in_grid_2d()
+    i,j,_ = thread_position_in_grid()
     @inbounds if i <= r && j <= c
         img[i,j] = pos < j < pos + 10 ? colorant"red" : colorant"thistle"
     end
 
@@ -2,7 +2,7 @@ using Metal
 using BenchmarkTools
 
 function kernel_fma(a, b, c, out)
-    i = thread_position_in_grid_1d()
+    i = thread_position_in_grid().x
     a_val = a[i]
     b_val = b[i]
     c_val = c[i]
 
@@ -15,7 +15,7 @@ using LinearAlgebra
 # This document is meant to showcase potential use cases allowed by unified memory.
 
 function simple_kernel(arr)
-    idx = thread_position_in_grid_1d()
+    idx = thread_position_in_grid().x
     arr[idx] = cos(arr[idx])
     return
 end
@@ -60,7 +60,7 @@ svd(arr_cpu)
 # TODO: Come up with simultaneous launch of GPU and CPU work that results in undesired behavior
 
 function long_kernel(arr, dummy)
-    idx = thread_position_in_grid_1d()
+    idx = thread_position_in_grid().x
     for i in 1:100000
         dummy[1] += Float32(0.3)
     end
 
@@ -2,7 +2,7 @@ using Test
 using Metal
 
 function vadd(a, b, c)
-    i = thread_position_in_grid_1d()
+    i = thread_position_in_grid().x
     c[i] = a[i] + b[i]
     return
 end
 
@@ -6,7 +6,7 @@ const threads = 256
 
 # simple add matrixes kernel
 function kernel_add_mat(n, x1, x2, y)
-    i = thread_position_in_grid_1d()
+    i = thread_position_in_grid().x
     if i <= n
         @inbounds y[i] = x1[i] + x2[i]
     end
@@ -19,8 +19,8 @@ end
 
 # add arrays of matrixes kernel
 function kernel_add_mat_z_slices(n, vararg...)
-    x1, x2, y = get_inputs3(threadgroup_position_in_grid_2d().y, vararg...)
-    i = thread_position_in_grid_1d()
+    x1, x2, y = get_inputs3(threadgroup_position_in_grid().y, vararg...)
+    i = thread_position_in_grid().x
     if i <= n
         @inbounds y[i] = x1[i] + x2[i]
     end
 
@@ -13,22 +13,22 @@ group["launch"] = @benchmarkable @metal identity(nothing)
 src = Metal.rand(Float32, 512, 1000)
 dest = similar(src)
 function indexing_kernel(dest, src)
-    i = thread_position_in_grid_1d()
+    i = thread_position_in_grid().x
     @inbounds dest[i] = src[i]
     return
 end
 group["indexing"] = @async_benchmarkable @metal threads=size(src,1) groups=size(src,2) $indexing_kernel($dest, $src)
 
 function checked_indexing_kernel(dest, src)
-    i = thread_position_in_grid_1d()
+    i = thread_position_in_grid().x
     dest[i] = src[i]
     return
 end
 group["indexing_checked"] = @async_benchmarkable @metal threads=size(src,1) groups=size(src,2) $checked_indexing_kernel($dest, $src)
 
 ## DELETE
 # function rand_kernel(dest::AbstractArray{T}) where {T}
-#     i = thread_position_in_grid_1d()
+#     i = thread_position_in_grid().x
 #     dest[i] = Metal.rand(T)
 #     return
 # end