Skip to content

Commit de18f89

Browse files
authored
Merge branch 'main' into fe/return
2 parents 973240c + 4585ca9 commit de18f89

File tree

11 files changed

+202
-14
lines changed

11 files changed

+202
-14
lines changed

.github/workflows/Formatting.yml

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
name: 'Format'
2+
3+
on:
4+
pull_request_target:
5+
paths: ['**/*.jl']
6+
types: [opened, synchronize, reopened, ready_for_review]
7+
8+
permissions:
9+
contents: read
10+
11+
# needed for julia-actions/cache to delete old caches
12+
actions: write
13+
14+
# needed for googleapis/code-suggester
15+
pull-requests: write
16+
17+
jobs:
18+
runic:
19+
runs-on: ubuntu-latest
20+
if: github.event.pull_request.draft == false
21+
steps:
22+
- uses: actions/checkout@v4
23+
with:
24+
ref: ${{github.event.pull_request.head.ref}}
25+
repository: ${{github.event.pull_request.head.repo.full_name}}
26+
fetch-depth: 0
27+
28+
- name: Setup Julia
29+
uses: julia-actions/setup-julia@v2
30+
with:
31+
version: '1'
32+
arch: 'x64'
33+
- uses: julia-actions/cache@v2
34+
35+
- name: Install Runic
36+
run: |
37+
julia --project=@runic -e 'using Pkg; Pkg.add("Runic")'
38+
curl -o git-runic https://raw.githubusercontent.com/fredrikekre/Runic.jl/master/bin/git-runic
39+
chmod +x git-runic
40+
sudo mv git-runic /usr/local/bin
41+
42+
- name: Run Runic
43+
run: |
44+
set +e
45+
git runic origin/main
46+
[ $? -eq 2 ] && exit 1 || exit 0
47+
48+
- name: Suggest changes
49+
uses: googleapis/code-suggester@v2
50+
env:
51+
ACCESS_TOKEN: ${{ secrets.GITHUB_TOKEN }}
52+
with:
53+
command: review
54+
pull_number: ${{ github.event.pull_request.number }}
55+
git_dir: '.'

Project.toml

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "KernelAbstractions"
22
uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
33
authors = ["Valentin Churavy <[email protected]> and contributors"]
4-
version = "0.9.29"
4+
version = "0.9.32"
55

66
[deps]
77
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
@@ -15,12 +15,10 @@ Requires = "ae029012-a4dd-5104-9daa-d747884805df"
1515
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
1616
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
1717
UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
18-
UnsafeAtomics = "013be700-e6cd-48c3-b4a1-df204f14c38f"
19-
UnsafeAtomicsLLVM = "d80eeb9a-aca5-4d75-85e5-170c8b632249"
2018

2119
[compat]
2220
Adapt = "0.4, 1.0, 2.0, 3.0, 4"
23-
Atomix = "0.1"
21+
Atomix = "0.1, 1"
2422
EnzymeCore = "0.7, 0.8.1"
2523
InteractiveUtils = "1.6"
2624
LinearAlgebra = "1.6"
@@ -30,8 +28,6 @@ Requires = "1.3"
3028
SparseArrays = "<0.0.1, 1.6"
3129
StaticArrays = "0.12, 1.0"
3230
UUIDs = "<0.0.1, 1.6"
33-
UnsafeAtomics = "0.2.1"
34-
UnsafeAtomicsLLVM = "0.1, 0.2"
3531
julia = "1.6"
3632

3733
[extensions]

docs/src/quickstart.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ The [`synchronize`](@ref) blocks the *host* until the kernel has completed on th
4545

4646
## Launching kernel on the backend
4747

48-
To launch the kernel on a backend-supported backend `isa(backend, KA.GPU)` (e.g., `CUDABackend()`, `ROCBackend()`, `oneBackend()`), we generate the kernel
48+
To launch the kernel on a backend-supported backend `isa(backend, KA.GPU)` (e.g., `CUDABackend()`, `ROCBackend()`, `oneAPIBackend()`), we generate the kernel
4949
for this backend.
5050

5151
First, we initialize the array using the Array constructor of the chosen backend with

src/KernelAbstractions.jl

Lines changed: 74 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ export synchronize, get_backend, allocate
1010
import PrecompileTools
1111

1212
import Atomix: @atomic, @atomicswap, @atomicreplace
13-
import UnsafeAtomics
1413

1514
using MacroTools
1615
using StaticArrays
@@ -111,13 +110,53 @@ macro Const end
111110
"""
112111
copyto!(::Backend, dest::AbstractArray, src::AbstractArray)
113112
114-
Perform a `copyto!` operation that execution ordered with respect to the backend.
113+
Perform an asynchronous `copyto!` operation that is execution ordered with respect to the back-end.
114+
115+
For most users, `Base.copyto!` should suffice, performing a simple, synchronous copy.
116+
Only when you know you need asynchronicity w.r.t. the host, you should consider using
117+
this asynchronous version, which requires additional lifetime guarantees as documented below.
118+
119+
!!! warning
120+
121+
Because of the asynchronous nature of this operation, the user is required to guarantee that the lifetime
122+
of the source extends past the *completion* of the copy operation so as to avoid a use-after-free. It is not
123+
sufficient to simply use `GC.@preserve` around the call to `copyto!`, because that only extends the
124+
lifetime past the operation getting queued. Instead, it may be required to `synchronize()`,
125+
or otherwise guarantee that the source will still be around when the copy is executed:
126+
127+
```julia
128+
arr = zeros(64)
129+
GC.@preserve arr begin
130+
copyto!(backend, arr, ...)
131+
# other operations
132+
synchronize(backend)
133+
end
134+
```
115135
116136
!!! note
117-
Backend implementations **must** implement this function.
137+
138+
On some back-ends it may be necessary to first call [`pagelock!`](@ref) on host memory
139+
to enable fully asynchronous behavior w.r.t. the host.
140+
141+
!!! note
142+
Backends **must** implement this function.
118143
"""
119144
function copyto! end
120145

146+
"""
147+
pagelock!(::Backend, dest::AbstractArray)
148+
149+
Pagelock (pin) a host memory buffer for a backend device. This may be necessary for [`copyto!`](@ref)
150+
to perform asynchronously w.r.t. the host.
151+
152+
This function should return `nothing`; or `missing` if not implemented.
153+
154+
155+
!!! note
156+
Backends **may** implement this function.
157+
"""
158+
function pagelock! end
159+
121160
"""
122161
synchronize(::Backend)
123162
@@ -547,6 +586,34 @@ function priority!(::Backend, prio::Symbol)
547586
return nothing
548587
end
549588

589+
"""
590+
device(::Backend)::Int
591+
592+
Returns the ordinal number of the currently active device starting at one.
593+
"""
594+
function device(::Backend)
595+
return 1
596+
end
597+
598+
"""
599+
ndevices(::Backend)::Int
600+
601+
Returns the number of devices the backend supports.
602+
"""
603+
function ndevices(::Backend)
604+
return 1
605+
end
606+
607+
"""
608+
device!(::Backend, id::Int)
609+
"""
610+
function device!(backend::Backend, id::Int)
611+
if !(0 < id <= ndevices(backend))
612+
throw(ArgumentError("Device id $id out of bounds."))
613+
end
614+
return nothing
615+
end
616+
550617
"""
551618
functional(::Backend)
552619
@@ -563,6 +630,10 @@ function functional(::Backend)
563630
return missing
564631
end
565632

633+
function pagelock!(::Backend, x)
634+
return missing
635+
end
636+
566637
include("nditeration.jl")
567638
using .NDIteration
568639
import .NDIteration: get

src/compiler.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,3 +30,4 @@ end
3030

3131
@inline groupsize(ctx) = __groupsize(ctx)
3232
@inline ndrange(ctx) = __ndrange(ctx)
33+
@inline Base.ndims(ctx) = ndims(__iterspace(ctx))

src/cpu.jl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
import UnsafeAtomicsLLVM
2-
31
unsafe_free!(::AbstractArray) = return
42
synchronize(::CPU) = nothing
53

@@ -35,6 +33,7 @@ function copyto!(backend::CPU, A, B)
3533
end
3634

3735
functional(::CPU) = true
36+
pagelock!(::CPU, x) = nothing
3837

3938
function (obj::Kernel{CPU})(args...; ndrange = nothing, workgroupsize = nothing)
4039
ndrange, workgroupsize, iterspace, dynamic = launch_config(obj, ndrange, workgroupsize)
@@ -43,7 +42,8 @@ function (obj::Kernel{CPU})(args...; ndrange = nothing, workgroupsize = nothing)
4342
return nothing
4443
end
4544

46-
return __run(obj, ndrange, iterspace, args, dynamic, obj.backend.static)
45+
__run(obj, ndrange, iterspace, args, dynamic, obj.backend.static)
46+
return nothing
4747
end
4848

4949
const CPU_GRAINSIZE = 1024 # Vectorization, 4x unrolling, minimal grain size

src/nditeration.jl

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ end
6363
@inline workitems(range::NDRange{N, B, W}) where {N, B, W <: StaticSize} = CartesianIndices(get(W))::CartesianIndices{N}
6464
@inline blocks(range::NDRange{N, B}) where {N, B <: DynamicSize} = range.blocks::CartesianIndices{N}
6565
@inline blocks(range::NDRange{N, B}) where {N, B <: StaticSize} = CartesianIndices(get(B))::CartesianIndices{N}
66+
@inline Base.ndims(::NDRange{N}) where {N} = N
6667

6768
import Base.iterate
6869
@inline iterate(range::NDRange) = iterate(blocks(range))
@@ -80,8 +81,44 @@ Base.length(range::NDRange) = length(blocks(range))
8081
return CartesianIndex(nI)
8182
end
8283

84+
85+
"""
86+
assume(cond::Bool)
87+
88+
Assume that the condition `cond` is true. This is a hint to the compiler, possibly enabling
89+
it to optimize more aggressively.
90+
"""
91+
@inline assume(cond::Bool) = Base.llvmcall(
92+
(
93+
"""
94+
declare void @llvm.assume(i1)
95+
96+
define void @entry(i8) #0 {
97+
%cond = icmp eq i8 %0, 1
98+
call void @llvm.assume(i1 %cond)
99+
ret void
100+
}
101+
102+
attributes #0 = { alwaysinline }""", "entry",
103+
),
104+
Nothing, Tuple{Bool}, cond
105+
)
106+
107+
@inline function assume_nonzero(CI::CartesianIndices)
108+
return ntuple(Val(ndims(CI))) do I
109+
Base.@_inline_meta
110+
indices = CI.indices[I]
111+
assume(indices.stop > 0)
112+
end
113+
end
114+
83115
Base.@propagate_inbounds function expand(ndrange::NDRange, groupidx::Integer, idx::Integer)
84-
return expand(ndrange, blocks(ndrange)[groupidx], workitems(ndrange)[idx])
116+
# this causes an exception branch and a div
117+
B = blocks(ndrange)
118+
W = workitems(ndrange)
119+
assume_nonzero(B)
120+
assume_nonzero(W)
121+
return expand(ndrange, B[groupidx], workitems(ndrange)[idx])
85122
end
86123

87124
Base.@propagate_inbounds function expand(ndrange::NDRange{N}, groupidx::CartesianIndex{N}, idx::Integer) where {N}

test/compiler.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ function compiler_testsuite(backend, ArrayT)
4141
kernel = index(CPU(), DynamicSize(), DynamicSize())
4242
iterspace = NDRange{1, StaticSize{(128,)}, StaticSize{(8,)}}()
4343
ctx = KernelAbstractions.mkcontext(kernel, 1, nothing, iterspace, Val(KernelAbstractions.NoDynamicCheck()))
44+
@test ndims(ctx) == 1
4445
@test KernelAbstractions.__index_Global_NTuple(ctx, CartesianIndex(1)) == (1,)
4546

4647
A = ArrayT{Int}(undef, 1)

test/devices.jl

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
function devices_testsuite(Backend)
2+
backend = Backend()
3+
4+
current_device = KernelAbstractions.device(backend)
5+
for i in KernelAbstractions.ndevices(backend)
6+
KernelAbstractions.device!(backend, i)
7+
@test KernelAbstractions.device(backend) == i
8+
end
9+
10+
@test_throws ArgumentError KernelAbstractions.device!(backend, 0)
11+
@test_throws ArgumentError KernelAbstractions.device!(backend, KernelAbstractions.ndevices(backend) + 1)
12+
KernelAbstractions.device!(backend, current_device)
13+
return nothing
14+
end

test/nditeration.jl

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,12 @@ function nditeration_testsuite()
77
let ndrange = NDRange{2, DynamicSize, DynamicSize}(CartesianIndices((256, 256)), CartesianIndices((32, 32)))
88
@test length(ndrange) == 256 * 256
99
@test all(p -> p[1] == p[2], zip(ndrange, CartesianIndices((256, 256))))
10+
@test ndims(ndrange) == 2
1011
end
1112
let ndrange = NDRange{2, StaticSize{(256, 256)}, DynamicSize}(nothing, CartesianIndices((32, 32)))
1213
@test length(ndrange) == 256 * 256
1314
@test all(p -> p[1] == p[2], zip(ndrange, CartesianIndices((256, 256))))
15+
@test ndims(ndrange) == 2
1416
end
1517
end
1618

@@ -40,13 +42,15 @@ function nditeration_testsuite()
4042
I = Tuple(I)
4143
@test check(idx, i - 1, ntuple(i -> I[i] - 1, length(I))..., Dim_x, Dim_y)
4244
end
45+
@test ndims(ndrange) == 2
4346
end
4447
let ndrange = NDRange{2, DynamicSize, DynamicSize}(CartesianIndices((4, 4)), CartesianIndices((Dim_x, Dim_y)))
4548
idx = linear_iteration(ndrange)
4649
for (i, I) in zip(1:length(blocks(ndrange)), blocks(ndrange))
4750
I = Tuple(I)
4851
@test check(idx, i - 1, ntuple(i -> I[i] - 1, length(I))..., Dim_x, Dim_y)
4952
end
53+
@test ndims(ndrange) == 2
5054
end
5155

5256
Dim_x = 32
@@ -58,13 +62,15 @@ function nditeration_testsuite()
5862
I = Tuple(I)
5963
@test check(idx, i - 1, ntuple(i -> I[i] - 1, length(I))..., Dim_x, Dim_y)
6064
end
65+
@test ndims(ndrange) == 2
6166
end
6267
let ndrange = NDRange{2, DynamicSize, DynamicSize}(CartesianIndices((4, 4 * 32)), CartesianIndices((Dim_x, Dim_y)))
6368
idx = linear_iteration(ndrange)
6469
for (i, I) in zip(1:length(blocks(ndrange)), blocks(ndrange))
6570
I = Tuple(I)
6671
@test check(idx, i - 1, ntuple(i -> I[i] - 1, length(I))..., Dim_x, Dim_y)
6772
end
73+
@test ndims(ndrange) == 2
6874
end
6975

7076
Dim_x = 1
@@ -76,13 +82,15 @@ function nditeration_testsuite()
7682
I = Tuple(I)
7783
@test check(idx, i - 1, ntuple(i -> I[i] - 1, length(I))..., Dim_x, Dim_y)
7884
end
85+
@test ndims(ndrange) == 2
7986
end
8087
let ndrange = NDRange{2, DynamicSize, DynamicSize}(CartesianIndices((4 * 32, 4)), CartesianIndices((Dim_x, Dim_y)))
8188
idx = linear_iteration(ndrange)
8289
for (i, I) in zip(1:length(blocks(ndrange)), blocks(ndrange))
8390
I = Tuple(I)
8491
@test check(idx, i - 1, ntuple(i -> I[i] - 1, length(I))..., Dim_x, Dim_y)
8592
end
93+
@test ndims(ndrange) == 2
8694
end
8795
end
8896
return

0 commit comments

Comments
 (0)