2 changes: 1 addition & 1 deletion docs/src/design.md
@@ -8,7 +8,7 @@
- `ldg` on the GPU
- `@aliasscopes` on the CPU

-- Cartesian or Linear indicies supported
+- Cartesian or Linear indices supported
  - `@index(Linear)`
  - `@index(Cartesian)`
- `@synchronize` for inserting workgroup-level synchronization
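For context (not part of this diff), the two indexing styles listed in `design.md` are used as follows; a minimal sketch with illustrative kernel names:

```julia
using KernelAbstractions

# Linear: a single flat index into the ndrange.
@kernel function fill_linear!(A)
    i = @index(Global, Linear)
    @inbounds A[i] = i
end

# Cartesian: a CartesianIndex matching the shape of the ndrange.
@kernel function fill_cartesian!(A)
    I = @index(Global, Cartesian)
    @inbounds A[I] = sum(Tuple(I))
end
```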
4 changes: 2 additions & 2 deletions docs/src/index.md
@@ -58,7 +58,7 @@ The `CPU` backend always had this limitation and upon investigation the CUDA bac
but allows for a wider set of valid kernels.

This highlighted a design flaw in KernelAbstractions. Most GPU implementations execute KernelAbstractions workgroups on static blocks.
-This means a kernel with `ndrange=(32, 30)` might be executed on a static block of `(32,32)`. In order to block these extra indicies,
+This means a kernel with `ndrange=(32, 30)` might be executed on a static block of `(32,32)`. In order to block these extra indices,
KernelAbstractions would insert a dynamic boundscheck.

Prior to v0.9.34 a kernel like
@@ -118,7 +118,7 @@ Since this transformation can be disruptive, user can now opt out of the implici
but users must avoid the use of `@index(Global)` and instead use their own derivation based on `@index(Group)` and `@index(Local)`.

```julia
-@kernel unsafe_indicies=true function localmem(A)
+@kernel unsafe_indices=true function localmem(A)
N = @uniform prod(@groupsize())
gI = @index(Group, Linear)
i = @index(Local, Linear)
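The `localmem` kernel above is cut off by the diff context. As a self-contained sketch of the same pattern (the kernel name and body are illustrative, not from this PR), the global index is derived from `@index(Group)` and `@index(Local)`, and the tail of the `ndrange` is masked by hand:

```julia
using KernelAbstractions

@kernel unsafe_indices = true function scale!(A, factor)
    N = @uniform prod(@groupsize())
    gI = @index(Group, Linear)
    i = @index(Local, Linear)
    idx = (gI - 1) * N + i    # manual replacement for @index(Global, Linear)
    if idx <= length(A)       # manual replacement for the implicit boundscheck
        @inbounds A[idx] *= factor
    end
end
```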
14 changes: 7 additions & 7 deletions src/KernelAbstractions.jl
@@ -50,7 +50,7 @@ synchronize(backend)
```
"""
macro kernel(expr)
-return __kernel(expr, #=generate_cpu=# true, #=force_inbounds=# false, #=unsafe_indicies=# false)
+return __kernel(expr, #=generate_cpu=# true, #=force_inbounds=# false, #=unsafe_indices=# false)
end

"""
@@ -60,7 +60,7 @@ This allows for two different configurations:

1. `cpu={true, false}`: Disables code-generation of the CPU function. This relaxes semantics such that KernelAbstractions primitives can be used in non-kernel functions.
2. `inbounds={false, true}`: Enables a forced `@inbounds` macro around the function definition in the case the user is using too many `@inbounds` already in their kernel. Note that this can lead to incorrect results, crashes, etc and is fundamentally unsafe. Be careful!
-3. `unsafe_indicies={false, true}`: Disables the implicit validation of indicies, users must avoid `@index(Global)`.
+3. `unsafe_indices={false, true}`: Disables the implicit validation of indices, users must avoid `@index(Global)`.

- [`@context`](@ref)

@@ -72,7 +72,7 @@ macro kernel(ex...)
return __kernel(ex[1], true, false, false)
else
generate_cpu = true
-unsafe_indicies = false
+unsafe_indices = false
force_inbounds = false
for i in 1:(length(ex) - 1)
if ex[i] isa Expr && ex[i].head == :(=) &&
@@ -82,19 +82,19 @@ macro kernel(ex...)
ex[i].args[1] == :inbounds && ex[i].args[2] isa Bool
force_inbounds = ex[i].args[2]
elseif ex[i] isa Expr && ex[i].head == :(=) &&
-ex[i].args[1] == :unsafe_indicies && ex[i].args[2] isa Bool
-unsafe_indicies = ex[i].args[2]
+ex[i].args[1] == :unsafe_indices && ex[i].args[2] isa Bool
+unsafe_indices = ex[i].args[2]
else
error(
"Configuration should be of form:\n" *
"* `cpu=false`\n" *
"* `inbounds=true`\n" *
"* `unsafe_indicies=true`\n" *
"* `unsafe_indices=true`\n" *
"got `", ex[i], "`",
)
end
end
-return __kernel(ex[end], generate_cpu, force_inbounds, unsafe_indicies)
+return __kernel(ex[end], generate_cpu, force_inbounds, unsafe_indices)
end
end

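The parsing loop above accepts the three flags in any order ahead of the kernel definition. A hedged usage sketch (kernel name and body illustrative, not from this PR):

```julia
using KernelAbstractions

@kernel inbounds = true unsafe_indices = true function axpy!(Y, a, X)
    N = @uniform prod(@groupsize())
    gI = @index(Group, Linear)
    i = @index(Local, Linear)
    idx = (gI - 1) * N + i
    if idx <= length(Y)
        Y[idx] = muladd(a, X[idx], Y[idx])  # inbounds=true wraps the body in @inbounds
    end
end
```

Adding `cpu = false` would additionally skip generation of the CPU variant, restricting the kernel to GPU backends.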
30 changes: 15 additions & 15 deletions src/macros.jl
@@ -10,7 +10,7 @@ function find_return(stmt)
end

# XXX: Proper errors
-function __kernel(expr, generate_cpu = true, force_inbounds = false, unsafe_indicies = false)
+function __kernel(expr, generate_cpu = true, force_inbounds = false, unsafe_indices = false)
def = splitdef(expr)
name = def[:name]
args = def[:args]
@@ -46,7 +46,7 @@ function __kernel(expr, generate_cpu = true, force_inbounds = false, unsafe_indi

def_gpu = deepcopy(def)
def_gpu[:name] = gpu_name = Symbol(:gpu_, name)
-transform_gpu!(def_gpu, constargs, force_inbounds, unsafe_indicies)
+transform_gpu!(def_gpu, constargs, force_inbounds, unsafe_indices)
gpu_function = combinedef(def_gpu)

# create constructor functions
@@ -78,7 +78,7 @@ end

# The easy case, transform the function for GPU execution
# - mark constant arguments by applying `constify`.
-function transform_gpu!(def, constargs, force_inbounds, unsafe_indicies)
+function transform_gpu!(def, constargs, force_inbounds, unsafe_indices)
let_constargs = Expr[]
for (i, arg) in enumerate(def[:args])
if constargs[i]
@@ -89,13 +89,13 @@ function transform_gpu!(def, constargs, force_inbounds, unsafe_indicies)
new_stmts = Expr[]
body = MacroTools.flatten(def[:body])
push!(new_stmts, Expr(:aliasscope))
-if !unsafe_indicies
+if !unsafe_indices
push!(new_stmts, :(__active_lane__ = $__validindex(__ctx__)))
end
if force_inbounds
push!(new_stmts, Expr(:inbounds, true))
end
-if !unsafe_indicies
+if !unsafe_indices
append!(new_stmts, split(emit_gpu, body.args))
else
push!(new_stmts, body)
@@ -117,7 +117,7 @@ end
# - mark constant arguments by applying `constify`.
# - insert aliasscope markers
# - insert implied loop bodies
-# - handle indicies
+# - handle indices
# - hoist workgroup definitions
# - hoist uniform variables
function transform_cpu!(def, constargs, force_inbounds)
@@ -149,7 +149,7 @@ function transform_cpu!(def, constargs, force_inbounds)
end

struct WorkgroupLoop
-indicies::Vector{Any}
+indices::Vector{Any}
stmts::Vector{Any}
allocations::Vector{Any}
private_allocations::Vector{Any}
@@ -177,7 +177,7 @@ end
function split(
emit,
stmts,
-indicies = Any[], private = Set{Symbol}(),
+indices = Any[], private = Set{Symbol}(),
)
# 1. Split the code into blocks separated by `@synchronize`
# 2. Aggregate `@index` expressions
@@ -191,7 +191,7 @@ function split(
for stmt in stmts
has_sync = find_sync(stmt)
if has_sync
-loop = WorkgroupLoop(deepcopy(indicies), current, allocations, private_allocations, deepcopy(private), is_sync(stmt))
+loop = WorkgroupLoop(deepcopy(indices), current, allocations, private_allocations, deepcopy(private), is_sync(stmt))
push!(new_stmts, emit(loop))
allocations = Any[]
private_allocations = Any[]
@@ -206,7 +206,7 @@ function split(
function recurse(expr::Expr)
expr = unblock(expr)
if is_scope_construct(expr) && any(find_sync, expr.args)
-new_args = unblock(split(emit, expr.args, deepcopy(indicies), deepcopy(private)))
+new_args = unblock(split(emit, expr.args, deepcopy(indices), deepcopy(private)))
return Expr(expr.head, new_args...)
else
return Expr(expr.head, map(recurse, expr.args)...)
@@ -225,7 +225,7 @@ function split(
continue
elseif @capture(stmt, lhs_ = rhs_ | (vs__, lhs_ = rhs_))
if @capture(rhs, @index(args__))
-push!(indicies, stmt)
+push!(indices, stmt)
continue
elseif @capture(rhs, @localmem(args__) | @uniform(args__))
push!(allocations, stmt)
@@ -249,15 +249,15 @@

# everything since the last `@synchronize`
if !isempty(current)
-loop = WorkgroupLoop(deepcopy(indicies), current, allocations, private_allocations, deepcopy(private), false)
+loop = WorkgroupLoop(deepcopy(indices), current, allocations, private_allocations, deepcopy(private), false)
push!(new_stmts, emit(loop))
end
return new_stmts
end

function emit_cpu(loop)
idx = gensym(:I)
-for stmt in loop.indicies
+for stmt in loop.indices
# splice index into the i = @index(Cartesian, $idx)
@assert stmt.head === :(=)
rhs = stmt.args[2]
@@ -300,7 +300,7 @@ function emit_cpu(loop)
loopexpr = quote
for $idx in $__workitems_iterspace(__ctx__)
$__validindex(__ctx__, $idx) || continue
-$(loop.indicies...)
+$(loop.indices...)
$(unblock(body))
end
end
@@ -318,7 +318,7 @@ function emit_gpu(loop)
$(loop.allocations...)
$(loop.private_allocations...)
if __active_lane__
-$(loop.indicies...)
+$(loop.indices...)
$(unblock(body))
end
end
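To see what `split` and the emitters deal with, consider a hedged example (adapted from the localmem test pattern, not part of this PR). Statements before the barrier form one `WorkgroupLoop`, statements after it form another, and the collected `@index` assignments (`i`, `I`) are re-spliced into each emitted loop so they remain valid across `@synchronize`:

```julia
using KernelAbstractions

# Reverse each workgroup's slice of A in place via shared memory.
@kernel function reverse_wg!(A)
    N = @uniform prod(@groupsize())
    I = @index(Global, Linear)
    i = @index(Local, Linear)
    tmp = @localmem eltype(A) (N,)  # ok iff the groupsize is static
    @inbounds tmp[i] = A[I]
    @synchronize                    # on the CPU, the body is split into two workitem loops here
    @inbounds A[I] = tmp[N - i + 1]
end
```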
4 changes: 2 additions & 2 deletions test/localmem.jl
@@ -34,7 +34,7 @@ end
end
end

-@kernel unsafe_indicies = true function localmem_unsafe_indicies(A)
+@kernel unsafe_indices = true function localmem_unsafe_indices(A)
N = @uniform prod(@groupsize())
gI = @index(Group, Linear)
i = @index(Local, Linear)
@@ -49,7 +49,7 @@

function localmem_testsuite(backend, ArrayT)
@testset "kernels" begin
-@testset for kernel! in (localmem(backend(), 16), localmem2(backend(), 16), localmem_unsafe_indicies(backend(), 16))
+@testset for kernel! in (localmem(backend(), 16), localmem2(backend(), 16), localmem_unsafe_indices(backend(), 16))
A = ArrayT{Int}(undef, 64)
kernel!(A, ndrange = size(A))
synchronize(backend())
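The shared testsuite is invoked with a backend type and an array type; for example, assuming the CPU backend:

```julia
using KernelAbstractions
localmem_testsuite(CPU, Array)
```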