Skip to content

Commit 0d1964b

Browse files
committed
fastmath demo
1 parent d373ee0 commit 0d1964b

File tree

3 files changed

+30
-8
lines changed

3 files changed

+30
-8
lines changed

src/KernelAbstractions.jl

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ synchronize(backend)
5050
```
5151
"""
5252
macro kernel(expr)
53-
__kernel(expr, #=generate_cpu=# true, #=force_inbounds=# false)
53+
__kernel(expr, #=generate_cpu=# true, #=force_inbounds=# false, #=force_fastmath=# false)
5454
end
5555

5656
"""
@@ -60,6 +60,7 @@ This allows for two different configurations:
6060
6161
1. `cpu={true, false}`: Disables code-generation of the CPU function. This relaxes semantics such that KernelAbstractions primitives can be used in non-kernel functions.
6262
2. `inbounds={false, true}`: Enables a forced `@inbounds` macro around the function definition, for cases where the kernel would otherwise be littered with individual `@inbounds` annotations. Note that this can lead to incorrect results, crashes, etc., and is fundamentally unsafe. Be careful!
63+
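3. `fastmath={false, true}`: Enables a forced `@fastmath` macro around the function definition. This relaxes strict IEEE floating-point semantics: for example, it may use less precise square roots and flush denormals, so numerical results can differ from the default.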
6364
6465
- [`@context`](@ref)
6566
@@ -72,23 +73,28 @@ macro kernel(ex...)
7273
else
7374
generate_cpu = true
7475
force_inbounds = false
76+
force_fastmath = false
7577
for i in 1:(length(ex) - 1)
7678
if ex[i] isa Expr && ex[i].head == :(=) &&
7779
ex[i].args[1] == :cpu && ex[i].args[2] isa Bool
7880
generate_cpu = ex[i].args[2]
7981
elseif ex[i] isa Expr && ex[i].head == :(=) &&
8082
ex[i].args[1] == :inbounds && ex[i].args[2] isa Bool
8183
force_inbounds = ex[i].args[2]
84+
elseif ex[i] isa Expr && ex[i].head == :(=) &&
85+
ex[i].args[1] == :fastmath && ex[i].args[2] isa Bool
86+
force_fastmath = ex[i].args[2]
8287
else
8388
error(
8489
"Configuration should be of form:\n" *
8590
"* `cpu=true`\n" *
8691
"* `inbounds=false`\n" *
92+
"* `fastmath=false`\n" *
8793
"got `", ex[i], "`",
8894
)
8995
end
9096
end
91-
__kernel(ex[end], generate_cpu, force_inbounds)
97+
__kernel(ex[end], generate_cpu, force_inbounds, force_fastmath)
9298
end
9399
end
94100

src/macros.jl

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ function find_return(stmt)
1010
end
1111

1212
# XXX: Proper errors
13-
function __kernel(expr, generate_cpu = true, force_inbounds = false)
13+
function __kernel(expr, generate_cpu = true, force_inbounds = false, force_fastmath = false)
1414
def = splitdef(expr)
1515
name = def[:name]
1616
args = def[:args]
@@ -40,13 +40,13 @@ function __kernel(expr, generate_cpu = true, force_inbounds = false)
4040
if generate_cpu
4141
def_cpu = deepcopy(def)
4242
def_cpu[:name] = cpu_name
43-
transform_cpu!(def_cpu, constargs, force_inbounds)
43+
transform_cpu!(def_cpu, constargs, force_inbounds, force_fastmath)
4444
cpu_function = combinedef(def_cpu)
4545
end
4646

4747
def_gpu = deepcopy(def)
4848
def_gpu[:name] = gpu_name = Symbol(:gpu_, name)
49-
transform_gpu!(def_gpu, constargs, force_inbounds)
49+
transform_gpu!(def_gpu, constargs, force_inbounds, force_fastmath)
5050
gpu_function = combinedef(def_gpu)
5151

5252
# create constructor functions
@@ -78,7 +78,7 @@ end
7878

7979
# The easy case, transform the function for GPU execution
8080
# - mark constant arguments by applying `constify`.
81-
function transform_gpu!(def, constargs, force_inbounds)
81+
function transform_gpu!(def, constargs, force_inbounds, force_fastmath)
8282
let_constargs = Expr[]
8383
for (i, arg) in enumerate(def[:args])
8484
if constargs[i]
@@ -92,6 +92,11 @@ function transform_gpu!(def, constargs, force_inbounds)
9292
@inbounds $(body)
9393
end
9494
end
95+
if force_fastmath
96+
body = quote
97+
@fastmath $(body)
98+
end
99+
end
95100
body = quote
96101
if $__validindex(__ctx__)
97102
$(body)
@@ -112,7 +117,7 @@ end
112117
# - handle indices
113118
# - hoist workgroup definitions
114119
# - hoist uniform variables
115-
function transform_cpu!(def, constargs, force_inbounds)
120+
function transform_cpu!(def, constargs, force_inbounds, force_fastmath)
116121
let_constargs = Expr[]
117122
for (i, arg) in enumerate(def[:args])
118123
if constargs[i]
@@ -130,6 +135,7 @@ function transform_cpu!(def, constargs, force_inbounds)
130135
if force_inbounds
131136
push!(new_stmts, Expr(:inbounds, :pop))
132137
end
138+
133139
push!(new_stmts, Expr(:popaliasscope))
134140
push!(new_stmts, :(return nothing))
135141
def[:body] = Expr(

test/runtests.jl

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ end
2222
@test_throws ErrorException("This kernel is unavailable for backend CPU") my_no_cpu_kernel(CPU())
2323

2424
# testing multiple configurations at the same time
25-
@kernel cpu = false inbounds = false function my_no_cpu_kernel2(a)
25+
@kernel cpu = false inbounds = false fastmath = false function my_no_cpu_kernel2(a)
2626
end
2727
@test_throws ErrorException("This kernel is unavailable for backend CPU") my_no_cpu_kernel2(CPU())
2828

@@ -43,6 +43,16 @@ if Base.JLOptions().check_bounds == 0 || Base.JLOptions().check_bounds == 2
4343
@test nothing == my_inbounds_kernel(CPU())(Int[], ndrange = 1)
4444
end
4545

46+
if Base.JLOptions().fast_math == 0
47+
@kernel fastmath = true function my_fastmath_kernel(a)
48+
idx = @index(Global, Linear)
49+
a[idx] = sqrt(10)
50+
end
51+
A = [0.0]
52+
my_fastmath_kernel(CPU())(A, ndrange = 1)
53+
@test A[1] == @fastmath sqrt(10)
54+
end
55+
4656
struct NewBackend <: KernelAbstractions.GPU end
4757
@testset "Default host implementation" begin
4858
backend = NewBackend()

0 commit comments

Comments
 (0)