@@ -33,35 +33,47 @@ host to influence how the kernel is executed. The following keyword arguments ar
33
33
no other keyword arguments that influence the launch configuration are specified.
34
34
- `threads::Int` and `blocks::Int`: configure exactly how many threads and blocks are
35
35
launched. This cannot be used in combination with the `total_threads` argument.
36
+ - `name::String`: inform the back end about the name of the kernel to be executed.
37
+ This can be used to emit better diagnostics, and is useful with anonymous kernels.
36
38
"""
37
39
function gpu_call (kernel:: Base.Callable , args... ;
38
40
target:: AbstractArray = first (args),
39
41
total_threads:: Union{Int,Nothing} = nothing ,
40
42
threads:: Union{Int,Nothing} = nothing ,
41
43
blocks:: Union{Int,Nothing} = nothing ,
42
- kwargs ... )
43
- # determine how many threads/blocks to launch
44
+ name :: Union{String,Nothing} = nothing )
45
+ # non-trivial default values for launch configuration
44
46
if total_threads=== nothing && threads=== nothing && blocks=== nothing
45
47
total_threads = length (target)
46
- end
47
- if total_threads != = nothing
48
- if threads != = nothing || blocks != = nothing
49
- error (" Cannot specify both total_threads and threads/blocks configuration" )
50
- end
51
- blocks, threads = thread_blocks_heuristic (total_threads)
52
- else
48
+ elseif total_threads=== nothing
53
49
if threads === nothing
54
50
threads = 1
55
51
end
56
52
if blocks === nothing
57
53
blocks = 1
58
54
end
55
+ elseif threads!= = nothing || blocks!= = nothing
56
+ error (" Cannot specify both total_threads and threads/blocks configuration" )
57
+ end
58
+
59
+ if total_threads != = nothing
60
+ gpu_call (backend (target), kernel, args, total_threads; name= name)
61
+ else
62
+ gpu_call (backend (target), kernel, args, threads, blocks; name= name)
59
63
end
64
+ end
60
65
61
- gpu_call (backend (target), kernel, args... ; threads= threads, blocks= blocks, kwargs... )
66
+ # gpu_call method with a simple launch configuration heuristic.
67
+ # this can be specialised if more sophisticated heuristics are available.
68
+ function gpu_call (backend:: AbstractGPUBackend , kernel, args, total_threads:: Int ; kwargs... )
69
+ threads = clamp (total_threads, 1 , 256 )
70
+ blocks = max (ceil (Int, total_threads / threads), 1 )
71
+
72
+ gpu_call (backend, kernel, args, threads, blocks; kwargs... )
62
73
end
63
74
64
- gpu_call (backend:: AbstractGPUBackend , kernel, args... ; kwargs... ) = error (" Not implemented" ) # COV_EXCL_LINE
75
+ # bottom-line gpu_call method that is expected to be implemented by the back end
76
+ gpu_call (backend:: AbstractGPUBackend , kernel, args, threads:: Int , blocks:: Int ; kwargs... ) = error (" Not implemented" ) # COV_EXCL_LINE
65
77
66
78
"""
67
79
synchronize(A::AbstractArray)
@@ -72,10 +84,3 @@ function synchronize(A::AbstractArray)
72
84
# fallback is a noop, for backends not needing synchronization. This
73
85
# makes it easier to write generic code that also works for AbstractArrays
74
86
end
75
-
76
- function thread_blocks_heuristic (len:: Integer )
77
- # TODO better threads default
78
- threads = clamp (len, 1 , 256 )
79
- blocks = max (ceil (Int, len / threads), 1 )
80
- (blocks, threads)
81
- end
0 commit comments