diff --git a/docs/src/api.md b/docs/src/api.md
index 9373d231a..4e107075b 100644
--- a/docs/src/api.md
+++ b/docs/src/api.md
@@ -21,6 +21,7 @@ allocate
 
 ```@docs
 KernelAbstractions.zeros
+KernelAbstractions.supports_unified
 ```
 
 ## Internal
diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index 15757e3a2..64e213a5b 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -524,47 +524,74 @@ end
 # adapt_storage(::Backend, a::BackendArray) = a
 
 """
-    allocate(::Backend, Type, dims...)::AbstractArray
+    allocate(::Backend, Type, dims...; unified=false)::AbstractArray
 
-Allocate a storage array appropriate for the computational backend.
+Allocate a storage array appropriate for the computational backend. `unified=true`
+allocates an array using unified memory if the backend supports it and throws an error
+otherwise. Use [`supports_unified`](@ref) to determine whether it is supported by a backend.
 
 !!! note
     Backend implementations **must** implement `allocate(::NewBackend, T, dims::Tuple)`
-"""
-allocate(backend::Backend, T::Type, dims...) = allocate(backend, T, dims)
-allocate(backend::Backend, T::Type, dims::Tuple) = throw(MethodError(allocate, (backend, T, dims)))
+    Backend implementations **should** implement `allocate(::NewBackend, T, dims::Tuple; unified::Bool=false)`
+"""
+allocate(backend::Backend, T::Type, dims...; kwargs...) = allocate(backend, T, dims; kwargs...)
+function allocate(backend::Backend, T::Type, dims::Tuple; unified::Union{Nothing, Bool} = nothing)
+    if isnothing(unified)
+        throw(MethodError(allocate, (backend, T, dims)))
+    elseif unified
+        throw(ArgumentError("`$(typeof(backend))` does not support unified memory. If you believe it does, please open a GitHub issue."))
+    else
+        return allocate(backend, T, dims)
+    end
+end
+
 """
-    zeros(::Backend, Type, dims...)::AbstractArray
+    zeros(::Backend, Type, dims...; unified=false)::AbstractArray
 
 Allocate a storage array appropriate for the computational backend filled with zeros.
+`unified=true` allocates an array using unified memory if the backend supports it and
+throws an error otherwise.
 """
-zeros(backend::Backend, T::Type, dims...) = zeros(backend, T, dims)
-function zeros(backend::Backend, ::Type{T}, dims::Tuple) where {T}
-    data = allocate(backend, T, dims...)
+zeros(backend::Backend, T::Type, dims...; kwargs...) = zeros(backend, T, dims; kwargs...)
+function zeros(backend::Backend, ::Type{T}, dims::Tuple; kwargs...) where {T}
+    data = allocate(backend, T, dims...; kwargs...)
     fill!(data, zero(T))
     return data
 end
 
 """
-    ones(::Backend, Type, dims...)::AbstractArray
+    ones(::Backend, Type, dims...; unified=false)::AbstractArray
 
 Allocate a storage array appropriate for the computational backend filled with ones.
+`unified=true` allocates an array using unified memory if the backend supports it and
+throws an error otherwise.
 """
-ones(backend::Backend, T::Type, dims...) = ones(backend, T, dims)
-function ones(backend::Backend, ::Type{T}, dims::Tuple) where {T}
-    data = allocate(backend, T, dims)
+ones(backend::Backend, T::Type, dims...; kwargs...) = ones(backend, T, dims; kwargs...)
+function ones(backend::Backend, ::Type{T}, dims::Tuple; kwargs...) where {T}
+    data = allocate(backend, T, dims; kwargs...)
    fill!(data, one(T))
     return data
 end
 
+"""
+    supports_unified(::Backend)::Bool
+
+Returns whether unified memory arrays are supported by the backend.
+
+!!! note
+    Backend implementations **must** implement this function
+    only if they **do** support unified memory.
+""" +supports_unified(::Backend) = false + """ supports_atomics(::Backend)::Bool Returns whether `@atomic` operations are supported by the backend. !!! note - Backend implementations **must** implement this function, + Backend implementations **must** implement this function only if they **do not** support atomic operations with Atomix. """ supports_atomics(::Backend) = true @@ -575,7 +602,7 @@ supports_atomics(::Backend) = true Returns whether `Float64` values are supported by the backend. !!! note - Backend implementations **must** implement this function, + Backend implementations **must** implement this function only if they **do not** support `Float64`. """ supports_float64(::Backend) = true diff --git a/src/cpu.jl b/src/cpu.jl deleted file mode 100644 index e383386f7..000000000 --- a/src/cpu.jl +++ /dev/null @@ -1,224 +0,0 @@ -synchronize(::CPU) = nothing - -allocate(::CPU, ::Type{T}, dims::Tuple) where {T} = Array{T}(undef, dims) - -function zeros(backend::CPU, ::Type{T}, dims::Tuple) where {T} - arr = allocate(backend, T, dims) - kernel = init_kernel(backend) - kernel(arr, zero, T, ndrange = length(arr)) - return arr -end -function ones(backend::CPU, ::Type{T}, dims::Tuple) where {T} - arr = allocate(backend, T, dims) - kernel = init_kernel(backend) - kernel(arr, one, T; ndrange = length(arr)) - return arr -end - -function copyto!(backend::CPU, A, B) - if get_backend(A) == get_backend(B) && get_backend(A) isa CPU - if length(A) != length(B) - error("Arrays must match in length") - end - if Base.mightalias(A, B) - error("Arrays may not alias") - end - kernel = copy_kernel(backend) - kernel(A, B, ndrange = length(A)) - return A - else - return Base.copyto!(A, B) - end -end - -functional(::CPU) = true -pagelock!(::CPU, x) = nothing - -function (obj::Kernel{CPU})(args...; ndrange = nothing, workgroupsize = nothing) - ndrange, workgroupsize, iterspace, dynamic = launch_config(obj, ndrange, workgroupsize) - - if length(blocks(iterspace)) == 0 - return nothing - end - - __run(obj, ndrange, iterspace, args, dynamic, obj.backend.static) - return nothing -end - -const CPU_GRAINSIZE = 1024 # Vectorization, 4x unrolling, minimal grain size -function default_cpu_workgroupsize(ndrange) - # if the total kernel is small, don't launch multiple tasks - n = prod(ndrange) - if iszero(n) - # If the ndrange is zero return a workgroupsize of (1, 1,...) 
-        return map(one, ndrange)
-    elseif n <= CPU_GRAINSIZE
-        return ndrange
-    else
-        available = Ref(CPU_GRAINSIZE)
-        return ntuple(length(ndrange)) do i
-            dim = ndrange[i]
-            remaining = available[]
-            if remaining == 0
-                return 1
-            elseif remaining <= dim
-                available[] = 0
-                return remaining
-            else
-                available[] = remaining ÷ dim
-                return dim
-            end
-        end
-    end
-end
-
-@inline function launch_config(kernel::Kernel{CPU}, ndrange, workgroupsize)
-    if ndrange isa Integer
-        ndrange = (ndrange,)
-    end
-    if workgroupsize isa Integer
-        workgroupsize = (workgroupsize,)
-    end
-
-    if KernelAbstractions.workgroupsize(kernel) <: DynamicSize && workgroupsize === nothing
-        workgroupsize = default_cpu_workgroupsize(ndrange)
-    end
-    iterspace, dynamic = partition(kernel, ndrange, workgroupsize)
-    # partition checked that the ndrange's agreed
-    if KernelAbstractions.ndrange(kernel) <: StaticSize
-        ndrange = nothing
-    end
-
-    return ndrange, workgroupsize, iterspace, dynamic
-end
-
-# Inference barriers
-function __run(obj, ndrange, iterspace, args, dynamic, static_threads)
-    N = length(iterspace)
-    Nthreads = Threads.nthreads()
-    if Nthreads == 1
-        len, rem = N, 0
-    else
-        len, rem = divrem(N, Nthreads)
-    end
-    # not enough iterations for all the threads?
-    if len == 0
-        Nthreads = N
-        len, rem = 1, 0
-    end
-    if Nthreads == 1
-        __thread_run(1, len, rem, obj, ndrange, iterspace, args, dynamic)
-    else
-        if static_threads
-            Threads.@threads :static for tid in 1:Nthreads
-                __thread_run(tid, len, rem, obj, ndrange, iterspace, args, dynamic)
-            end
-        else
-            @sync for tid in 1:Nthreads
-                Threads.@spawn __thread_run(tid, len, rem, obj, ndrange, iterspace, args, dynamic)
-            end
-        end
-    end
-    return nothing
-end
-
-function __thread_run(tid, len, rem, obj, ndrange, iterspace, args, dynamic)
-    # compute this thread's iterations
-    f = 1 + ((tid - 1) * len)
-    l = f + len - 1
-    # distribute remaining iterations evenly
-    if rem > 0
-        if tid <= rem
-            f = f + (tid - 1)
-            l = l + tid
-        else
-            f = f + rem
-            l = l + rem
-        end
-    end
-    # run this thread's iterations
-    for i in f:l
-        block = @inbounds blocks(iterspace)[i]
-        ctx = mkcontext(obj, block, ndrange, iterspace, dynamic)
-        obj.f(ctx, args...)
-    end
-    return nothing
-end
-
-function mkcontext(kernel::Kernel{CPU}, I, _ndrange, iterspace, ::Dynamic) where {Dynamic}
-    return CompilerMetadata{ndrange(kernel), Dynamic}(I, _ndrange, iterspace)
-end
-
-@inline function __index_Local_Linear(ctx, idx::CartesianIndex)
-    indices = workitems(__iterspace(ctx))
-    return @inbounds LinearIndices(indices)[idx]
-end
-
-@inline function __index_Group_Linear(ctx, idx::CartesianIndex)
-    indices = blocks(__iterspace(ctx))
-    return @inbounds LinearIndices(indices)[__groupindex(ctx)]
-end
-
-@inline function __index_Global_Linear(ctx, idx::CartesianIndex)
-    I = @inbounds expand(__iterspace(ctx), __groupindex(ctx), idx)
-    return @inbounds LinearIndices(__ndrange(ctx))[I]
-end
-
-@inline function __index_Local_Cartesian(_, idx::CartesianIndex)
-    return idx
-end
-
-@inline function __index_Group_Cartesian(ctx, ::CartesianIndex)
-    return __groupindex(ctx)
-end
-
-@inline function __index_Global_Cartesian(ctx, idx::CartesianIndex)
-    return @inbounds expand(__iterspace(ctx), __groupindex(ctx), idx)
-end
-
-@inline function __validindex(ctx, idx::CartesianIndex)
-    # Turns this into a noop for code where we can turn of checkbounds of
-    if __dynamic_checkbounds(ctx)
-        I = @inbounds expand(__iterspace(ctx), __groupindex(ctx), idx)
-        return I in __ndrange(ctx)
-    else
-        return true
-    end
-end
-
-###
-# CPU implementation of shared memory
-###
-@inline function SharedMemory(::Type{T}, ::Val{Dims}, ::Val) where {T, Dims}
-    return MArray{__size(Dims), T}(undef)
-end
-
-###
-# CPU implementation of scratch memory
-# - memory allocated as a MArray with size `Dims`
-###
-
-struct ScratchArray{N, D}
-    data::D
-    ScratchArray{N}(data::D) where {N, D} = new{N, D}(data)
-end
-
-@inline function Scratchpad(ctx, ::Type{T}, ::Val{Dims}) where {T, Dims}
-    return ScratchArray{length(Dims)}(MArray{__size((Dims..., prod(__groupsize(ctx)))), T}(undef))
-end
-
-# Base.view creates a boundscheck which captures A
-# https://github.com/JuliaLang/julia/issues/39308
-@inline function aview(A, I::Vararg{Any, N}) where {N}
-    J = Base.to_indices(A, I)
-    return Base.unsafe_view(Base._maybe_reshape_parent(A, Base.index_ndims(J...)), J...)
-end
-
-@inline function Base.getindex(A::ScratchArray{N}, idx) where {N}
-    return @inbounds aview(A.data, ntuple(_ -> :, Val(N))..., idx)
-end
-
-# Argument conversion
-argconvert(k::Kernel{CPU}, arg) = arg
-
-supports_enzyme(::CPU) = true
diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl
index 8e7fcc083..e733cae37 100644
--- a/src/pocl/backend.jl
+++ b/src/pocl/backend.jl
@@ -21,16 +21,16 @@ end
 
 ## Memory Operations
 
-KA.allocate(::POCLBackend, ::Type{T}, dims::Tuple) where {T} = Array{T}(undef, dims)
+KA.allocate(::POCLBackend, ::Type{T}, dims::Tuple; unified::Bool = false) where {T} = Array{T}(undef, dims)
 
-function KA.zeros(backend::POCLBackend, ::Type{T}, dims::Tuple) where {T}
-    arr = KA.allocate(backend, T, dims)
+function KA.zeros(backend::POCLBackend, ::Type{T}, dims::Tuple; kwargs...) where {T}
+    arr = KA.allocate(backend, T, dims; kwargs...)
     kernel = KA.init_kernel(backend)
     kernel(arr, zero, T, ndrange = length(arr))
     return arr
 end
-function KA.ones(backend::POCLBackend, ::Type{T}, dims::Tuple) where {T}
-    arr = KA.allocate(backend, T, dims)
+function KA.ones(backend::POCLBackend, ::Type{T}, dims::Tuple; kwargs...) where {T}
+    arr = KA.allocate(backend, T, dims; kwargs...)
     kernel = KA.init_kernel(backend)
     kernel(arr, one, T; ndrange = length(arr))
     return arr
@@ -58,6 +58,7 @@ KA.pagelock!(::POCLBackend, x) = nothing
 KA.get_backend(::Array) = POCLBackend()
 KA.synchronize(::POCLBackend) = nothing
 KA.supports_float64(::POCLBackend) = true
+KA.supports_unified(::POCLBackend) = true
 
 ## Kernel Launch
diff --git a/test/test.jl b/test/test.jl
index 53126e881..4f7f9c7af 100644
--- a/test/test.jl
+++ b/test/test.jl
@@ -77,6 +77,15 @@ function unittest_testsuite(Backend, backend_str, backend_mod, BackendArrayT; sk
     backendT = typeof(backend).name.wrapper # To look through CUDABackend{true, false}
     @test backend isa backendT
 
+    unified = KernelAbstractions.supports_unified(backend)
+    @test unified isa Bool
+    U = allocate(backend, Float32, 5; unified)
+    if unified
+        @test U[3] isa Float32
+    else
+        @test_throws ErrorException U[3]
+    end
+
     x = allocate(backend, Float32, 5)
     A = allocate(backend, Float32, 5, 5)
     @test @inferred(KernelAbstractions.get_backend(A)) isa backendT
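
A short usage sketch of the API this patch adds, for reviewers; it is illustrative and not part of the diff. The host array is just a convenient way to obtain a backend object here, and most backends still report `supports_unified(backend) == false` unless they opt in the way `POCLBackend` does above.

```julia
using KernelAbstractions
const KA = KernelAbstractions

# Host arrays map to the package's CPU backend; substitute a device backend
# (e.g. CUDABackend()) where one is available.
backend = KA.get_backend(rand(Float32, 1))

# Feature-detect unified memory instead of catching the ArgumentError.
if KA.supports_unified(backend)
    A = KA.allocate(backend, Float32, 64, 64; unified = true)
else
    A = KA.allocate(backend, Float32, 64, 64)  # ordinary device allocation
end
fill!(A, 1.0f0)
# Host-side scalar access is only guaranteed for unified allocations,
# which is exactly what the test added above exercises.
KA.supports_unified(backend) && @assert A[1, 1] == 1.0f0
```

Note the `unified::Union{Nothing, Bool} = nothing` default on the generic `allocate` fallback: it distinguishes "keyword not passed" (which preserves the old `MethodError` for backends that implement nothing) from an explicit `unified = true` request against a backend that has not opted in (which raises the more descriptive `ArgumentError`).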
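And a minimal sketch of what the docstring's "should implement" note asks of backend authors, mirroring the `POCLBackend` methods in this patch. `MyToyBackend` is hypothetical; a real device backend would return a genuinely unified buffer type here rather than a host `Array`.

```julia
import KernelAbstractions as KA

# Hypothetical host-only backend: host Arrays are trivially host-accessible,
# so both code paths can return a plain Array (as the POCLBackend methods do).
struct MyToyBackend <: KA.Backend end

KA.supports_unified(::MyToyBackend) = true
function KA.allocate(::MyToyBackend, ::Type{T}, dims::Tuple; unified::Bool = false) where {T}
    return Array{T}(undef, dims)
end

# Callers can now pass the keyword without hitting the generic ArgumentError:
B = KA.allocate(MyToyBackend(), Float32, (4, 4); unified = true)
```

Defining both `supports_unified` and the keyword-accepting `allocate` keeps the pair consistent: advertising support while leaving the generic `allocate` fallback in place would turn every `unified = true` request into an `ArgumentError`.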