Remove mapreduce implementation, expect users to provide mapreducedim! impl.

maleadt · maleadt · commit 9443b0fe31f7 · 2020-02-24T14:23:27.000+01:00
diff --git a/src/host/mapreduce.jl b/src/host/mapreduce.jl
@@ -1,180 +1,62 @@
 # map-reduce
 
-Base.any(A::AbstractGPUArray{Bool}) = mapreduce(identity, |, A; init = false)
-Base.all(A::AbstractGPUArray{Bool}) = mapreduce(identity, &, A; init = true)
-
-Base.any(f::Function, A::AbstractGPUArray) = mapreduce(f, |, A; init = false)
-Base.all(f::Function, A::AbstractGPUArray) = mapreduce(f, &, A; init = true)
-Base.count(pred::Function, A::AbstractGPUArray) = Int(mapreduce(pred, +, A; init = 0))
-
-Base.:(==)(A::AbstractGPUArray, B::AbstractGPUArray) = Bool(mapreduce(==, &, A, B; init = true))
-
-LinearAlgebra.ishermitian(A::AbstractGPUMatrix) = acc_mapreduce(==, &, true, A, adjoint(A))
-
-# hack to get around of fetching the first element of the AbstractGPUArray
-# as a startvalue, which is a bit complicated with the current reduce implementation
-_initerror(f) = error("Please supply a neutral element for $f. E.g: mapreduce(f, $f, A; init = 1)")
-startvalue(f, T) = _initerror(f)
-for op = (+, Base.add_sum, *, Base.mul_prod, max, min)
-    @eval startvalue(::typeof($op), ::Type{Any}) = _initerror($op)
-end
-
-startvalue(::typeof(+), T) = zero(T)
-startvalue(::typeof(Base.add_sum), T) = zero(T)
-startvalue(::typeof(*), T) = one(T)
-startvalue(::typeof(Base.mul_prod), T) = one(T)
-
-startvalue(::typeof(max), T) = typemin(T)
-startvalue(::typeof(min), T) = typemax(T)
-
-# TODO mirror base
-
-if Int === Int32
-const SmallSigned = Union{Int8,Int16}
-const SmallUnsigned = Union{UInt8,UInt16}
-else
-const SmallSigned = Union{Int8,Int16,Int32}
-const SmallUnsigned = Union{UInt8,UInt16,Int}
-end
-
-const CommonReduceResult = Union{UInt64,UInt128,Int64,Int128,Float16,Float32,Float64}
-const WidenReduceResult = Union{SmallSigned, SmallUnsigned}
-
-
-# TODO widen and support Int64 and use Base.r_promote_type
-gpu_promote_type(op, ::Type{T}) where {T} = T
-gpu_promote_type(op, ::Type{T}) where {T<: WidenReduceResult} = T
-gpu_promote_type(::typeof(+), ::Type{T}) where {T<: WidenReduceResult} = T
-gpu_promote_type(::typeof(*), ::Type{T}) where {T<: WidenReduceResult} = T
-gpu_promote_type(::typeof(Base.add_sum), ::Type{T}) where {T<:WidenReduceResult} = typeof(Base.add_sum(zero(T), zero(T)))
-gpu_promote_type(::typeof(Base.mul_prod), ::Type{T}) where {T<:WidenReduceResult} = typeof(Base.mul_prod(one(T), one(T)))
-gpu_promote_type(::typeof(+), ::Type{T}) where {T<:Number} = typeof(zero(T)+zero(T))
-gpu_promote_type(::typeof(*), ::Type{T}) where {T<:Number} = typeof(one(T)*one(T))
-gpu_promote_type(::typeof(Base.add_sum), ::Type{T}) where {T<:Number} = typeof(Base.add_sum(zero(T), zero(T)))
-gpu_promote_type(::typeof(Base.mul_prod), ::Type{T}) where {T<:Number} = typeof(Base.mul_prod(one(T), one(T)))
-gpu_promote_type(::typeof(max), ::Type{T}) where {T<: WidenReduceResult} = T
-gpu_promote_type(::typeof(min), ::Type{T}) where {T<: WidenReduceResult} = T
-gpu_promote_type(::typeof(abs), ::Type{Complex{T}}) where {T} = T
-gpu_promote_type(::typeof(abs2), ::Type{Complex{T}}) where {T} = T
-
-import Base.Broadcast: Broadcasted
-const GPUSrcArray = Union{Broadcasted{<:AbstractGPUArrayStyle}, <:AbstractGPUArray}
-
-function Base.mapreduce(f::Function, op::Function, A::GPUSrcArray; dims = :, init...)
-    mapreduce_impl(f, op, init.data, A, dims)
-end
-
-function mapreduce_impl(f, op, ::NamedTuple{()}, A::GPUSrcArray, ::Colon)
-    OT = gpu_promote_type(op, gpu_promote_type(f, eltype(A)))
-    v0 = startvalue(op, OT) # TODO do this better
-    acc_mapreduce(f, op, v0, A)
-end
-
-function mapreduce_impl(f, op, nt::NamedTuple{(:init,)}, A::GPUSrcArray, ::Colon)
-    acc_mapreduce(f, op, nt.init, A)
-end
-
-function mapreduce_impl(f, op, nt, A::GPUSrcArray, dims)
-    Base._mapreduce_dim(f, op, nt, A, dims)
-end
+# GPUArrays' mapreduce methods build on `Base.mapreducedim!`, but take an additional
+# `init` argument to avoid eager initialization of `R` (when set to something).
+mapreducedim!(f, op, R::AbstractGPUArray, A::AbstractArray, init=nothing) = error("Not implemented") # COV_EXCL_LINE
+Base.mapreducedim!(f, op, R::AbstractGPUArray, A::AbstractArray) = mapreducedim!(f, op, R, A)
+
+neutral_element(op, T) = # fallback: no identity element is known for `op`
+    error("""GPUArrays.jl needs to know the neutral element for your operator `$op`.
+             Please pass it as an explicit `init` argument (if possible), or register it
+             globally for your operator by defining `GPUArrays.neutral_element(::typeof($op), T)`.""")
+neutral_element(::typeof(Base.:(|)), T) = zero(T)       # x | 0 == x
+neutral_element(::typeof(Base.:(+)), T) = zero(T)
+neutral_element(::typeof(Base.add_sum), T) = zero(T)
+neutral_element(::typeof(Base.:(&)), T) = ~zero(T)      # all bits set (`true` for Bool); `one(T)` would mask integers
+neutral_element(::typeof(Base.:(*)), T) = one(T)
+neutral_element(::typeof(Base.mul_prod), T) = one(T)
+neutral_element(::typeof(Base.min), T) = typemax(T)     # min(x, typemax(T)) == x
+neutral_element(::typeof(Base.max), T) = typemin(T)     # max(x, typemin(T)) == x
+
+function Base.mapreduce(f, op, A::AbstractGPUArray; dims=:, init=nothing)
+    # figure out the destination container type by looking at the initializer element,
+    # or by relying on inference to reason through the map and reduce functions.
+    if init === nothing
+        ET = Base.promote_op(f, eltype(A))
+        ET = Base.promote_op(op, ET, ET)
+        (ET === Union{} || ET === Any) &&
+            error("mapreduce cannot figure the output element type, please pass an explicit init value")
+
+        init = neutral_element(op, ET)
+    else
+        ET = typeof(init)
+    end
 
-function acc_mapreduce end
-function Base.mapreduce(f, op, A::GPUSrcArray, B::GPUSrcArray, C::Number; init)
-    acc_mapreduce(f, op, init, A, B, C)
-end
-function Base.mapreduce(f, op, A::GPUSrcArray, B::GPUSrcArray; init)
-    acc_mapreduce(f, op, init, A, B)
-end
+    sz = size(A)
+    red = ntuple(i->(dims==Colon() || i in dims) ? 1 : sz[i], ndims(A))
+    R = similar(A, ET, red)
+    mapreducedim!(f, op, R, A, init)
 
-@generated function mapreducedim_kernel(ctx::AbstractKernelContext, f, op, R, A, range::NTuple{N, Any}) where N
-    types = (range.parameters...,)
-    indices = ntuple(i-> Symbol("I_$i"), N)
-    Iexpr = ntuple(i-> :(I[$i]), N)
-    body = :(@inbounds R[$(Iexpr...)] = op(R[$(Iexpr...)], f(A[$(indices...)])))
-    for i = N:-1:1
-        idxsym = indices[i]
-        if types[i] == Nothing
-            body = quote
-                $idxsym = I[$i]
-                $body
-            end
-        else
-            rsym = Symbol("r_$i")
-            body = quote
-                $(rsym) = range[$i]
-                for $idxsym in Int(first($rsym)):Int(last($rsym))
-                    $body
-                end
-            end
-        end
-        body
-    end
-    quote
-        I = @cartesianidx R ctx
-        $body
-        return
+    if dims==Colon()
+        @allowscalar R[]
+    else
+        R
     end
 end
 
-function Base._mapreducedim!(f, op, R::AbstractGPUArray, A::GPUSrcArray)
-    range = ifelse.(length.(axes(R)) .== 1, axes(A), nothing)
-    gpu_call(mapreducedim_kernel, f, op, R, A, range; target=R)
-    return R
-end
-
-@inline simple_broadcast_index(A::AbstractArray, i...) = @inbounds A[i...]
-@inline simple_broadcast_index(x, i...) = x
+Base.any(A::AbstractGPUArray{Bool}) = mapreduce(identity, |, A)
+Base.all(A::AbstractGPUArray{Bool}) = mapreduce(identity, &, A)
 
-for i = 0:10
-    args = ntuple(x-> Symbol("arg_", x), i)
-    fargs = ntuple(x-> :(simple_broadcast_index($(args[x]), cartesian_global_index...)), i)
-    @eval begin
-        # http://developer.amd.com/resources/articles-whitepapers/opencl-optimization-case-study-simple-reductions/
-        function reduce_kernel(ctx::AbstractKernelContext, f, op, v0::T, A, ::Val{LMEM}, result, $(args...)) where {T, LMEM}
-            tmp_local = @LocalMemory(ctx, T, LMEM)
-            global_index = linear_index(ctx)
-            acc = v0
-            # # Loop sequentially over chunks of input vector
-            @inbounds while global_index <= length(A)
-                cartesian_global_index = Tuple(CartesianIndices(axes(A))[global_index])
-                @inbounds element = f(A[cartesian_global_index...], $(fargs...))
-                acc = op(acc, element)
-                global_index += global_size(ctx)
-            end
-            # Perform parallel reduction
-            local_index = threadidx(ctx) - 1
-            @inbounds tmp_local[local_index + 1] = acc
-            synchronize_threads(ctx)
+Base.any(f::Function, A::AbstractGPUArray) = mapreduce(f, |, A)
+Base.all(f::Function, A::AbstractGPUArray) = mapreduce(f, &, A)
+Base.count(pred::Function, A::AbstractGPUArray) = mapreduce(pred, +, A; init = 0)
 
-            offset = blockdim(ctx) ÷ 2
-            @inbounds while offset > 0
-                if (local_index < offset)
-                    other = tmp_local[local_index + offset + 1]
-                    mine = tmp_local[local_index + 1]
-                    tmp_local[local_index + 1] = op(mine, other)
-                end
-                synchronize_threads(ctx)
-                offset = offset ÷ 2
-            end
-            if local_index == 0
-                @inbounds result[blockidx(ctx)] = tmp_local[1]
-            end
-            return
-        end
-    end
+Base.:(==)(A::AbstractGPUArray, B::AbstractGPUArray) = Bool(mapreduce(==, &, A, B))
 
-end
+# avoid calling into `initarray!`
+Base.sum!(R::AbstractGPUArray, A::AbstractGPUArray) = Base.reducedim!(Base.add_sum, R, A)
+Base.prod!(R::AbstractGPUArray, A::AbstractGPUArray) = Base.reducedim!(Base.mul_prod, R, A)
+Base.maximum!(R::AbstractGPUArray, A::AbstractGPUArray) = Base.reducedim!(max, R, A)
+Base.minimum!(R::AbstractGPUArray, A::AbstractGPUArray) = Base.reducedim!(min, R, A)
 
-function acc_mapreduce(f, op, v0::OT, A::GPUSrcArray, rest...) where {OT}
-    blocks = 80
-    threads = 256
-    if length(A) <= blocks * threads
-        args = zip(convert_to_cpu(A), convert_to_cpu.(rest)...)
-        return mapreduce(x-> f(x...), op, args, init = v0)
-    end
-    out = similar(A, OT, (blocks,))
-    fill!(out, v0)
-    gpu_call(reduce_kernel, f, op, v0, A, Val{threads}(), out, rest...;
-             target=out, threads=threads, blocks=blocks)
-    reduce(op, Array(out))
-end
+LinearAlgebra.ishermitian(A::AbstractGPUMatrix) = mapreduce(==, &, A, adjoint(A))
diff --git a/src/reference.jl b/src/reference.jl
@@ -284,4 +284,11 @@ Adapt.adapt_storage(::Adaptor, x::JLArray{T,N}) where {T,N} =
 GPUArrays.unsafe_reinterpret(::Type{T}, A::JLArray, size::Tuple) where T =
     reshape(reinterpret(T, A.data), size)
 
+function GPUArrays.mapreducedim!(f, op, R::JLArray, A::AbstractArray, init=nothing)
+    if init !== nothing
+        fill!(R, init)
+    end
+    @allowscalar Base.mapreducedim!(f, op, R.data, A)
+end
+
 end
diff --git a/test/testsuite/broadcasting.jl b/test/testsuite/broadcasting.jl
@@ -56,9 +56,9 @@ function broadcasting(AT)
             @testset "Adjoint and Transpose" begin
                 A = AT(rand(ET, N))
                 A' .= ET(2)
-                @test all(x->x==ET(2), A)
+                @test all(isequal(ET(2)'), A)
                 transpose(A) .= ET(1)
-                @test all(x->x==ET(1), A)
+                @test all(isequal(ET(1)), A)
             end
 
             ############
diff --git a/test/testsuite/mapreduce.jl b/test/testsuite/mapreduce.jl
@@ -1,4 +1,118 @@
 function test_mapreduce(AT)
+    @testset "mapreducedim! $ET" for ET in supported_eltypes() begin
+        T = AT{ET}
+        range = ET <: Real ? (ET(1):ET(10)) : ET
+        # each pair maps an input size to the size of the preallocated reduction target `R`
+        for (sz,red) in [(10,)=>(1,), (10,10)=>(1,1), (10,10,10)=>(1,1,1), (10,10,10)=>(10,10,10),
+                         (10,10,10)=>(1,10,10), (10,10,10)=>(10,1,10), (10,10,10)=>(10,10,1)]
+            @test compare((A,R)->Base.mapreducedim!(identity, +, R, A), AT, rand(range, sz), zeros(ET, red))
+            @test compare((A,R)->Base.mapreducedim!(identity, *, R, A), AT, rand(range, sz), ones(ET, red))
+            @test compare((A,R)->Base.mapreducedim!(x->x+x, +, R, A), AT, rand(range, sz), zeros(ET, red))
+        end
+    end
+    end
+
+    @testset "reducedim! $ET" for ET in supported_eltypes() begin
+        T = AT{ET}
+        range = ET <: Real ? (ET(1):ET(10)) : ET
+        for (sz,red) in [(10,)=>(1,), (10,10)=>(1,1), (10,10,10)=>(1,1,1), (10,10,10)=>(10,10,10),
+                         (10,10,10)=>(1,10,10), (10,10,10)=>(10,1,10), (10,10,10)=>(10,10,1)]
+            @test compare((A,R)->Base.reducedim!(+, R, A), AT, rand(range, sz), zeros(ET, red))
+            @test compare((A,R)->Base.reducedim!(*, R, A), AT, rand(range, sz), ones(ET, red))
+        end
+    end
+    end
+
+    @testset "mapreduce $ET" for ET in supported_eltypes() begin
+        T = AT{ET}
+        range = ET <: Real ? (ET(1):ET(10)) : ET
+        for (sz,dims) in [(10,)=>[1], (10,10)=>[1,2], (10,10,10)=>[1,2,3], (10,10,10)=>[],
+                          (10,)=>:, (10,10)=>:, (10,10,10)=>:,
+                          (10,10,10)=>[1], (10,10,10)=>[2], (10,10,10)=>[3]]
+            @test compare(A->mapreduce(identity, +, A; dims=dims, init=zero(ET)), AT, rand(range, sz))
+            @test compare(A->mapreduce(identity, *, A; dims=dims, init=one(ET)), AT, rand(range, sz))
+            @test compare(A->mapreduce(x->x+x, +, A; dims=dims, init=zero(ET)), AT, rand(range, sz))
+        end
+    end
+    end
+
+    @testset "reduce $ET" for ET in supported_eltypes() begin
+        T = AT{ET}
+        range = ET <: Real ? (ET(1):ET(10)) : ET
+        for (sz,dims) in [(10,)=>[1], (10,10)=>[1,2], (10,10,10)=>[1,2,3], (10,10,10)=>[],
+                          (10,)=>:, (10,10)=>:, (10,10,10)=>:,
+                          (10,10,10)=>[1], (10,10,10)=>[2], (10,10,10)=>[3]]
+            @test compare(A->reduce(+, A; dims=dims, init=zero(ET)), AT, rand(range, sz))
+            @test compare(A->reduce(*, A; dims=dims, init=one(ET)), AT, rand(range, sz))
+        end
+    end
+    end
+
+    @testset "sum prod minimum maximum $ET" for ET in supported_eltypes() begin
+        T = AT{ET}
+        range = ET <: Real ? (ET(1):ET(10)) : ET
+        for (sz,dims) in [(10,)=>[1], (10,10)=>[1,2], (10,10,10)=>[1,2,3], (10,10,10)=>[],
+                          (10,)=>:, (10,10)=>:, (10,10,10)=>:,
+                          (10,10,10)=>[1], (10,10,10)=>[2], (10,10,10)=>[3]]
+            @test compare(A->sum(A), AT, rand(range, sz))
+            @test compare(A->sum(abs, A), AT, rand(range, sz))
+            @test compare(A->sum(A; dims=dims), AT, rand(range, sz))
+            @test compare(A->prod(A), AT, rand(range, sz))
+            @test compare(A->prod(abs, A), AT, rand(range, sz))
+            @test compare(A->prod(A; dims=dims), AT, rand(range, sz))
+            if !(ET <: Complex)
+                @test compare(A->minimum(A), AT, rand(range, sz))
+                @test compare(A->minimum(x->x*x, A), AT, rand(range, sz))
+                @test compare(A->minimum(A; dims=dims), AT, rand(range, sz))
+                @test compare(A->maximum(A), AT, rand(range, sz))
+                @test compare(A->maximum(x->x*x, A), AT, rand(range, sz))
+                @test compare(A->maximum(A; dims=dims), AT, rand(range, sz))
+            end
+        end
+        OT = isbitstype(widen(ET)) ? widen(ET) : ET
+        for (sz,red) in [(10,)=>(1,), (10,10)=>(1,1), (10,10,10)=>(1,1,1), (10,10,10)=>(10,10,10),
+                         (10,10,10)=>(1,10,10), (10,10,10)=>(10,1,10), (10,10,10)=>(10,10,1)]
+            if !(ET <: Complex)
+                @test compare((A,R)->minimum!(R, A), AT, rand(range, sz), fill(typemax(ET), red))
+                @test compare((A,R)->maximum!(R, A), AT, rand(range, sz), fill(typemin(ET), red))
+            end
+        end
+        # smaller-scale test to avoid very large values and roundoff issues
+        for (sz,red) in [(2,)=>(1,), (2,2)=>(1,1), (2,2,2)=>(1,1,1), (2,2,2)=>(2,2,2),
+                         (2,2,2)=>(1,2,2), (2,2,2)=>(2,1,2), (2,2,2)=>(2,2,1)]
+            @test compare((A,R)->sum!(R, A), AT, rand(range, sz), zeros(OT, red))
+            @test compare((A,R)->prod!(R, A), AT, rand(range, sz), ones(OT, red))
+        end
+    end
+    end
+
+    @testset "any all count ==" begin
+        for Ac in ([false, false], [false, true], [true, true],
+                   [false false; false false], [false true; false false],
+                   [true true; false false], [true true; true true])
+            @test compare(A->any(A), AT, Ac)
+            @test compare(A->all(A), AT, Ac)
+        end
+        for Ac in ([1, 1], [1, 2], [2, 2],
+                   [1 1; 1 1], [1 2; 1 1],
+                   [2 2; 1 1], [2 2; 2 2])
+            @test compare(A->any(iseven, A), AT, Ac)
+            @test compare(A->all(iseven, A), AT, Ac)
+            @test compare(A->count(iseven, A), AT, Ac)
+
+            A = AT(Ac)
+            @test A == copy(A)
+            @test A !== copy(A)
+            @test A == deepcopy(A)
+            @test A !== deepcopy(A)
+
+            B = similar(A)
+            @allowscalar B[1] = 3
+            @test A != B
+        end
+    end
+
+    # old tests: can be removed, but left in here for a while to ensure the new impl works
     @testset "mapreduce" begin
         for ET in supported_eltypes()
             T = AT{ET}
@@ -36,14 +150,6 @@ function test_mapreduce(AT)
                         ET <: Complex || @test compare(minimum, AT,rand(range, dims))
                     end
                 end
-
-                @testset "broadcasted arrays" begin
-                    for dims in ((4048,), (1024,1024), (77,), (1923,209))
-                        @test compare(x->mapreduce(z -> z + one(z), +,
-                                                   Broadcast.Broadcasted(+, (x, x));
-                                                   init = zero(ET)), AT, rand(range, dims))
-                    end
-                end
             end
         end
         @testset "any all ==" begin