Skip to content

Commit ab7ac23

Browse files
authored
Merge pull request #207 from JuliaGPU/tb/axes
Remove hack and properly reimplement Broadcast._bcs1.
2 parents 5462b74 + 1d1416b commit ab7ac23

File tree

3 files changed

+33
-6
lines changed

3 files changed

+33
-6
lines changed

src/GPUArrays.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,4 +38,6 @@ include("array.jl")
3838

3939
include("testsuite.jl")
4040

41+
include("quirks.jl")
42+
4143
end # module

src/mapreduce.jl

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -131,15 +131,13 @@ for i = 0:10
131131
fargs = ntuple(x-> :(simple_broadcast_index($(args[x]), cartesian_global_index...)), i)
132132
@eval begin
133133
# http://developer.amd.com/resources/articles-whitepapers/opencl-optimization-case-study-simple-reductions/
134-
function reduce_kernel(state, f, op, v0::T, A, len, ax, ::Val{LMEM}, result, $(args...)) where {T, LMEM}
134+
function reduce_kernel(state, f, op, v0::T, A, ::Val{LMEM}, result, $(args...)) where {T, LMEM}
135135
tmp_local = @LocalMemory(state, T, LMEM)
136136
global_index = linear_index(state)
137137
acc = v0
138138
# # Loop sequentially over chunks of input vector
139-
# HACK: length(A) and axes(A) aren't GPU compatible, so pass them instead
140-
# https://github.com/JuliaGPU/CUDAnative.jl/issues/367
141-
@inbounds while global_index <= len
142-
cartesian_global_index = Tuple(CartesianIndices(ax)[global_index])
139+
@inbounds while global_index <= length(A)
140+
cartesian_global_index = Tuple(CartesianIndices(axes(A))[global_index])
143141
@inbounds element = f(A[cartesian_global_index...], $(fargs...))
144142
acc = op(acc, element)
145143
global_index += global_size(state)
@@ -184,7 +182,7 @@ function acc_mapreduce(f, op, v0::OT, A::GPUSrcArray, rest::Tuple) where {OT}
184182
end
185183
out = similar(A, OT, (blocksize,))
186184
fill!(out, v0)
187-
args = (f, op, v0, A, length(A), axes(A), Val{threads}(), out, rest...)
185+
args = (f, op, v0, A, Val{threads}(), out, rest...)
188186
gpu_call(reduce_kernel, out, args, ((blocksize,), (threads,)))
189187
reduce(op, Array(out))
190188
end

src/quirks.jl

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# revert JuliaLang/julia#32867; avoid string interpolation
2+
#
3+
# NOTE: without contextual dispatch, we can only redefine methods where a GPU-specific
4+
# type occurs in the signature (or we'll get a "fatal precompilation failure" error)
5+
6+
if VERSION >= v"1.3.0-alpha.107"
7+
_bcs1(a::Integer, b::Integer) = a == 1 ? b : (b == 1 ? a : (a == b ? a : throw(DimensionMismatch("arrays could not be broadcast to a common size"))))
8+
_bcs1(a::Integer, b) = a == 1 ? b : (first(b) == 1 && last(b) == a ? b : throw(DimensionMismatch("arrays could not be broadcast to a common size")))
9+
_bcs1(a, b::Integer) = _bcs1(b, a)
10+
_bcs1(a, b) = Broadcast._bcsm(b, a) ? Broadcast.axistype(b, a) : (Broadcast._bcsm(a, b) ? Broadcast.axistype(a, b) : throw(DimensionMismatch("arrays could not be broadcast to a common size")))
11+
12+
_bcs(::Tuple{}, ::Tuple{}) = ()
13+
_bcs(::Tuple{}, newshape::Tuple) = (newshape[1], _bcs((), Base.tail(newshape))...)
14+
_bcs(shape::Tuple, ::Tuple{}) = (shape[1], _bcs(Base.tail(shape), ())...)
15+
function _bcs(shape::Tuple, newshape::Tuple)
16+
return (_bcs1(shape[1], newshape[1]), _bcs(Base.tail(shape), Base.tail(newshape))...)
17+
end
18+
19+
broadcast_shape(shape::Tuple) = shape
20+
broadcast_shape(shape::Tuple, shape1::Tuple, shapes::Tuple...) = broadcast_shape(_bcs(shape, shape1), shapes...)
21+
22+
@inline combine_axes(A, B...) = broadcast_shape(axes(A), combine_axes(B...))
23+
combine_axes(A) = axes(A)
24+
25+
Broadcast._axes(::Broadcasted{ArrayStyle{AT}}, axes::Tuple) where {AT <: GPUArray} = axes
26+
@inline Broadcast._axes(bc::Broadcasted{ArrayStyle{AT}}, ::Nothing) where {AT <: GPUArray} = combine_axes(bc.args...)
27+
end

0 commit comments

Comments (0)