Merge pull request #235 from JuliaGPU/tb/linalg

maleadt · web-flow · commit 4aa430ca05b2 · 2020-01-28T11:21:08.000+01:00
Port some CuArrays linalg kernels.
diff --git a/src/device/indexing.jl b/src/device/indexing.jl
@@ -36,7 +36,7 @@ So it can be used like this:
 
     ```julia
     function kernel(ctx::AbstractKernelContext, A)
-        idx = @linear_index A ctx
+        idx = @linearidx A ctx
         # from here on it's save to index into A with idx
         @inbounds begin
             A[idx] = ...
@@ -56,12 +56,12 @@ end
 """
     cartesianidx(A, ctxsym = :ctx)
 
-Like [`@linearidx(A, ctxsym = :ctx)`](@ref), but returns an N-dimensional `NTuple{ndim(A), Int}` as index
+Like [`@linearidx(A, ctxsym = :ctx)`](@ref), but returns an N-dimensional `CartesianIndex`.
 """
 macro cartesianidx(A, ctxsym = :ctx)
     quote
         x = $(esc(A))
-        i2 = @linearidx(x, $(esc(ctxsym)))
-        gpu_ind2sub(x, i2)
+        i = @linearidx(x, $(esc(ctxsym)))
+        CartesianIndices(x)[i]
     end
 end
diff --git a/src/host/abstractarray.jl b/src/host/abstractarray.jl
@@ -139,14 +139,12 @@ end
 Base.copyto!(dest::AbstractGPUArray, src::AbstractGPUArray) =
     copyto!(dest, CartesianIndices(dest), src, CartesianIndices(src))
 
-function copy_kernel!(ctx::AbstractKernelContext, dest, dest_offsets, src, src_offsets, shape, shape_dest, shape_source, length)
+function copy_kernel!(ctx::AbstractKernelContext, dest, dest_offsets, src, src_offsets, shape, length)
     i = linear_index(ctx)
     if i <= length
         # TODO can this be done faster and smarter?
-        idx = gpu_ind2sub(shape, i)
-        dest_idx = gpu_sub2ind(shape_dest, idx .+ dest_offsets)
-        src_idx = gpu_sub2ind(shape_source, idx .+ src_offsets)
-        @inbounds dest[dest_idx] = src[src_idx]
+        idx = CartesianIndices(shape)[i]
+        @inbounds dest[idx + dest_offsets] = src[idx + src_offsets]
     end
     return
 end
@@ -159,10 +157,10 @@ function Base.copyto!(dest::AbstractGPUArray{T, N}, destcrange::CartesianIndices
     end
     len = length(destcrange)
 
-    dest_offsets = first.(destcrange.indices) .- 1
-    src_offsets = first.(srccrange.indices) .- 1
+    dest_offsets = first(destcrange) - one(CartesianIndex{N})
+    src_offsets = first(srccrange) - one(CartesianIndex{N})
     gpu_call(copy_kernel!,
-             dest, dest_offsets, src, src_offsets, shape, size(dest), size(src), len;
+             dest, dest_offsets, src, src_offsets, shape, len;
              total_threads=len)
     dest
 end
diff --git a/src/host/base.jl b/src/host/base.jl
@@ -19,40 +19,6 @@ Base.map!(f, y::AbstractGPUArray, x::AbstractGPUArray) =
 Base.map!(f, y::AbstractGPUArray, x1::AbstractGPUArray, x2::AbstractGPUArray) =
     invoke(map!, Tuple{Any,AbstractGPUArray, Vararg{AbstractGPUArray}}, f, y, x1, x2)
 
-
-# Base functions that are sadly not fit for the the GPU yet (they only work for Int64)
-Base.@pure @inline function gpu_ind2sub(A::AbstractArray, ind::T) where T
-    _ind2sub(size(A), ind - T(1))
-end
-Base.@pure @inline function gpu_ind2sub(dims::NTuple{N}, ind::T) where {N, T}
-    _ind2sub(NTuple{N, T}(dims), ind - T(1))
-end
-Base.@pure @inline _ind2sub(::Tuple{}, ind::T) where {T} = (ind + T(1),)
-Base.@pure @inline function _ind2sub(indslast::NTuple{1}, ind::T) where T
-    ((ind + T(1)),)
-end
-Base.@pure @inline function _ind2sub(inds, ind::T) where T
-    r1 = inds[1]
-    indnext = div(ind, r1)
-    f = T(1); l = r1
-    (ind-l*indnext+f, _ind2sub(Base.tail(inds), indnext)...)
-end
-
-Base.@pure function gpu_sub2ind(dims::NTuple{N}, I::NTuple{N2, T}) where {N, N2, T}
-    Base.@_inline_meta
-    _sub2ind(NTuple{N, T}(dims), T(1), T(1), I...)
-end
-_sub2ind(x, L, ind) = ind
-function _sub2ind(::Tuple{}, L, ind, i::T, I::T...) where T
-    Base.@_inline_meta
-    ind + (i - T(1)) * L
-end
-function _sub2ind(inds, L, ind, i::IT, I::IT...) where IT
-    Base.@_inline_meta
-    r1 = inds[1]
-    _sub2ind(Base.tail(inds), L * r1, ind + (i - IT(1)) * L, I...)
-end
-
 # This is pretty ugly, but I feel bad to add those to device arrays, since
 # we're never bound checking... So getindex(a::AbstractGPUVector, 10, 10) would silently go unnoticed
 # we need this here for easier implementation of repeat
diff --git a/src/host/indexing.jl b/src/host/indexing.jl
@@ -84,9 +84,8 @@ to_index(a, x::Base.LogicalIndex) = error("Logical indexing not implemented")
 @generated function index_kernel(ctx::AbstractKernelContext, dest::AbstractArray, src::AbstractArray, idims, Is)
     N = length(Is.parameters)
     quote
-        i = linear_index(ctx)
-        i > length(dest) && return
-        is = gpu_ind2sub(idims, i)
+        i = @linearidx dest
+        is = CartesianIndices(idims)[i]
         @nexprs $N i -> @inbounds I_i = Is[i][is[i]]
         @inbounds dest[i] = @ncall $N getindex src i -> I_i
         return
@@ -112,7 +111,7 @@ end
     quote
         i = linear_index(ctx)
         i > len && return
-        is = gpu_ind2sub(idims, i)
+        is = CartesianIndices(idims)[i]
         @inbounds setindex!(dest, bgetindex(src, i), $(idx...))
         return
     end
diff --git a/src/host/linalg.jl b/src/host/linalg.jl
@@ -9,15 +9,15 @@ function LinearAlgebra.transpose!(At::AbstractGPUArray{T, 2}, A::AbstractGPUArra
     At
 end
 
-function genperm(I::NTuple{N}, perm::NTuple{N}) where N
-    ntuple(d-> (@inbounds return I[perm[d]]), Val(N))
+function genperm(I::CartesianIndex{N}, perm::NTuple{N}) where N
+    CartesianIndex(ntuple(d-> (@inbounds return I[perm[d]]), Val(N)))
 end
 
 function LinearAlgebra.permutedims!(dest::AbstractGPUArray, src::AbstractGPUArray, perm) where N
     perm isa Tuple || (perm = Tuple(perm))
     gpu_call(dest, src, perm) do ctx, dest, src, perm
         I = @cartesianidx src ctx
-        @inbounds dest[genperm(I, perm)...] = src[I...]
+        @inbounds dest[genperm(I, perm)] = src[I]
         return
     end
     return dest
@@ -39,3 +39,111 @@ end
 function Base.copyto!(A::AbstractGPUArray, B::Adjoint{T, <: AbstractGPUArray}) where T
     transpose!(A, B.parent)
 end
+
+function LinearAlgebra.tril!(A::AbstractGPUMatrix{T}, d::Integer = 0) where T
+  function kernel!(ctx, mat, diag)
+    pos = @cartesianidx mat
+    row, col = Tuple(pos)
+    if col - diag > row  # entry lies above the `diag`-th diagonal
+      mat[row, col] = 0
+    end
+    return nothing
+  end
+
+  gpu_call(kernel!, A, d)
+  return A
+end
+
+function LinearAlgebra.triu!(A::AbstractGPUMatrix{T}, d::Integer = 0) where T
+  function kernel!(ctx, mat, diag)
+    pos = @cartesianidx mat
+    row, col = Tuple(pos)
+    if row + diag > col  # entry lies below the `diag`-th diagonal
+      mat[row, col] = 0
+    end
+    return nothing
+  end
+
+  gpu_call(kernel!, A, d)
+  return A
+end
+
+function LinearAlgebra.copy_transpose!(dst::AbstractGPUArray, src::AbstractGPUArray)
+  function kernel(ctx, dst, src)
+    I = @cartesianidx dst
+    # `I` is a CartesianIndex: it cannot be splatted or reversed directly, so go via its tuple.
+    dst[I] = src[CartesianIndex(reverse(Tuple(I)))]
+    return
+  end
+  gpu_call(kernel, dst, src)
+  return dst
+end
+
+
+# matrix multiplication
+
+function generic_matmatmul!(C::AbstractVecOrMat{R}, A::AbstractVecOrMat{T}, B::AbstractVecOrMat{S}) where {T,S,R}
+    if size(A,2) != size(B,1)
+        throw(DimensionMismatch("matrix A has dimensions $(size(A)), matrix B has dimensions $(size(B))"))
+    end
+    if size(C,1) != size(A,1) || size(C,2) != size(B,2)
+        throw(DimensionMismatch("result C has dimensions $(size(C)), needs $((size(A,1),size(B,2)))"))
+    end
+    if isempty(A) || isempty(B)
+        return fill!(C, zero(R))
+    end
+    # Kernel body: each invocation computes the single output element C[i, j].
+    function kernel(ctx, C, A, B)
+        idx = @linearidx C
+        # The trailing `1` supplies `j` when C is a vector (its Cartesian index has one entry).
+        i, j = Tuple(CartesianIndices(C)[idx])..., 1
+        if i <= size(A,1) && j <= size(B,2)
+            z2 = zero(A[i, 1]*B[1, j] + A[i, 1]*B[1, j])
+            Ctmp = convert(promote_type(R, typeof(z2)), z2)
+            for k in 1:size(A,2)
+                Ctmp += A[i, k]*B[k, j]
+            end
+            C[i,j] = Ctmp
+        end
+
+        return
+    end
+
+    gpu_call(kernel, C, A, B)
+
+    C
+end
+
+LinearAlgebra.mul!(C::AbstractGPUVecOrMat, A::AbstractGPUVecOrMat, B::AbstractGPUVecOrMat) = generic_matmatmul!(C, A, B)  # this and every Adjoint/Transpose operand combination below forward to the same generic kernel
+LinearAlgebra.mul!(C::AbstractGPUVecOrMat, A::AbstractGPUVecOrMat, B::LinearAlgebra.Adjoint{<:Any, <:AbstractGPUVecOrMat}) = generic_matmatmul!(C, A, B)
+LinearAlgebra.mul!(C::AbstractGPUVecOrMat, A::AbstractGPUVecOrMat, B::LinearAlgebra.Transpose{<:Any, <:AbstractGPUVecOrMat}) = generic_matmatmul!(C, A, B)
+LinearAlgebra.mul!(C::AbstractGPUVecOrMat, A::LinearAlgebra.Adjoint{<:Any, <:AbstractGPUVecOrMat}, B::AbstractGPUVecOrMat) = generic_matmatmul!(C, A, B)
+LinearAlgebra.mul!(C::AbstractGPUVecOrMat, A::LinearAlgebra.Transpose{<:Any, <:AbstractGPUVecOrMat}, B::AbstractGPUVecOrMat) = generic_matmatmul!(C, A, B)
+LinearAlgebra.mul!(C::AbstractGPUVecOrMat, A::LinearAlgebra.Transpose{<:Any, <:AbstractGPUVecOrMat}, B::LinearAlgebra.Adjoint{<:Any, <:AbstractGPUVecOrMat}) = generic_matmatmul!(C, A, B)
+LinearAlgebra.mul!(C::AbstractGPUVecOrMat, A::LinearAlgebra.Adjoint{<:Any, <:AbstractGPUVecOrMat}, B::LinearAlgebra.Transpose{<:Any, <:AbstractGPUVecOrMat}) = generic_matmatmul!(C, A, B)
+LinearAlgebra.mul!(C::AbstractGPUVecOrMat, A::LinearAlgebra.Adjoint{<:Any, <:AbstractGPUVecOrMat}, B::LinearAlgebra.Adjoint{<:Any, <:AbstractGPUVecOrMat}) = generic_matmatmul!(C, A, B)
+LinearAlgebra.mul!(C::AbstractGPUVecOrMat, A::LinearAlgebra.Transpose{<:Any, <:AbstractGPUVecOrMat}, B::LinearAlgebra.Transpose{<:Any, <:AbstractGPUVecOrMat}) = generic_matmatmul!(C, A, B)
+
+function generic_rmul!(X::AbstractGPUArray, s::Number)
+    function kernel(ctx, arr, factor)
+        idx = @linearidx arr
+        @inbounds arr[idx] = arr[idx] * factor  # multiply from the right, in place
+        return
+    end
+    gpu_call(kernel, X, s)
+    X
+end
+
+LinearAlgebra.rmul!(A::AbstractGPUArray, b::Number) = generic_rmul!(A, b)
+
+function generic_lmul!(s::Number, X::AbstractGPUArray)
+    function kernel(ctx, arr, factor)
+        idx = @linearidx arr
+        @inbounds arr[idx] = factor * arr[idx]  # multiply from the left, in place
+        return
+    end
+    gpu_call(kernel, X, s)
+    X
+end
+
+LinearAlgebra.lmul!(a::Number, B::AbstractGPUArray) = generic_lmul!(a, B)
diff --git a/src/host/math.jl b/src/host/math.jl
@@ -3,7 +3,7 @@ import Base.clamp!
 function Base.clamp!(A::AbstractGPUArray, low, high)
     function kernel(state, A, low, high)
         I = @cartesianidx A state
-        A[I...] = clamp(A[I...], low, high)
+        A[I] = clamp(A[I], low, high)
         return
     end
     gpu_call(kernel, A, low, high)
diff --git a/src/reference.jl b/src/reference.jl
@@ -59,11 +59,11 @@ function GPUArrays.gpu_call(::JLBackend, f, args...; blocks::Int, threads::Int)
     ctx = JLKernelContext(threads, blocks)
     device_args = to_device.(Ref(ctx), args)
     tasks = Array{Task}(undef, threads)
-    for blockidx in 1:blocks
+    @allowscalar for blockidx in 1:blocks
         ctx.blockidx = blockidx
         for threadidx in 1:threads
             thread_ctx = JLKernelContext(ctx, threadidx)
-            tasks[threadidx] = @async @allowscalar f(thread_ctx, device_args...)
+            tasks[threadidx] = @async f(thread_ctx, device_args...)
             # TODO: require 1.3 and use Base.Threads.@spawn for actual multithreading
             #       (this would require a different synchronization mechanism)
         end
diff --git a/test/testsuite/base.jl b/test/testsuite/base.jl
@@ -1,7 +1,6 @@
 function cartesian_iter(state, res, A, Asize)
     for i in CartesianIndices(Asize)
-        idx = GPUArrays.gpu_sub2ind(Asize, i.I)
-        res[idx] = A[idx]
+        res[i] = A[i]
     end
     return
 end
diff --git a/test/testsuite/linalg.jl b/test/testsuite/linalg.jl
@@ -6,14 +6,14 @@ function test_linalg(AT)
             @test compare(transpose!, AT, Array{Float32}(undef, 32, 32), rand(Float32, 32, 32))
             @test compare(transpose!, AT, Array{Float32}(undef, 128, 32), rand(Float32, 32, 128))
         end
-        
+
         @testset "copyto! for triangular" begin
-            ga = Array{Float32}(undef, 128, 128) 
+            ga = Array{Float32}(undef, 128, 128)
             gb = AT{Float32}(undef, 128, 128)
             rand!(gb)
             copyto!(ga, UpperTriangular(gb))
             @test ga == Array(collect(UpperTriangular(gb)))
-            ga = Array{Float32}(undef, 128, 128) 
+            ga = Array{Float32}(undef, 128, 128)
             gb = AT{Float32}(undef, 128, 128)
             rand!(gb)
             copyto!(ga, LowerTriangular(gb))
@@ -49,5 +49,44 @@ function test_linalg(AT)
             B = A + D
             @test collect(B) ≈ collect(A) + collect(D)
         end
+
+        @testset "$f! with diagonal $d" for (f, f!) in ((triu, triu!), (tril, tril!)),
+                                            d in -2:2
+            A = randn(10, 10)
+            @test f(A, d) == Array(f!(AT(A), d))
+        end
+
+        @testset "matrix multiplication" begin
+            a = rand(Int8, 3, 3)
+            b = rand(Int8, 3, 3)
+            d_a = AT{Int8}(a)
+            d_b = AT{Int8}(b)
+            d_c = d_a*d_b
+            @test collect(d_c) == a*b
+            a = rand(Complex{Int8}, 3, 3)
+            b = rand(Complex{Int8}, 3, 3)
+            d_a = AT{Complex{Int8}}(a)
+            d_b = AT{Complex{Int8}}(b)
+            d_c = d_a'*d_b
+            @test collect(d_c) == a'*b
+            d_c = d_a*d_b'
+            @test collect(d_c) == a*b'
+            d_c = d_a'*d_b'
+            @test collect(d_c) == a'*b'
+            d_c = transpose(d_a)*d_b'
+            @test collect(d_c) == transpose(a)*b'
+            d_c = d_a'*transpose(d_b)
+            @test collect(d_c) == a'*transpose(b)
+            d_c = transpose(d_a)*d_b
+            @test collect(d_c) == transpose(a)*b
+            d_c = d_a*transpose(d_b)
+            @test collect(d_c) == a*transpose(b)
+            d_c = transpose(d_a)*transpose(d_b)
+            @test collect(d_c) == transpose(a)*transpose(b)
+            d_c = rmul!(copy(d_a), Complex{Int8}(2, 2))
+            @test collect(d_c) == a*Complex{Int8}(2, 2)
+            d_c = lmul!(Complex{Int8}(2, 2), copy(d_a))
+            @test collect(d_c) == Complex{Int8}(2, 2)*a
+        end
     end
 end