implementation for batch-wise matrix multiplication

Roger-luo · chengchingwen · commit 4a8251bebdad · 2019-03-27T22:27:23.000+08:00
diff --git a/src/NNlib.jl b/src/NNlib.jl
@@ -3,13 +3,14 @@ module NNlib
 using Requires, Libdl
 
 export σ, sigmoid, relu, leakyrelu, elu, gelu, swish, selu, softplus, softsign, logσ, logsigmoid,
-  softmax, logsoftmax, maxpool, meanpool
+  softmax, logsoftmax, maxpool, meanpool, batchedmul
 
 include("numeric.jl")
 include("activation.jl")
 include("softmax.jl")
 include("logsoftmax.jl")
 include("linalg.jl")
+include("batchedmul.jl")
 include("conv.jl")
 include("cubroadcast.jl")
 
diff --git a/src/batchedmul.jl b/src/batchedmul.jl
@@ -0,0 +1,36 @@
+# batch-wise matrix multiplication
+# wrapper for batched_gemm!
+
+"""
+    batchedmul(a, b; transA = false, transB = false)
+
+Multiply the matrices `a[:, :, i]` and `b[:, :, i]` for every index `i` along the
+third dimension and return the products stacked in a freshly allocated 3-d array
+obtained from `similar(a, ...)`.
+
+# Keywords
+- `transA`: when `true`, the result takes its row count from `size(a, 2)` and the
+  flag is forwarded to `batched_mul!` so each slice of `a` is used transposed.
+- `transB`: when `true`, the result takes its column count from `size(b, 1)` and
+  the flag is forwarded to `batched_mul!` so each slice of `b` is used transposed.
+
+# Throws
+- `ErrorException("batch size mismatch")` when `size(a, 3) != size(b, 3)`.
+"""
+function batchedmul(a::AbstractArray{T, 3}, b::AbstractArray{T, 3};
+                    transA::Bool = false, transB::Bool = false) where T
+    nbatch = size(a, 3)
+    nbatch == size(b, 3) || error("batch size mismatch")
+
+    # Output slice shape is (rows of op(a)) x (columns of op(b)).
+    nrows = transA ? size(a, 2) : size(a, 1)
+    ncols = transB ? size(b, 1) : size(b, 2)
+
+    out = similar(a, nrows, ncols, nbatch)
+    batched_mul!(out, a, b; transA=transA, transB=transB)
+    return out
+end
+
+"""
+    batched_mul!(C, A, B; transA = false, transB = false)
+
+In-place batched matrix product: forward to `batched_gemm!` with `alpha = one(T)`
+and `beta = zero(T)`, writing the result into `C`, and return `C`.
+
+`transA` / `transB` are translated to the character flags `'T'` (when `true`) or
+`'N'` (when `false`) that `batched_gemm!` expects.
+"""
+function batched_mul!(C::AbstractArray{T, 3}, A::AbstractArray{T, 3}, B::AbstractArray{T, 3};
+                      transA::Bool = false, transB::Bool = false) where T
+    opA, opB = (transA ? 'T' : 'N'), (transB ? 'T' : 'N')
+    # beta = 0, so the previous contents of C do not contribute to the result.
+    batched_gemm!(opA, opB, one(T), A, B, zero(T), C)
+    return C
+end
+
+"""
+    ∇batchedmul(Δ, a, b; transA = false, transB = false)
+
+Gradient function for `batchedmul`. Return the tuple `(∇a, ∇b)` computed from the
+incoming array `Δ`; `transA` / `transB` must be the flags that were used in the
+forward `batchedmul(a, b; transA, transB)` call.
+
+Writing `X'` for the per-slice transpose, the returned pair is:
+
+| `transA` | `transB` | `∇a`      | `∇b`      |
+|:---------|:---------|:----------|:----------|
+| `false`  | `false`  | `Δ * b'`  | `a' * Δ`  |
+| `false`  | `true`   | `Δ * b`   | `Δ' * a`  |
+| `true`   | `false`  | `b * Δ'`  | `a * Δ`   |
+| `true`   | `true`   | `b' * Δ'` | `Δ' * a'` |
+"""
+function ∇batchedmul(Δ::AbstractArray{T, 3}, a::AbstractArray{T, 3}, b::AbstractArray{T, 3};
+                     transA::Bool = false, transB::Bool = false) where T
+    if transA && transB
+        ∇a = batchedmul(b, Δ; transA=true, transB=true)
+        ∇b = batchedmul(Δ, a; transA=true, transB=true)
+    elseif transA
+        ∇a = batchedmul(b, Δ; transB=true)
+        ∇b = batchedmul(a, Δ)
+    elseif transB
+        ∇a = batchedmul(Δ, b)
+        ∇b = batchedmul(Δ, a; transA=true)
+    else
+        ∇a = batchedmul(Δ, b; transB=true)
+        ∇b = batchedmul(a, Δ; transA=true)
+    end
+    return (∇a, ∇b)
+end
diff --git a/src/linalg.jl b/src/linalg.jl
@@ -26,5 +26,58 @@ for (gemm, elty) in ((:dgemm_,:Float64), (:sgemm_,:Float32))
                   transA, transB, M, N, K,
                   alpha, A, lda, B, ldb, beta, C, ldc)
         end
+
+        ## borrowed from BatchedRoutines.jl
+        # batched gemm for 3d-array
+        # C[:,:,i] := alpha*op(A[:,:,i])*op(B[:,:,i]) + beta*C[:,:,i], where:
+        # i is the specific batch number,
+        # op(X) is one of op(X) = X, or op(X) = XT, or op(X) = XH,
+        # alpha and beta are scalars,
+        # A, B and C are 3d Array:
+        # op(A) is an m-by-k-by-b 3d Array,
+        # op(B) is a k-by-n-by-b 3d Array,
+        # C is an m-by-n-by-b 3d Array.
+        function batched_gemm!(transA::AbstractChar,
+                               transB::AbstractChar,
+                               alpha::($elty),
+                               A::AbstractArray{$elty, 3},
+                               B::AbstractArray{$elty, 3},
+                               beta::($elty),
+                               C::AbstractArray{$elty, 3})
+            # Performs one BLAS gemm call per index along the third dimension,
+            # updating C in place, and returns C.
+            # transA / transB: 'N' means the slice is used as stored; for any other
+            # character the shape checks below swap the slice's two dimensions.
+            @assert !LinearAlgebra.BLAS.has_offset_axes(A, B, C)
+            @assert size(A, 3) == size(B, 3) == size(C, 3) "batch size mismatch"
+            m = size(A, transA == 'N' ? 1 : 2)
+            ka = size(A, transA == 'N' ? 2 : 1)
+            kb = size(B, transB == 'N' ? 1 : 2)
+            n = size(B, transB == 'N' ? 2 : 1)
+            if ka != kb || m != size(C,1) || n != size(C,2)
+                throw(DimensionMismatch("A has size ($m,$ka), B has size ($kb,$n), C has size $(size(C))"))
+            end
+            LinearAlgebra.BLAS.chkstride1(A)
+            LinearAlgebra.BLAS.chkstride1(B)
+            LinearAlgebra.BLAS.chkstride1(C)
+
+            # Leading dimensions handed to gemm; loop-invariant, so computed once.
+            lda = max(1, Base.stride(A, 2))
+            ldb = max(1, Base.stride(B, 2))
+            ldc = max(1, Base.stride(C, 2))
+
+            # Byte distance between consecutive batch slices. This uses the array's
+            # own third stride rather than size(X,1)*size(X,2): the two agree for a
+            # densely packed array, but only the stride is right for strided inputs
+            # (e.g. views) whose slices are not stored back-to-back.
+            stepA = Base.stride(A, 3) * sizeof($elty)
+            stepB = Base.stride(B, 3) * sizeof($elty)
+            stepC = Base.stride(C, 3) * sizeof($elty)
+
+            # Keep the arrays rooted for as long as raw pointers into them are used.
+            GC.@preserve A B C begin
+                ptrA = Base.unsafe_convert(Ptr{$elty}, A)
+                ptrB = Base.unsafe_convert(Ptr{$elty}, B)
+                ptrC = Base.unsafe_convert(Ptr{$elty}, C)
+
+                for batch in 1:size(A, 3)
+                    ccall((LinearAlgebra.BLAS.@blasfunc($gemm), LinearAlgebra.BLAS.libblas), Cvoid,
+                        (Ref{UInt8}, Ref{UInt8}, Ref{LinearAlgebra.BLAS.BlasInt}, Ref{LinearAlgebra.BLAS.BlasInt},
+                         Ref{LinearAlgebra.BLAS.BlasInt}, Ref{$elty}, Ptr{$elty}, Ref{LinearAlgebra.BLAS.BlasInt},
+                         Ptr{$elty}, Ref{LinearAlgebra.BLAS.BlasInt}, Ref{$elty}, Ptr{$elty},
+                         Ref{LinearAlgebra.BLAS.BlasInt}),
+                         transA, transB, m, n,
+                         ka, alpha, ptrA, lda,
+                         ptrB, ldb, beta, ptrC,
+                         ldc)
+
+                    # Advance every pointer to the start of the next batch slice.
+                    ptrA += stepA
+                    ptrB += stepB
+                    ptrC += stepC
+                end
+            end
+
+            return C
+        end
     end
 end
diff --git a/test/batchedmul.jl b/test/batchedmul.jl
@@ -0,0 +1,22 @@
+# Reference implementation used by the tests: multiplies the slices `a[:,:,i]` and
+# `b[:,:,i]` one by one with `*` and concatenates the products along dimension 3.
+# `transA` / `transB` swap the first two dimensions of the corresponding input
+# (via `permutedims`) before multiplying.
+function bmm_test(a,b; transA = false, transB = false)
+    nbatch = size(a, 3)
+    if transA
+        a = permutedims(a, [2,1,3])
+    end
+    if transB
+        b = permutedims(b, [2,1,3])
+    end
+
+    products = [a[:, :, i] * b[:, :, i] for i in 1:nbatch]
+    return cat(products...; dims = 3)
+end
+
+@testset "Batched Matrix Multiplication" begin
+    A = randn(7,5,3)
+    B = randn(5,7,3)
+    C = randn(7,6,3)
+
+    # Compare with ≈ rather than ==: the reference path materialises the transpose
+    # with permutedims and multiplies with `*`, while batchedmul passes a transpose
+    # flag to gemm, so the two results need not agree bit-for-bit.
+    @test batchedmul(A, B) ≈ bmm_test(A, B)
+    @test batchedmul(A, B; transA = true, transB = true) ≈ bmm_test(A, B; transA = true, transB = true)
+    @test batchedmul(A, C; transA = true) ≈ bmm_test(A, C; transA = true)
+    @test batchedmul(A, A; transB = true) ≈ bmm_test(A, A; transB = true)
+
+    # Inputs whose third dimensions differ are rejected up front.
+    @test_throws ErrorException batchedmul(A, randn(5,7,2))
+end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -4,6 +4,7 @@ using NNlib, Test
 
 include("activation.jl")
 include("conv.jl")
+include("batchedmul.jl")
 
 xs = [-100_000, -100_000.]
 @test softmax(xs) ≈ [0.5, 0.5]