Commit 08a3dfa

Use avx micro
1 parent ead6809 commit 08a3dfa

4 files changed, +70 -37 lines changed

Project.toml

Lines changed: 2 additions & 0 deletions

@@ -5,8 +5,10 @@ version = "0.1.0"
 
 [deps]
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
 
 [compat]
+LoopVectorization = "0.7"
 julia = "1"
 
 [extras]
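
For orientation, the new LoopVectorization dependency provides the @avx macro, which rewrites an annotated loop nest into SIMD code. A minimal, self-contained sketch (the dot_avx name and the array sizes are illustrative, not part of this package):

    using LoopVectorization: @avx

    # Illustrative helper: a dot product whose loop is vectorized by @avx.
    function dot_avx(x::AbstractVector, y::AbstractVector)
        s = zero(eltype(x))
        @avx for i in 1:length(x)
            s += x[i] * y[i]  # reduction recognized and vectorized by @avx
        end
        return s
    end

    dot_avx(rand(128), rand(128))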

perf/lu.jl

Lines changed: 23 additions & 6 deletions

@@ -1,25 +1,42 @@
 using BenchmarkTools
 import LinearAlgebra, RecursiveFactorization
 
-BenchmarkTools.DEFAULT_PARAMETERS.seconds = 0.5
+BenchmarkTools.DEFAULT_PARAMETERS.seconds = 0.08
 
 luflop(m, n) = n^3÷3 - n÷3 + m*n^2
 luflop(n) = luflop(n, n)
 
 bas_mflops = Float64[]
-rec_mflops = Float64[]
-ns = 50:50:800
+rec8_mflops = Float64[]
+rec16_mflops = Float64[]
+rec32_mflops = Float64[]
+ref_mflops = Float64[]
+ns = 4:32:500
 for n in ns
+    @info "$n × $n"
     A = rand(n, n)
     bt = @belapsed LinearAlgebra.lu!($(copy(A)))
-    rt = @belapsed RecursiveFactorization.lu!($(copy(A)))
     push!(bas_mflops, luflop(n)/bt/1e9)
-    push!(rec_mflops, luflop(n)/rt/1e9)
+
+    rt8 = @belapsed RecursiveFactorization.lu!($(copy(A)); blocksize=8)
+    push!(rec8_mflops, luflop(n)/rt8/1e9)
+
+    rt16 = @belapsed RecursiveFactorization.lu!($(copy(A)); blocksize=16)
+    push!(rec16_mflops, luflop(n)/rt16/1e9)
+
+    rt32 = @belapsed RecursiveFactorization.lu!($(copy(A)); blocksize=32)
+    push!(rec32_mflops, luflop(n)/rt32/1e9)
+
+    ref = @belapsed LinearAlgebra.generic_lufact!($(copy(A)))
+    push!(ref_mflops, luflop(n)/ref/1e9)
 end
 
 using Plots
 plt = plot(ns, bas_mflops, legend=:bottomright, lab="OpenBLAS", title="LU Factorization Benchmark", marker=:auto, dpi=150)
-plot!(plt, ns, rec_mflops, lab="RecursiveFactorization", marker=:auto)
+plot!(plt, ns, rec8_mflops, lab="RF8", marker=:auto)
+plot!(plt, ns, rec16_mflops, lab="RF16", marker=:auto)
+plot!(plt, ns, rec32_mflops, lab="RF32", marker=:auto)
+plot!(plt, ns, ref_mflops, lab="Reference", marker=:auto)
 xaxis!(plt, "size (N x N)")
 yaxis!(plt, "GFLOPS")
 savefig("lubench.png")
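
For reference, the script turns each timing into a rate as luflop(n) / time / 1e9, which is what the GFLOPS y-axis shows. A small worked example (the elapsed time below is illustrative):

    luflop(m, n) = n^3÷3 - n÷3 + m*n^2   # flop count for LU of an m×n matrix
    luflop(n) = luflop(n, n)

    t = 2.5e-3                            # illustrative elapsed seconds for n = 500
    gflops = luflop(500) / t / 1e9        # ≈ 66.7 GFLOPS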

src/lu.jl

Lines changed: 28 additions & 17 deletions

@@ -1,23 +1,21 @@
-using LinearAlgebra: BlasInt, BlasFloat, LU, UnitLowerTriangular, ldiv!, BLAS, checknonsingular
+using LoopVectorization: @avx
+using LinearAlgebra: BlasInt, BlasFloat, LU, UnitLowerTriangular, ldiv!, mul!, checknonsingular
 
-function lu(A::AbstractMatrix, pivot::Union{Val{false}, Val{true}} = Val(true);
-            check::Bool = true, blocksize::Integer = 16)
-    lu!(copy(A), pivot; check = check, blocksize = blocksize)
+function lu(A::AbstractMatrix, pivot::Union{Val{false}, Val{true}} = Val(true); kwargs...)
+    lu!(copy(A), pivot; kwargs...)
 end
 
-function lu!(A, pivot::Union{Val{false}, Val{true}} = Val(true);
-             check::Bool = true, blocksize::Integer = 16)
-    lu!(A, Vector{BlasInt}(undef, min(size(A)...)), pivot;
-        check = check, blocksize = blocksize)
+function lu!(A, pivot::Union{Val{false}, Val{true}} = Val(true); kwargs...)
+    lu!(A, Vector{BlasInt}(undef, min(size(A)...)), pivot; kwargs...)
 end
 
 function lu!(A::AbstractMatrix{T}, ipiv::AbstractVector{<:Integer},
              pivot::Union{Val{false}, Val{true}} = Val(true);
-             check::Bool=true, blocksize::Integer=16) where T
+             check::Bool=true, blocksize::Integer=16, threshold::Integer=192) where T
     info = Ref(zero(BlasInt))
     m, n = size(A)
     mnmin = min(m, n)
-    if T <: BlasFloat && A isa StridedArray
+    if A isa StridedArray && mnmin > threshold
         reckernel!(A, pivot, m, mnmin, ipiv, info, blocksize)
         if m < n # fat matrix
             # [AL AR]

@@ -34,7 +32,7 @@ function lu!(A::AbstractMatrix{T}, ipiv::AbstractVector{<:Integer},
 end
 
 function nsplit(::Type{T}, n) where T
-    k = 128 ÷ sizeof(T)
+    k = 512 ÷ (isbitstype(T) ? sizeof(T) : 8)
     k_2 = k ÷ 2
     return n >= k ? ((n + k_2) ÷ k) * k_2 : n ÷ 2
 end

@@ -44,7 +42,9 @@ Base.@propagate_inbounds function apply_permutation!(P, A)
         i′ = P[i]
         i′ == i && continue
         @simd for j in axes(A, 2)
-            A[i, j], A[i′, j] = A[i′, j], A[i, j]
+            tmp = A[i, j]
+            A[i, j] = A[i′, j]
+            A[i′, j] = tmp
         end
     end
     nothing

@@ -98,7 +98,8 @@ function reckernel!(A::AbstractMatrix{T}, pivot::Val{Pivot}, m, n, ipiv, info, b
         # Schur complement:
         # We have A22 = L21 U12 + A′22, hence
         # A′22 = A22 - L21 U12
-        BLAS.gemm!('N', 'N', -one(T), A21, A12, one(T), A22)
+        #mul!(A22, A21, A12, -one(T), one(T))
+        schur_complement!(A22, A21, A12)
         # record info
         previnfo = info[]
         # P2 A22 = L22 U22

@@ -107,13 +108,23 @@ function reckernel!(A::AbstractMatrix{T}, pivot::Val{Pivot}, m, n, ipiv, info, b
         Pivot && apply_permutation!(P2, A21)
 
         info[] != previnfo && (info[] += n1)
-        @simd for i in 1:n2
+        @avx for i in 1:n2
             P2[i] += n1
         end
         return nothing
     end # inbounds
 end
 
+function schur_complement!(𝐂, 𝐀, 𝐁)
+    @avx for m ∈ 1:size(𝐀,1), n ∈ 1:size(𝐁,2)
+        𝐂ₘₙ = zero(eltype(𝐂))
+        for k ∈ 1:size(𝐀,2)
+            𝐂ₘₙ -= 𝐀[m,k] * 𝐁[k,n]
+        end
+        𝐂[m,n] = 𝐂ₘₙ + 𝐂[m,n]
+    end
+end
+
 #=
 Modified from https://github.com/JuliaLang/julia/blob/b56a9f07948255dfbe804eef25bdbada06ec2a57/stdlib/LinearAlgebra/src/lu.jl
 License is MIT: https://julialang.org/license

@@ -147,15 +158,15 @@ function _generic_lufact!(A, ::Val{Pivot}, ipiv, info) where Pivot
             end
             # Scale first column
             Akkinv = inv(A[k,k])
-            @simd for i = k+1:m
+            @avx for i = k+1:m
                 A[i,k] *= Akkinv
             end
         elseif info[] == 0
             info[] = k
         end
         # Update the rest
-        for j = k+1:n
-            @simd for i = k+1:m
+        @avx for j = k+1:n
+            for i = k+1:m
                 A[i,j] -= A[i,k]*A[k,j]
             end
         end
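
The new schur_complement! kernel updates C in place as C ← C − A·B, the same update the replaced BLAS.gemm! call (and the commented-out five-argument mul!) performed. A quick equivalence check, assuming the unexported helper is reachable as RecursiveFactorization.schur_complement! (matrix sizes are illustrative):

    using LinearAlgebra: mul!
    import RecursiveFactorization

    A21 = rand(6, 4); A12 = rand(4, 5)
    C1 = rand(6, 5); C2 = copy(C1)

    RecursiveFactorization.schur_complement!(C1, A21, A12)  # C1 .-= A21 * A12
    mul!(C2, A21, A12, -1.0, 1.0)                           # same update via 5-arg mul!
    maximum(abs, C1 .- C2)                                   # expected ≈ 0 up to roundoff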

test/runtests.jl

Lines changed: 17 additions & 14 deletions

@@ -11,24 +11,27 @@ const mylu = RecursiveFactorization.lu
 
 function testlu(A, MF, BF)
     @test MF.info == BF.info
-    @test norm(MF.L*MF.U - A[MF.p, :], Inf) < sqrt(eps(real(first(A))))
+    @test norm(MF.L*MF.U - A[MF.p, :], Inf) < 100sqrt(eps(real(one(float(first(A))))))
     nothing
 end
 
 @testset "Test LU factorization" begin
-    for p in (Val(true), Val(false)), T in (Float64, Float32, ComplexF64, ComplexF32, Real)
-        siz = (50, 100)
-        if isconcretetype(T)
-            A = rand(T, siz...)
-        else
-            _A = rand(50, 100)
-            A = Matrix{T}(undef, siz...)
-            copyto!(A, _A)
-        end
-        MF = mylu(A, p)
-        BF = baselu(A, p)
-        testlu(A, MF, BF)
-        for i in 50:7:100 # test `MF.info`
+    for _p in (true, false), T in (Float64, Float32, ComplexF64, ComplexF32, Real)
+        p = Val(_p)
+        for s in [1:10; 50:80:200; 300]
+            siz = (s, s+2)
+            @info("size: $(siz[1]) × $(siz[2]), T = $T, p = $_p")
+            if isconcretetype(T)
+                A = rand(T, siz...)
+            else
+                _A = rand(siz...)
+                A = Matrix{T}(undef, siz...)
+                copyto!(A, _A)
+            end
+            MF = mylu(A, p)
+            BF = baselu(A, p)
+            testlu(A, MF, BF)
+            i = rand(1:s) # test `MF.info`
             A[:, i] .= 0
             MF = mylu(A, p, check=false)
             BF = baselu(A, p, check=false)