TriangularSolve.jl for ldiv! (#28)

chriselrod · web-flow · commit 1143e93120c8 · 2021-07-31T23:38:57.000-04:00
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -8,10 +8,14 @@ on:
       - master
 jobs:
   test:
+    name: Julia ${{ matrix.julia-version }} - ${{ matrix.os }} - ${{ matrix.threads }} thread(s) - ${{ github.event_name }}
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
         julia-version: ['1', '^1.7.0-0']
+        threads:
+          - '1'
+          - '3'
         os: [ubuntu-latest, windows-latest, macOS-latest]
     steps:
       - uses: actions/checkout@v2
@@ -30,6 +34,8 @@ jobs:
             ${{ runner.os }}-
       - uses: julia-actions/julia-buildpkg@v1
       - uses: julia-actions/julia-runtest@v1
+        env:
+          JULIA_NUM_THREADS: ${{ matrix.threads }}
       - uses: julia-actions/julia-processcoverage@v1
       - uses: codecov/codecov-action@v1
         with:
diff --git a/Project.toml b/Project.toml
@@ -1,14 +1,20 @@
 name = "RecursiveFactorization"
 uuid = "f2c3362d-daeb-58d1-803e-2bc74f2840b4"
 authors = ["Yingbo Ma <mayingbo5@gmail.com>"]
-version = "0.1.13"
+version = "0.2.0"
 
 [deps]
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
+Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588"
+StrideArraysCore = "7792a7ef-975c-4747-a70f-980b88e8d1da"
+TriangularSolve = "d5829a12-d9aa-46ab-831f-fb7c9ab06edf"
 
 [compat]
 LoopVectorization = "0.10,0.11, 0.12"
+Polyester = "0.3.2"
+StrideArraysCore = "0.1.13"
+TriangularSolve = "0.1.1"
 julia = "1.5"
 
 [extras]
diff --git a/perf/lu.jl b/perf/lu.jl
@@ -1,6 +1,7 @@
 using BenchmarkTools, Random
-using LinearAlgebra, RecursiveFactorization
-
+using LinearAlgebra, RecursiveFactorization, VectorizationBase
+nc = min(Int(VectorizationBase.num_cores()), Threads.nthreads())
+BLAS.set_num_threads(nc)
 BenchmarkTools.DEFAULT_PARAMETERS.seconds = 0.5
 
 function luflop(m, n=m; innerflop=2)
@@ -43,7 +44,12 @@ for n in ns
 end
 
 using DataFrames, VegaLite
-blaslib = BLAS.vendor() === :mkl ? :MKL : :OpenBLAS
+blaslib = if VERSION ≥ v"1.7.0-beta2"
+  config = BLAS.get_config().loaded_libs # every backend currently loaded, not just the first one
+  any(lib -> occursin("mkl_rt", lib.libname), config) ? :MKL : :OpenBLAS # "mkl_rt" also matches names without the "lib" prefix
+else
+  BLAS.vendor() === :mkl ? :MKL : :OpenBLAS
+end
 df = DataFrame(Size = ns,
                Reference = ref_mflops)
 setproperty!(df, blaslib, bas_mflops)
@@ -60,7 +66,7 @@ plt = df |> @vlplot(
                     x = {:Size}, y = {:GFLOPS},
                     width = 1000, height = 600
                    )
-save(joinpath(homedir(), "Pictures", "lu_float64.png"), plt)
+save(joinpath(homedir(), "Pictures", "lu_float64_$(VERSION)_$(Sys.CPU_NAME)_$(nc)cores_$blaslib.png"), plt)
 
 #=
 using Plot
diff --git a/src/lu.jl b/src/lu.jl
@@ -1,5 +1,8 @@
 using LoopVectorization
-using LinearAlgebra: BlasInt, BlasFloat, LU, UnitLowerTriangular, ldiv!, checknonsingular, BLAS, LinearAlgebra
+using TriangularSolve: ldiv!
+using LinearAlgebra: BlasInt, BlasFloat, LU, UnitLowerTriangular, checknonsingular, BLAS, LinearAlgebra, Adjoint, Transpose
+using StrideArraysCore
+using Polyester: @batch
 
 # 1.7 compat
 normalize_pivot(t::Val{T}) where T = t
@@ -26,43 +29,40 @@ function lu!(A, pivot = Val(true); check=true, kwargs...)
     return F
 end
 
+for lufun in (:lu, :lu!), (wrap, W) in ((:adjoint, :Adjoint), (:transpose, :Transpose))
+  @eval $lufun(A::$W, args...; kwargs...) = $wrap($lufun(parent(A), args...; kwargs...)) # factor the parent, then re-wrap the result
+end
+
 const RECURSION_THRESHOLD = Ref(-1)
 
 # AVX512 needs a smaller recursion limit
 function pick_threshold()
     RECURSION_THRESHOLD[] >= 0 && return RECURSION_THRESHOLD[]
-    blasvendor = @static if VERSION >= v"1.7.0-DEV.610"
-        :openblas64
-    else
-        BLAS.vendor()
-    end
-    if blasvendor === :openblas || blasvendor === :openblas64
-        LoopVectorization.register_size() == 64 ? 110 : 72
-    else
-        LoopVectorization.register_size() == 64 ? 48 : 72
-    end
+    LoopVectorization.register_size() == 64 ? 48 : 40 # default when RECURSION_THRESHOLD[] is negative: 48 if register_size() is 64, else 40
 end
 
+recurse(::Any) = false # generic fallback: any input not matched by the method below
+recurse(::StridedArray) = true # strided arrays are eligible for the recursive kernel
+
 function lu!(
     A::AbstractMatrix{T}, ipiv::AbstractVector{<:Integer},
     pivot = Val(true);
     check::Bool=true,
-    # the performance is not sensitive wrt blocksize, and 16 is a good default
-    blocksize::Integer=16,
+    # performance is not very sensitive to blocksize: use 8 for large inputs (length(A) ≥ 40_000), 16 otherwise
+    blocksize::Integer=length(A) ≥ 40_000 ? 8 : 16,
     threshold::Integer=pick_threshold()
 ) where T
     pivot = normalize_pivot(pivot)
     info = zero(BlasInt)
     m, n = size(A)
     mnmin = min(m, n)
-    if A isa StridedArray && mnmin > threshold
-        info = reckernel!(A, pivot, m, mnmin, ipiv, info, blocksize)
-        if m < n # fat matrix
-            # [AL AR]
-            AL = @view A[:, 1:m]
-            AR = @view A[:, m+1:n]
-            apply_permutation!(ipiv, AR)
-            ldiv!(UnitLowerTriangular(AL), AR)
+    if recurse(A) && mnmin > threshold
+        if T <: Union{Float32,Float64}
+            GC.@preserve ipiv A begin
+                info = recurse!(PtrArray(A), pivot, m, n, mnmin, PtrArray(ipiv), info, blocksize)
+            end
+        else
+            info = recurse!(A, pivot, m, n, mnmin, ipiv, info, blocksize)
         end
     else # generic fallback
         info = _generic_lufact!(A, pivot, ipiv, info)
@@ -71,13 +71,41 @@ function lu!(
     LU{T, typeof(A)}(A, ipiv, info)
 end
 
-function nsplit(::Type{T}, n) where T
+@inline function recurse!(A, pivot::Val{Pivot}, m, n, mnmin, ipiv, info, blocksize) where {Pivot}
+  # thread only when A's size in bytes exceeds 92% of `cache_size(Val(1))`
+  thread = length(A) * _sizeof(eltype(A)) > 0.92 * LoopVectorization.VectorizationBase.cache_size(Val(1))
+  info = reckernel!(A, pivot, m, mnmin, ipiv, info, blocksize, thread)
+  m < n || return info # square or tall input: nothing is left to do
+  @inbounds begin # fat matrix [AL AR]: permute AR, then AR <- AL \ AR
+    AR = @view A[:, m+1:n]
+    apply_permutation!(ipiv, AR, thread)
+    ldiv!(UnitLowerTriangular(@view A[:, 1:m]), AR)
+  end
+  info
+end
+
+@inline function nsplit(::Type{T}, n) where T
     k = 512 ÷ (isbitstype(T) ? sizeof(T) : 8)
     k_2 = k ÷ 2
     return n >= k ? ((n + k_2) ÷ k) * k_2 : n ÷ 2
 end
 
-Base.@propagate_inbounds function apply_permutation!(P, A)
+function apply_permutation_threaded!(P, A) # in-place row swaps i <-> P[i] within every column of A; columns are split across @batch
+    batchsize = cld(2000, length(P)) # minimum columns per batch, so each batch covers at least ~2000 swaps
+    @batch minbatch=batchsize for j in axes(A, 2)
+        @inbounds for i in axes(P, 1) # must stay sequential: a later swap can read a row an earlier swap wrote, so `@simd ivdep` is invalid here
+            i′ = P[i]
+            tmp = A[i, j]
+            A[i, j] = A[i′, j]
+            A[i′, j] = tmp
+        end
+    end
+    nothing
+end
+_sizeof(::Type{T}) where {T} = sizeof(Base.isbitstype(T) ? T : Int) # non-isbits element types are counted as sizeof(Int) bytes
+Base.@propagate_inbounds function apply_permutation!(P, A, thread)
+  thread && return apply_permutation_threaded!(P, A)
+    # length(A) * _sizeof(eltype(A)) > 0.92 * LoopVectorization.VectorizationBase.cache_size(Val(1)) && return apply_permutation_threaded!(P, A)
     for i in axes(P, 1)
         i′ = P[i]
         i′ == i && continue
@@ -90,10 +118,10 @@ Base.@propagate_inbounds function apply_permutation!(P, A)
     nothing
 end
 
-function reckernel!(A::AbstractMatrix{T}, pivot::Val{Pivot}, m, n, ipiv, info, blocksize)::BlasInt where {T,Pivot}
+function reckernel!(A::AbstractMatrix{T}, pivot::Val{Pivot}, m, n, ipiv, info, blocksize, thread)::BlasInt where {T,Pivot}
     @inbounds begin
         if n <= max(blocksize, 1)
-            info = _generic_lufact!(A, pivot, ipiv, info)
+            info = _generic_lufact!(A, Val(Pivot), ipiv, info)
             return info
         end
         n1 = nsplit(T, n)
@@ -128,11 +156,11 @@ function reckernel!(A::AbstractMatrix{T}, pivot::Val{Pivot}, m, n, ipiv, info, b
         #   [ A11 ]   [ L11 ]
         # P [     ] = [     ] U11
         #   [ A21 ]   [ L21 ]
-        info = reckernel!(AL, pivot, m, n1, P1, info, blocksize)
+        info = reckernel!(AL, Val(Pivot), m, n1, P1, info, blocksize, thread)
         # [ A12 ]    [ P1 ] [ A12 ]
         # [     ] <- [    ] [     ]
         # [ A22 ]    [ 0  ] [ A22 ]
-        Pivot && apply_permutation!(P1, AR)
+        Pivot && apply_permutation!(P1, AR, thread)
         # A12 = L11 U12  =>  U12 = L11 \ A12
         ldiv!(UnitLowerTriangular(A11), A12)
         # Schur complement:
@@ -143,9 +171,9 @@ function reckernel!(A::AbstractMatrix{T}, pivot::Val{Pivot}, m, n, ipiv, info, b
         # record info
         previnfo = info
         # P2 A22 = L22 U22
-        info = reckernel!(A22, pivot, m2, n2, P2, info, blocksize)
+        info = reckernel!(A22, Val(Pivot), m2, n2, P2, info, blocksize, thread)
         # A21 <- P2 A21
-        Pivot && apply_permutation!(P2, A21)
+        Pivot && apply_permutation!(P2, A21, thread)
 
         info != previnfo && (info += n1)
         @avx for i in 1:n2
@@ -156,7 +184,7 @@ function reckernel!(A::AbstractMatrix{T}, pivot::Val{Pivot}, m, n, ipiv, info, b
 end
 
 function schur_complement!(𝐂, 𝐀, 𝐁)
-    @avx for m ∈ 1:size(𝐀,1), n ∈ 1:size(𝐁,2)
+    @tturbo for m ∈ 1:size(𝐀,1), n ∈ 1:size(𝐁,2)
         𝐂ₘₙ = zero(eltype(𝐂))
         for k ∈ 1:size(𝐀,2)
             𝐂ₘₙ -= 𝐀[m,k] * 𝐁[k,n]
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -1,7 +1,7 @@
 using  Test
 import RecursiveFactorization
 import LinearAlgebra
-using  LinearAlgebra: norm
+using  LinearAlgebra: norm, Adjoint
 using  Random
 
 Random.seed!(12)
@@ -11,9 +11,10 @@ const mylu = RecursiveFactorization.lu
 
 function testlu(A, MF, BF)
     @test MF.info == BF.info
-    @test norm(MF.L*MF.U - A[MF.p, :], Inf) < 100sqrt(eps(real(one(float(first(A))))))
+    @test norm(MF.L*MF.U - A[MF.p, :], Inf) < length(A)*sqrt(eps(real(one(float(first(A))))))/16 # tolerance scales with the number of entries of A
     nothing
 end
+testlu(A::Adjoint, MF::Adjoint, BF) = testlu(parent(A), parent(MF), BF) # adjoint inputs: unwrap both and check the parent's factorization
 
 @testset "Test LU factorization" begin
     for _p in (true, false), T in (Float64, Float32, ComplexF64, ComplexF32, Real)
@@ -32,6 +33,9 @@ end
             MF = mylu(A, p)
             BF = baselu(A, p)
             testlu(A, MF, BF)
+            A′ = permutedims(A)
+            MF′ = mylu(A′', p)
+            testlu(A′', MF′, BF)
             i = rand(1:s) # test `MF.info`
             A[:, i] .= 0
             MF = mylu(A, p, check=false)