JuliaLang
diff --git a/src/LinearAlgebra.jl b/src/LinearAlgebra.jl
Lines changed: 20 additions & 13 deletions
diff --git a/src/abstractq.jl b/src/abstractq.jl
Lines changed: 1 addition & 1 deletion
diff --git a/src/bidiag.jl b/src/bidiag.jl
Lines changed: 48 additions & 38 deletions
diff --git a/src/blas.jl b/src/blas.jl
Lines changed: 4 additions & 13 deletions
diff --git a/src/bunchkaufman.jl b/src/bunchkaufman.jl
Lines changed: 4 additions & 8 deletions
@@ -13,7 +13,7 @@ import Base: USE_BLAS64, abs, acos, acosh, acot, acoth, acsc, acsch, adjoint, as
     copy, copyto!, copymutable, cos, cosh, cot, coth, csc, csch, eltype, exp, fill!, floor,
     getindex, hcat, getproperty, imag, inv, invpermuterows!, isapprox, isequal, isone, iszero,
     IndexStyle, kron, kron!, length, log, map, ndims, one, oneunit, parent, permutecols!,
-    permutedims, permuterows!, power_by_squaring, promote_rule, real, sec, sech, setindex!,
+    permutedims, permuterows!, power_by_squaring, promote_rule, real, isreal, sec, sech, setindex!,
     show, similar, sin, sincos, sinh, size, sqrt, strides, stride, tan, tanh, transpose, trunc,
     typed_hcat, vec, view, zero
 import Base: AbstractArray, AbstractMatrix, Array, Matrix
@@ -728,6 +728,8 @@ end
 (\)(F::TransposeFactorization{T,<:LU}, B::VecOrMat{Complex{T}}) where {T<:BlasReal} =
     ldiv(F, B)
 
+const default_peakflops_size = Int === Int32 ? 2048 : 4096
+
 """
     LinearAlgebra.peakflops(n::Integer=4096; eltype::DataType=Float64, ntrials::Integer=3, parallel::Bool=false)
 
@@ -752,10 +754,10 @@ of the problem that is solved on each processor.
     This function requires at least Julia 1.1. In Julia 1.0 it is available from
     the standard library `InteractiveUtils`.
 """
-function peakflops(n::Integer=4096; eltype::DataType=Float64, ntrials::Integer=3, parallel::Bool=false)
+function peakflops(n::Integer=default_peakflops_size; eltype::Type{ElType}=Float64, ntrials::Integer=3, parallel::Bool=false) where {ElType}
     t = zeros(Float64, ntrials)
     for i=1:ntrials
-        a = ones(eltype,n,n)
+        a = ones(ElType,n,n)
         t[i] = @elapsed a2 = a*a
         @assert a2[1,1] == n
     end
@@ -822,25 +824,30 @@ function versioninfo(io::IO=stdout)
     return nothing
 end
 
-function __init__()
-    try
-        verbose = parse(Bool, get(ENV, "LBT_VERBOSE", "false"))
-        BLAS.lbt_forward(OpenBLAS_jll.libopenblas_path; clear=true, verbose)
-        BLAS.check()
-    catch ex
-        Base.showerror_nostdio(ex, "WARNING: Error during initialization of module LinearAlgebra")
-    end
+function lbt_openblas_onload_callback()
+    # We don't use `BLAS.lbt_forward()` here because we don't want to take a lock on the config cache.
+    verbose = parse(Bool, get(ENV, "LBT_VERBOSE", "false"))
+    BLAS.lbt_forward_ccall(OpenBLAS_jll.libopenblas_path; clear=true, verbose)
+    BLAS.check()
+
     # register a hook to disable BLAS threading
     Base.at_disable_library_threading(() -> BLAS.set_num_threads(1))
 
     # https://github.com/xianyi/OpenBLAS/blob/c43ec53bdd00d9423fc609d7b7ecb35e7bf41b85/README.md#setting-the-number-of-threads-using-environment-variables
     if !haskey(ENV, "OPENBLAS_NUM_THREADS") && !haskey(ENV, "GOTO_NUM_THREADS") && !haskey(ENV, "OMP_NUM_THREADS")
         @static if Sys.isapple() && Base.BinaryPlatforms.arch(Base.BinaryPlatforms.HostPlatform()) == "aarch64"
-            BLAS.set_num_threads(max(1, @ccall(jl_effective_threads()::Cint)))
+            nthreads = max(1, @ccall(jl_effective_threads()::Cint))
         else
-            BLAS.set_num_threads(max(1, @ccall(jl_effective_threads()::Cint) ÷ 2))
+            nthreads = max(1, @ccall(jl_effective_threads()::Cint) ÷ 2)
         end
+        BLAS.lbt_set_num_threads(nthreads)
     end
 end
 
+function __init__()
+    # If users want to lazily load a different BLAS, they need to either change this call, or
+    # clear the data structures modified by this call and call it again with their own BLAS library.
+    libblastrampoline_jll.add_dependency!(OpenBLAS_jll, libopenblas, lbt_openblas_onload_callback)
+end
+
 end # module LinearAlgebra
@@ -9,6 +9,7 @@ end
 parent(adjQ::AdjointQ) = adjQ.Q
 eltype(::Type{<:AbstractQ{T}}) where {T} = T
 Base.eltypeof(Q::AbstractQ) = eltype(Q)
+Base.IteratorSize(::Type{<:AbstractQ}) = Base.HasShape{2}()
 ndims(::AbstractQ) = 2
 
 # inversion/adjoint/transpose
@@ -40,7 +41,6 @@ convert(::Type{AbstractQ{T}}, adjQ::AdjointQ{T}) where {T} = adjQ
 convert(::Type{AbstractQ{T}}, adjQ::AdjointQ) where {T} = convert(AbstractQ{T}, adjQ.Q)'
 
 # ... to matrix
-collect(Q::AbstractQ) = copyto!(Matrix{eltype(Q)}(undef, size(Q)), Q)
 Matrix{T}(Q::AbstractQ) where {T} = convert(Matrix{T}, Q*I) # generic fallback, yields square matrix
 Matrix{T}(adjQ::AdjointQ{S}) where {T,S} = convert(Matrix{T}, lmul!(adjQ, Matrix{S}(I, size(adjQ))))
 Matrix(Q::AbstractQ{T}) where {T} = Matrix{T}(Q)
 
@@ -189,10 +189,20 @@ end
 #Converting from Bidiagonal to dense Matrix
 function Matrix{T}(A::Bidiagonal) where T
     B = Matrix{T}(undef, size(A))
+    iszero(size(B,1)) && return B
     if haszero(T) # optimized path for types with zero(T) defined
         size(B,1) > 1 && fill!(B, zero(T))
-        copyto!(diagview(B), A.dv)
-        copyto!(diagview(B, _offdiagind(A.uplo)), A.ev)
+        isupper = A.uplo == 'U'
+        if isupper
+            B[1,1] = A.dv[1]
+        end
+        for col in axes(A.ev,1)
+            B[col+!isupper, col+isupper] = A.ev[col]
+            B[col+isupper, col+isupper] = A.dv[col+isupper]
+        end
+        if !isupper
+            B[end,end] = A.dv[end]
+        end
     else
         copyto!(B, A)
     end
@@ -286,6 +296,7 @@ axes(M::Bidiagonal) = (ax = axes(M.dv, 1); (ax, ax))
 for func in (:conj, :copy, :real, :imag)
     @eval ($func)(M::Bidiagonal) = Bidiagonal(($func)(M.dv), ($func)(M.ev), M.uplo)
 end
+isreal(M::Bidiagonal) = isreal(M.dv) && isreal(M.ev)
 
 adjoint(B::Bidiagonal{<:Number}) = Bidiagonal(vec(adjoint(B.dv)), vec(adjoint(B.ev)), B.uplo == 'U' ? :L : :U)
 adjoint(B::Bidiagonal{<:Number, <:Base.ReshapedArray{<:Number,1,<:Adjoint}}) =
@@ -454,8 +465,8 @@ function rmul!(B::Bidiagonal, x::Number)
         iszero(y) || throw(ArgumentError(LazyString(lazy"cannot set index ($row, $col) off ",
             lazy"the tridiagonal band to a nonzero value ($y)")))
     end
-    @. B.dv *= x
-    @. B.ev *= x
+    rmul!(B.dv, x)
+    rmul!(B.ev, x)
     return B
 end
 function lmul!(x::Number, B::Bidiagonal)
@@ -467,8 +478,8 @@ function lmul!(x::Number, B::Bidiagonal)
         iszero(y) || throw(ArgumentError(LazyString(lazy"cannot set index ($row, $col) off ",
             lazy"the tridiagonal band to a nonzero value ($y)")))
     end
-    @. B.dv = x * B.dv
-    @. B.ev = x * B.ev
+    lmul!(x, B.dv)
+    lmul!(x, B.ev)
     return B
 end
 /(A::Bidiagonal, B::Number) = Bidiagonal(A.dv/B, A.ev/B, A.uplo)
@@ -583,7 +594,7 @@ function _diag(A::Bidiagonal, k)
     elseif k == _offdiagind(A.uplo)
         return A.ev
     else
-        return diag(A, k)
+        return diagview(A, k)
     end
 end
 
@@ -952,11 +963,10 @@ function _mul!(C::AbstractVecOrMat, A::BiTriSym, B::AbstractVecOrMat, _add::MulA
     nB = size(B,2)
     (iszero(nA) || iszero(nB)) && return C
     iszero(_add.alpha) && return _rmul_or_fill!(C, _add.beta)
-    if nA <= 3
-        # naive multiplication
-        for I in CartesianIndices(C)
-            col = Base.tail(Tuple(I))
-            _modify!(_add, sum(A[I[1], k] * B[k, col...] for k in axes(A,2)), C, I)
+    if nA == 1
+        A11 = @inbounds A[1,1]
+        for i in axes(B, 2)
+            @inbounds _modify!(_add, A11 * B[1,i], C, (1,i))
         end
         return C
     end
@@ -1189,25 +1199,25 @@ function _dibimul!(C::Bidiagonal, A::Diagonal, B::Bidiagonal, _add)
     C
 end
 
-function *(A::UpperOrUnitUpperTriangular, B::Bidiagonal)
+function mul(A::UpperOrUnitUpperTriangular, B::Bidiagonal)
     TS = promote_op(matprod, eltype(A), eltype(B))
     C = mul!(similar(A, TS, size(A)), A, B)
     return B.uplo == 'U' ? UpperTriangular(C) : C
 end
 
-function *(A::LowerOrUnitLowerTriangular, B::Bidiagonal)
+function mul(A::LowerOrUnitLowerTriangular, B::Bidiagonal)
     TS = promote_op(matprod, eltype(A), eltype(B))
     C = mul!(similar(A, TS, size(A)), A, B)
     return B.uplo == 'L' ? LowerTriangular(C) : C
 end
 
-function *(A::Bidiagonal, B::UpperOrUnitUpperTriangular)
+function mul(A::Bidiagonal, B::UpperOrUnitUpperTriangular)
     TS = promote_op(matprod, eltype(A), eltype(B))
     C = mul!(similar(B, TS, size(B)), A, B)
     return A.uplo == 'U' ? UpperTriangular(C) : C
 end
 
-function *(A::Bidiagonal, B::LowerOrUnitLowerTriangular)
+function mul(A::Bidiagonal, B::LowerOrUnitLowerTriangular)
     TS = promote_op(matprod, eltype(A), eltype(B))
     C = mul!(similar(B, TS, size(B)), A, B)
     return A.uplo == 'L' ? LowerTriangular(C) : C
@@ -1249,19 +1259,19 @@ end
 ldiv!(A::Bidiagonal, b::AbstractVecOrMat) = @inline ldiv!(b, A, b)
 function ldiv!(c::AbstractVecOrMat, A::Bidiagonal, b::AbstractVecOrMat)
     require_one_based_indexing(c, A, b)
-    N = size(A, 2)
+    N = size(A, 1)
     mb, nb = size(b, 1), size(b, 2)
     if N != mb
-        throw(DimensionMismatch(lazy"second dimension of A, $N, does not match first dimension of b, $mb"))
+        dimstr = b isa AbstractVector ? "length" : "first dimension"
+        throw(DimensionMismatch(LazyString(lazy"the first dimension of the Bidiagonal matrix, $N, ",
+            lazy"does not match the $dimstr of the right-hand-side, $mb")))
     end
     mc, nc = size(c, 1), size(c, 2)
     if mc != mb || nc != nb
-        throw(DimensionMismatch(lazy"size of result, ($mc, $nc), does not match the size of b, ($mb, $nb)"))
+        throw(DimensionMismatch(lazy"size of result, $(size(c)), does not match the size of b, $(size(b))"))
     end
 
-    if N == 0
-        return copyto!(c, b)
-    end
+    N == 0 && return c # in this case c and b are also empty
 
     zi = findfirst(iszero, A.dv)
     isnothing(zi) || throw(SingularException(zi))
@@ -1333,27 +1343,27 @@ function _rdiv!(C::AbstractMatrix, A::AbstractMatrix, B::Bidiagonal)
     isnothing(zi) || throw(SingularException(zi))
 
     if B.uplo == 'L'
-        diagB = B.dv[n]
-        for i in 1:m
-            C[i,n] = A[i,n] / diagB
+        diagB = @inbounds B.dv[n]
+        for i in axes(A,1)
+            @inbounds C[i,n] = A[i,n] / diagB
         end
-        for j in n-1:-1:1
-            diagB = B.dv[j]
-            offdiagB = B.ev[j]
-            for i in 1:m
-                C[i,j] = (A[i,j] - C[i,j+1]*offdiagB)/diagB
+        for j in reverse(axes(A,2)[1:end-1]) # n-1:-1:1
+            diagB = @inbounds B.dv[j]
+            offdiagB = @inbounds B.ev[j]
+            for i in axes(A,1)
+                @inbounds C[i,j] = (A[i,j] - C[i,j+1]*offdiagB)/diagB
             end
         end
     else
-        diagB = B.dv[1]
-        for i in 1:m
-            C[i,1] = A[i,1] / diagB
+        diagB = @inbounds B.dv[1]
+        for i in axes(A,1)
+            @inbounds C[i,1] = A[i,1] / diagB
         end
-        for j in 2:n
-            diagB = B.dv[j]
-            offdiagB = B.ev[j-1]
-            for i = 1:m
-                C[i,j] = (A[i,j] - C[i,j-1]*offdiagB)/diagB
+        for j in axes(A,2)[2:end]
+            diagB = @inbounds B.dv[j]
+            offdiagB = @inbounds B.ev[j-1]
+            for i in axes(A,1)
+                @inbounds C[i,j] = (A[i,j] - C[i,j-1]*offdiagB)/diagB
             end
         end
     end
 
@@ -84,8 +84,7 @@ export
     trsm!,
     trsm
 
-using ..LinearAlgebra: libblastrampoline, BlasReal, BlasComplex, BlasFloat, BlasInt,
-    DimensionMismatch, checksquare, chkstride1, SingularException
+using ..LinearAlgebra: libblastrampoline, BlasReal, BlasComplex, BlasFloat, BlasInt, DimensionMismatch, checksquare, chkstride1
 
 include("lbt.jl")
 
@@ -162,7 +161,9 @@ get_num_threads()::Int = lbt_get_num_threads()
 function check()
     # TODO: once we have bitfields of the BLAS functions that are actually forwarded,
     # ensure that we have a complete set here (warning on an incomplete BLAS implementation)
-    config = get_config()
+    # We don't use `get_config()` here because we are invoked in the onload callback and
+    # we don't want to take any locks.
+    config = LBTConfig(unsafe_load(ccall((:lbt_get_config, libblastrampoline), Ptr{lbt_config_t}, ())))
 
     # Ensure that one of our loaded libraries satisfies our interface requirement
     interface = USE_BLAS64 ? :ilp64 : :lp64
@@ -1378,11 +1379,6 @@ for (fname, elty) in ((:dtrsv_,:Float64),
                 throw(DimensionMismatch(lazy"size of A is $n != length(x) = $(length(x))"))
             end
             chkstride1(A)
-            if diag == 'N'
-                for i in 1:n
-                    iszero(A[i,i]) && throw(SingularException(i))
-                end
-            end
             px, stx = vec_pointer_stride(x, ArgumentError("input vector with 0 stride is not allowed"))
             GC.@preserve x ccall((@blasfunc($fname), libblastrampoline), Cvoid,
                 (Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{BlasInt},
@@ -2231,11 +2227,6 @@ for (mmname, smname, elty) in
             end
             chkstride1(A)
             chkstride1(B)
-            if diag == 'N'
-                for i in 1:k
-                    iszero(A[i,i]) && throw(SingularException(i))
-                end
-            end
             ccall((@blasfunc($smname), libblastrampoline), Cvoid,
                    (Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{UInt8},
                     Ref{BlasInt}, Ref{BlasInt}, Ref{$elty}, Ptr{$elty},
 
@@ -182,10 +182,8 @@ julia> d, u, p = S; # destructuring via iteration
 julia> d == S.D && u == S.U && p == S.p
 true
 
-julia> S.U*S.D*S.U' - S.P*A*S.P'
-2×2 Matrix{Float64}:
- 0.0  0.0
- 0.0  0.0
+julia> S.U * S.D * S.U' ≈ S.P * A * S.P'
+true
 
 julia> S = bunchkaufman(Symmetric(A, :L))
 BunchKaufman{Float64, Matrix{Float64}, Vector{Int64}}
@@ -202,10 +200,8 @@ permutation:
  2
  1
 
-julia> S.L*S.D*S.L' - A[S.p, S.p]
-2×2 Matrix{Float64}:
- 0.0  0.0
- 0.0  0.0
+julia> S.L * S.D * S.L' ≈ A[S.p, S.p]
+true
 ```
 """
 bunchkaufman(A::AbstractMatrix{T}, rook::Bool=false; check::Bool = true) where {T} =