diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml new file mode 100644 index 00000000..970e7600 --- /dev/null +++ b/.buildkite/pipeline.yml @@ -0,0 +1,34 @@ +env: + SECRET_CODECOV_TOKEN: "MH6hHjQi7vG2V1Yfotv5/z5Dkx1k5SdyGYlGTFXiQr22XksJgsXaBuvFKUrjC7JwcpBsOVU8103LuMKl3m7VJ35WzHZrOssYycVbdGcb2kloc6xvUOsN2R5BrhCQ4Pii0l6ZeVRjCnZVkcmb0Rf4glGFyfibCrqniry8RLhblsuFKFsijRK4OxiWYEs1IvUulN+ER8tEsEtw4+ZqC5nbLGMSnUG/saPkDQOVIBscvikbKEnBcCXBheGPktF+Y/cy/1Xa+FiBPoZcypwTeAjKG1g0MqyHXjaYekb/7fekaj+hukGaeJSCXxY8KEb2IZCh+Y36Tp6y6qsIp/AdtEnCpQ==;U2FsdGVkX18WQxvGLspPwzC4aDe+U7TXU+itebTbgh8LUkE6GukxxReHYiDZ6IrBiVvSGTVJMquW0c8KsOI1pw==" + +steps: + - label: "Julia v1" + plugins: + - JuliaCI/julia#v1: + version: "1" + - JuliaCI/julia-test#v1: ~ + - JuliaCI/julia-coverage#v1: + dirs: + - src + - ext + agents: + queue: "juliagpu" + cuda: "*" + if: build.message !~ /\[skip tests\]/ + timeout_in_minutes: 30 + +steps: + - label: "Julia LTS" + plugins: + - JuliaCI/julia#v1: + version: "1.10" # "lts" isn't valid + - JuliaCI/julia-test#v1: ~ + - JuliaCI/julia-coverage#v1: + dirs: + - src + - ext + agents: + queue: "juliagpu" + cuda: "*" + if: build.message !~ /\[skip tests\]/ + timeout_in_minutes: 30 diff --git a/Project.toml b/Project.toml index ec99e515..efc996b4 100644 --- a/Project.toml +++ b/Project.toml @@ -8,14 +8,17 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" [weakdeps] ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" [extensions] MatrixAlgebraKitChainRulesCoreExt = "ChainRulesCore" +MatrixAlgebraKitCUDAExt = "CUDA" [compat] Aqua = "0.6, 0.7, 0.8" ChainRulesCore = "1" ChainRulesTestUtils = "1" +CUDA = "5" JET = "0.9" LinearAlgebra = "1" SafeTestsets = "0.1" @@ -36,5 +39,4 @@ TestExtras = "5ed8adda-3752-4e41-b88a-e8b09835ee3a" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [targets] -test = ["Aqua", "JET", "SafeTestsets", "Test", "TestExtras", "ChainRulesCore", - "ChainRulesTestUtils", "StableRNGs", "Zygote"] +test = ["Aqua", "JET", "SafeTestsets", "Test", "TestExtras","ChainRulesCore", "ChainRulesTestUtils", "StableRNGs", "Zygote", "CUDA"] diff --git a/ext/MatrixAlgebraKitCUDAExt/MatrixAlgebraKitCUDAExt.jl b/ext/MatrixAlgebraKitCUDAExt/MatrixAlgebraKitCUDAExt.jl new file mode 100644 index 00000000..01a721f7 --- /dev/null +++ b/ext/MatrixAlgebraKitCUDAExt/MatrixAlgebraKitCUDAExt.jl @@ -0,0 +1,28 @@ +module MatrixAlgebraKitCUDAExt + +using MatrixAlgebraKit +using MatrixAlgebraKit: @algdef, Algorithm, check_input +using MatrixAlgebraKit: one!, zero!, uppertriangular!, lowertriangular! +using MatrixAlgebraKit: diagview, sign_safe +using MatrixAlgebraKit: LQViaTransposedQR +using MatrixAlgebraKit: default_qr_algorithm, default_lq_algorithm, default_svd_algorithm +using CUDA +using LinearAlgebra +using LinearAlgebra: BlasFloat + +include("yacusolver.jl") +include("implementations/qr.jl") +include("implementations/svd.jl") + +function MatrixAlgebraKit.default_qr_algorithm(::Type{T}; kwargs...) where {T<:StridedCuMatrix} + return CUSOLVER_HouseholderQR(; kwargs...) +end +function MatrixAlgebraKit.default_lq_algorithm(::Type{T}; kwargs...) where {T<:StridedCuMatrix} + qr_alg = CUSOLVER_HouseholderQR(; kwargs...) + return LQViaTransposedQR(qr_alg) +end +function MatrixAlgebraKit.default_svd_algorithm(::Type{T}; kwargs...) where {T<:StridedCuMatrix} + return CUSOLVER_QRIteration(; kwargs...) +end + +end diff --git a/ext/MatrixAlgebraKitCUDAExt/implementations/qr.jl b/ext/MatrixAlgebraKitCUDAExt/implementations/qr.jl new file mode 100644 index 00000000..636cf82e --- /dev/null +++ b/ext/MatrixAlgebraKitCUDAExt/implementations/qr.jl @@ -0,0 +1,69 @@ +# CUSOLVER QR implementation +function MatrixAlgebraKit.qr_full!(A::AbstractMatrix, QR, alg::CUSOLVER_HouseholderQR) + check_input(qr_full!, A, QR) + Q, R = QR + _cusolver_qr!(A, Q, R; alg.kwargs...) + return Q, R +end +function MatrixAlgebraKit.qr_compact!(A::AbstractMatrix, QR, alg::CUSOLVER_HouseholderQR) + check_input(qr_compact!, A, QR) + Q, R = QR + _cusolver_qr!(A, Q, R; alg.kwargs...) + return Q, R +end +function MatrixAlgebraKit.qr_null!(A::AbstractMatrix, N, alg::CUSOLVER_HouseholderQR) + check_input(qr_null!, A, N) + _cusolver_qr_null!(A, N; alg.kwargs...) + return N +end + +function _cusolver_qr!(A::AbstractMatrix, Q::AbstractMatrix, R::AbstractMatrix; + positive=false, blocksize=1) + blocksize > 1 && + throw(ArgumentError("CUSOLVER does not provide a blocked implementation for a QR decomposition")) + m, n = size(A) + minmn = min(m, n) + computeR = length(R) > 0 + inplaceQ = Q === A + if inplaceQ && (computeR || positive || m < n) + throw(ArgumentError("inplace Q only supported if matrix is tall (`m >= n`), R is not required and using `positive=false`")) + end + + A, τ = YACUSOLVER.geqrf!(A) + if inplaceQ + Q = YACUSOLVER.ungqr!(A, τ) + else + Q = YACUSOLVER.unmqr!('L', 'N', A, τ, one!(Q)) + end + # henceforth, τ is no longer needed and can be reused + + if positive # already fix Q even if we do not need R + # TODO: report that `lmul!` and `rmul!` with `Diagonal` don't work with CUDA + τ .= sign_safe.(diagview(A)) + Qf = view(Q, 1:m, 1:minmn) # first minmn columns of Q + Qf .= Qf .* transpose(τ) + end + + if computeR + R̃ = uppertriangular!(view(A, axes(R)...)) + if positive + R̃f = view(R̃, 1:minmn, 1:n) # first minmn rows of R + R̃f .= conj.(τ) .* R̃f + end + copyto!(R, R̃) + end + return Q, R +end + +function _cusolver_qr_null!(A::AbstractMatrix, N::AbstractMatrix; + positive=false, blocksize=1) + blocksize > 1 && + throw(ArgumentError("CUSOLVER does not provide a blocked implementation for a QR decomposition")) + m, n = size(A) + minmn = min(m, n) + fill!(N, zero(eltype(N))) + one!(view(N, (minmn + 1):m, 1:(m - minmn))) + A, τ = YACUSOLVER.geqrf!(A) + N = YACUSOLVER.unmqr!('L', 'N', A, τ, N) + return N +end diff --git a/ext/MatrixAlgebraKitCUDAExt/implementations/svd.jl b/ext/MatrixAlgebraKitCUDAExt/implementations/svd.jl new file mode 100644 index 00000000..f9976031 --- /dev/null +++ b/ext/MatrixAlgebraKitCUDAExt/implementations/svd.jl @@ -0,0 +1,108 @@ +const CUSOLVER_SVDAlgorithm = Union{CUSOLVER_QRIteration, + CUSOLVER_SVDPolar, + CUSOLVER_Jacobi} + +# CUSOLVER SVD implementation +function MatrixAlgebraKit.svd_full!(A::CuMatrix, USVᴴ, alg::CUSOLVER_SVDAlgorithm) + check_input(svd_full!, A, USVᴴ) + U, S, Vᴴ = USVᴴ + fill!(S, zero(eltype(S))) + m, n = size(A) + minmn = min(m, n) + if alg isa CUSOLVER_QRIteration + isempty(alg.kwargs) || + throw(ArgumentError("LAPACK_QRIteration does not accept any keyword arguments")) + YACUSOLVER.gesvd!(A, view(S, 1:minmn, 1), U, Vᴴ) + elseif alg isa CUSOLVER_SVDPolar + YACUSOLVER.Xgesvdp!(A, view(S, 1:minmn, 1), U, Vᴴ; alg.kwargs...) + elseif alg isa CUSOLVER_Jacobi + YACUSOLVER.gesvdj!(A, view(S, 1:minmn, 1), U, Vᴴ; alg.kwargs...) + # elseif alg isa LAPACK_Bisection + # throw(ArgumentError("LAPACK_Bisection is not supported for full SVD")) + # elseif alg isa LAPACK_Jacobi + # throw(ArgumentError("LAPACK_Bisection is not supported for full SVD")) + else + throw(ArgumentError("Unsupported SVD algorithm")) + end + diagview(S) .= view(S, 1:minmn, 1) + view(S, 2:minmn, 1) .= zero(eltype(S)) + # TODO: make this controllable using a `gaugefix` keyword argument + for j in 1:max(m, n) + if j <= minmn + u = view(U, :, j) + v = view(Vᴴ, j, :) + s = conj(sign(_argmaxabs(u))) + u .*= s + v .*= conj(s) + elseif j <= m + u = view(U, :, j) + s = conj(sign(_argmaxabs(u))) + u .*= s + else + v = view(Vᴴ, j, :) + s = conj(sign(_argmaxabs(v))) + v .*= s + end + end + return USVᴴ +end + +function MatrixAlgebraKit.svd_compact!(A::CuMatrix, USVᴴ, alg::CUSOLVER_SVDAlgorithm) + check_input(svd_compact!, A, USVᴴ) + U, S, Vᴴ = USVᴴ + if alg isa CUSOLVER_QRIteration + isempty(alg.kwargs) || + throw(ArgumentError("CUSOLVER_QRIteration does not accept any keyword arguments")) + YACUSOLVER.gesvd!(A, S.diag, U, Vᴴ) + elseif alg isa CUSOLVER_SVDPolar + YACUSOLVER.Xgesvdp!(A, S.diag, U, Vᴴ; alg.kwargs...) + elseif alg isa CUSOLVER_Jacobi + YACUSOLVER.gesvdj!(A, S.diag, U, Vᴴ; alg.kwargs...) + # elseif alg isa LAPACK_DivideAndConquer + # isempty(alg.kwargs) || + # throw(ArgumentError("LAPACK_DivideAndConquer does not accept any keyword arguments")) + # YALAPACK.gesdd!(A, S.diag, U, Vᴴ) + # elseif alg isa LAPACK_Bisection + # YALAPACK.gesvdx!(A, S.diag, U, Vᴴ; alg.kwargs...) + else + throw(ArgumentError("Unsupported SVD algorithm")) + end + # TODO: make this controllable using a `gaugefix` keyword argument + for j in 1:size(U, 2) + u = view(U, :, j) + v = view(Vᴴ, j, :) + s = conj(sign(_argmaxabs(u))) + u .*= s + v .*= conj(s) + end + return USVᴴ +end +_argmaxabs(x) = reduce(_largest, x; init=zero(eltype(x))) +_largest(x, y) = abs(x) < abs(y) ? y : x + +function MatrixAlgebraKit.svd_vals!(A::CuMatrix, S, alg::CUSOLVER_SVDAlgorithm) + check_input(svd_vals!, A, S) + U, Vᴴ = similar(A, (0, 0)), similar(A, (0, 0)) + if alg isa CUSOLVER_QRIteration + isempty(alg.kwargs) || + throw(ArgumentError("CUSOLVER_QRIteration does not accept any keyword arguments")) + YACUSOLVER.gesvd!(A, S, U, Vᴴ) + elseif alg isa CUSOLVER_SVDPolar + YACUSOLVER.Xgesvdp!(A, S, U, Vᴴ; alg.kwargs...) + elseif alg isa CUSOLVER_Jacobi + YACUSOLVER.gesvdj!(A, S, U, Vᴴ; alg.kwargs...) + # elseif alg isa LAPACK_DivideAndConquer + # isempty(alg.kwargs) || + # throw(ArgumentError("LAPACK_DivideAndConquer does not accept any keyword arguments")) + # YALAPACK.gesdd!(A, S, U, Vᴴ) + # elseif alg isa LAPACK_Bisection + # YALAPACK.gesvdx!(A, S, U, Vᴴ; alg.kwargs...) + # elseif alg isa LAPACK_Jacobi + # isempty(alg.kwargs) || + # throw(ArgumentError("LAPACK_Jacobi does not accept any keyword arguments")) + # YALAPACK.gesvj!(A, S, U, Vᴴ) + else + throw(ArgumentError("Unsupported SVD algorithm")) + end + return S +end \ No newline at end of file diff --git a/ext/MatrixAlgebraKitCUDAExt/yacusolver.jl b/ext/MatrixAlgebraKitCUDAExt/yacusolver.jl new file mode 100644 index 00000000..b020d756 --- /dev/null +++ b/ext/MatrixAlgebraKitCUDAExt/yacusolver.jl @@ -0,0 +1,594 @@ +module YACUSOLVER + +using LinearAlgebra +using LinearAlgebra: BlasInt, BlasFloat, checksquare, chkstride1, require_one_based_indexing +using LinearAlgebra.LAPACK: chkargsok, chklapackerror, chktrans, chkside, chkdiag, chkuplo + +using CUDA +using CUDA: @allowscalar +using CUDA.CUSOLVER + +# QR methods are implemented with full access to allocated arrays, so we do not need to redo this: +using CUDA.CUSOLVER: geqrf!, ormqr!, orgqr! +const unmqr! = ormqr! +const ungqr! = orgqr! + +# Wrapper for SVD via QR Iteration +for (bname, fname, elty, relty) in + ((:cusolverDnSgesvd_bufferSize, :cusolverDnSgesvd, :Float32, :Float32), + (:cusolverDnDgesvd_bufferSize, :cusolverDnDgesvd, :Float64, :Float64), + (:cusolverDnCgesvd_bufferSize, :cusolverDnCgesvd, :ComplexF32, :Float32), + (:cusolverDnZgesvd_bufferSize, :cusolverDnZgesvd, :ComplexF64, :Float64)) + @eval begin + #! format: off + function gesvd!(A::StridedCuMatrix{$elty}, + S::StridedCuVector{$relty}=similar(A, $relty, min(size(A)...)), + U::StridedCuMatrix{$elty}=similar(A, $elty, size(A, 1), min(size(A)...)), + Vᴴ::StridedCuMatrix{$elty}=similar(A, $elty, min(size(A)...), size(A, 2))) + #! format: on + chkstride1(A, U, Vᴴ, S) + m, n = size(A) + (m < n) && throw(ArgumentError("CUSOLVER's gesvd requires m ≥ n")) + minmn = min(m, n) + if length(U) == 0 + jobu = 'N' + else + size(U, 1) == m || + throw(DimensionMismatch("row size mismatch between A and U")) + if size(U, 2) == minmn + if U === A + jobu = 'O' + else + jobu = 'S' + end + elseif size(U, 2) == m + jobu = 'A' + else + throw(DimensionMismatch("invalid column size of U")) + end + end + if length(Vᴴ) == 0 + jobvt = 'N' + else + size(Vᴴ, 2) == n || + throw(DimensionMismatch("column size mismatch between A and Vᴴ")) + if size(Vᴴ, 1) == minmn + if Vᴴ === A + jobvt = 'O' + else + jobvt = 'S' + end + elseif size(Vᴴ, 1) == n + jobvt = 'A' + else + throw(DimensionMismatch("invalid row size of Vᴴ")) + end + end + length(S) == minmn || + throw(DimensionMismatch("length mismatch between A and S")) + + lda = max(1, stride(A, 2)) + ldu = max(1, stride(U, 2)) + ldv = max(1, stride(Vᴴ, 2)) + + dh = CUSOLVER.dense_handle() + function bufferSize() + out = Ref{Cint}(0) + CUSOLVER.$bname(dh, m, n, out) + return out[] * sizeof($elty) + end + rwork = CuArray{$relty}(undef, min(m, n) - 1) + CUDA.with_workspace(dh.workspace_gpu, bufferSize) do buffer + return CUSOLVER.$fname(dh, jobu, jobvt, m, n, + A, lda, S, U, ldu, Vᴴ, ldv, + buffer, sizeof(buffer) ÷ sizeof($elty), rwork, + dh.info) + end + CUDA.unsafe_free!(rwork) + + info = @allowscalar dh.info[1] + CUSOLVER.chkargsok(BlasInt(info)) + + return (S, U, Vᴴ) + end + end +end + +function Xgesvdp!(A::StridedCuMatrix{T}, + S::StridedCuVector=similar(A, real(T), min(size(A)...)), + U::StridedCuMatrix{T}=similar(A, T, size(A, 1), min(size(A)...)), + Vᴴ::StridedCuMatrix{T}=similar(A, T, min(size(A)...), size(A, 2)); + tol=norm(A) * eps(real(T))) where {T<:BlasFloat} + chkstride1(A, U, S, Vᴴ) + m, n = size(A) + minmn = min(m, n) + if length(U) == length(Vᴴ) == 0 + jobz = 'N' + econ = 1 + else + jobz = 'V' + size(U, 1) == m || + throw(DimensionMismatch("row size mismatch between A and U")) + size(Vᴴ, 2) == n || + throw(DimensionMismatch("column size mismatch between A and Vᴴ")) + if size(U, 2) == size(Vᴴ, 1) == minmn + econ = 1 + elseif size(U, 2) == m && size(Vᴴ, 1) == n + econ = 0 + else + throw(DimensionMismatch("invalid column size of U or row size of Vᴴ")) + end + end + R = eltype(S) + length(S) == minmn || + throw(DimensionMismatch("length mismatch between A and S")) + R == real(T) || + throw(ArgumentError("S does not have the matching real `eltype` of A")) + + Ṽ = similar(Vᴴ, (n, n)) + Ũ = (size(U) == (m, m)) ? U : similar(U, (m, m)) + lda = max(1, stride(A, 2)) + ldu = max(1, stride(Ũ, 2)) + ldv = max(1, stride(Ṽ, 2)) + h_err_sigma = Ref{Cdouble}(0) + params = CUSOLVER.CuSolverParameters() + dh = CUSOLVER.dense_handle() + + function bufferSize() + out_cpu = Ref{Csize_t}(0) + out_gpu = Ref{Csize_t}(0) + CUSOLVER.cusolverDnXgesvdp_bufferSize(dh, params, jobz, econ, m, n, + T, A, lda, R, S, T, Ũ, ldu, T, Ṽ, ldv, + T, out_gpu, out_cpu) + + return out_gpu[], out_cpu[] + end + CUSOLVER.with_workspaces(dh.workspace_gpu, dh.workspace_cpu, + bufferSize()...) do buffer_gpu, buffer_cpu + return CUSOLVER.cusolverDnXgesvdp(dh, params, jobz, econ, m, n, + T, A, lda, R, S, T, Ũ, ldu, T, Ṽ, ldv, + T, buffer_gpu, sizeof(buffer_gpu), + buffer_cpu, sizeof(buffer_cpu), + dh.info, h_err_sigma) + end + err = h_err_sigma[] + if err > tol + warn("Xgesvdp! did not attained requested tolerance: error = $err > tolerance = $tol") + end + + flag = @allowscalar dh.info[1] + CUSOLVER.chklapackerror(BlasInt(flag)) + if Ũ !== U && length(U) > 0 + U .= view(Ũ, 1:m, 1:size(U, 2)) + end + if length(Vᴴ) > 0 + Vᴴ .= view(Ṽ', 1:size(Vᴴ, 1), 1:n) + end + Ũ !== U && CUDA.unsafe_free!(Ũ) + CUDA.unsafe_free!(Ṽ) + + return S, U, Vᴴ +end + +# Wrapper for SVD via Jacobi +for (bname, fname, elty, relty) in + ((:cusolverDnSgesvdj_bufferSize, :cusolverDnSgesvdj, :Float32, :Float32), + (:cusolverDnDgesvdj_bufferSize, :cusolverDnDgesvdj, :Float64, :Float64), + (:cusolverDnCgesvdj_bufferSize, :cusolverDnCgesvdj, :ComplexF32, :Float32), + (:cusolverDnZgesvdj_bufferSize, :cusolverDnZgesvdj, :ComplexF64, :Float64)) + @eval begin + #! format: off + function gesvdj!(A::StridedCuMatrix{$elty}, + S::StridedCuVector{$relty}=similar(A, $relty, min(size(A)...)), + U::StridedCuMatrix{$elty}=similar(A, $elty, size(A, 1), min(size(A)...)), + Vᴴ::StridedCuMatrix{$elty}=similar(A, $elty, min(size(A)...), size(A, 2)); + tol::$relty=eps($relty), + max_sweeps::Int=100) + #! format: on + chkstride1(A, U, Vᴴ, S) + m, n = size(A) + minmn = min(m, n) + + if length(U) == 0 && length(Vᴴ) == 0 + jobz = 'N' + econ = 0 + else + jobz = 'V' + size(U, 1) == m || + throw(DimensionMismatch("row size mismatch between A and U")) + size(Vᴴ, 2) == n || + throw(DimensionMismatch("column size mismatch between A and Vᴴ")) + if size(U, 2) == size(Vᴴ, 1) == minmn + econ = 1 + elseif size(U, 2) == m && size(Vᴴ, 1) == n + econ = 0 + else + throw(DimensionMismatch("invalid column size of U or row size of Vᴴ")) + end + end + length(S) == minmn || + throw(DimensionMismatch("length mismatch between A and S")) + + Ṽ = (jobz == 'V') ? similar(Vᴴ') : similar(Vᴴ, (n, minmn)) + Ũ = (jobz == 'V') ? U : similar(U, (m, minmn)) + lda = max(1, stride(A, 2)) + ldu = max(1, stride(Ũ, 2)) + ldv = max(1, stride(Ṽ, 2)) + + params = Ref{CUSOLVER.gesvdjInfo_t}(C_NULL) + CUSOLVER.cusolverDnCreateGesvdjInfo(params) + CUSOLVER.cusolverDnXgesvdjSetTolerance(params[], tol) + CUSOLVER.cusolverDnXgesvdjSetMaxSweeps(params[], max_sweeps) + dh = CUSOLVER.dense_handle() + + function bufferSize() + out = Ref{Cint}(0) + CUSOLVER.$bname(dh, jobz, econ, m, n, A, lda, S, Ũ, ldu, Ṽ, ldv, + out, params[]) + return out[] * sizeof($elty) + end + + CUSOLVER.with_workspace(dh.workspace_gpu, bufferSize) do buffer + return CUSOLVER.$fname(dh, jobz, econ, m, n, A, lda, S, Ũ, ldu, Ṽ, ldv, + buffer, sizeof(buffer) ÷ sizeof($elty), dh.info, + params[]) + end + + info = @allowscalar dh.info[1] + CUSOLVER.chkargsok(BlasInt(info)) + + CUSOLVER.cusolverDnDestroyGesvdjInfo(params[]) + + if jobz == 'V' + adjoint!(Vᴴ, Ṽ) + end + return U, S, Vᴴ + end + end +end + +# for (jname, bname, fname, elty, relty) in +# ((:sygvd!, :cusolverDnSsygvd_bufferSize, :cusolverDnSsygvd, :Float32, :Float32), +# (:sygvd!, :cusolverDnDsygvd_bufferSize, :cusolverDnDsygvd, :Float64, :Float64), +# (:hegvd!, :cusolverDnChegvd_bufferSize, :cusolverDnChegvd, :ComplexF32, :Float32), +# (:hegvd!, :cusolverDnZhegvd_bufferSize, :cusolverDnZhegvd, :ComplexF64, :Float64)) +# @eval begin +# function $jname(itype::Int, +# jobz::Char, +# uplo::Char, +# A::StridedCuMatrix{$elty}, +# B::StridedCuMatrix{$elty}) +# chkuplo(uplo) +# nA, nB = checksquare(A, B) +# if nB != nA +# throw(DimensionMismatch("Dimensions of A ($nA, $nA) and B ($nB, $nB) must match!")) +# end +# n = nA +# lda = max(1, stride(A, 2)) +# ldb = max(1, stride(B, 2)) +# W = CuArray{$relty}(undef, n) +# dh = dense_handle() + +# function bufferSize() +# out = Ref{Cint}(0) +# $bname(dh, itype, jobz, uplo, n, A, lda, B, ldb, W, out) +# return out[] * sizeof($elty) +# end + +# with_workspace(dh.workspace_gpu, bufferSize) do buffer +# return $fname(dh, itype, jobz, uplo, n, A, lda, B, ldb, W, +# buffer, sizeof(buffer) ÷ sizeof($elty), dh.info) +# end + +# info = @allowscalar dh.info[1] +# chkargsok(BlasInt(info)) + +# if jobz == 'N' +# return W +# elseif jobz == 'V' +# return W, A, B +# end +# end +# end +# end + +# for (jname, bname, fname, elty, relty) in +# ((:sygvj!, :cusolverDnSsygvj_bufferSize, :cusolverDnSsygvj, :Float32, :Float32), +# (:sygvj!, :cusolverDnDsygvj_bufferSize, :cusolverDnDsygvj, :Float64, :Float64), +# (:hegvj!, :cusolverDnChegvj_bufferSize, :cusolverDnChegvj, :ComplexF32, :Float32), +# (:hegvj!, :cusolverDnZhegvj_bufferSize, :cusolverDnZhegvj, :ComplexF64, :Float64)) +# @eval begin +# function $jname(itype::Int, +# jobz::Char, +# uplo::Char, +# A::StridedCuMatrix{$elty}, +# B::StridedCuMatrix{$elty}; +# tol::$relty=eps($relty), +# max_sweeps::Int=100) +# chkuplo(uplo) +# nA, nB = checksquare(A, B) +# if nB != nA +# throw(DimensionMismatch("Dimensions of A ($nA, $nA) and B ($nB, $nB) must match!")) +# end +# n = nA +# lda = max(1, stride(A, 2)) +# ldb = max(1, stride(B, 2)) +# W = CuArray{$relty}(undef, n) +# params = Ref{syevjInfo_t}(C_NULL) +# cusolverDnCreateSyevjInfo(params) +# cusolverDnXsyevjSetTolerance(params[], tol) +# cusolverDnXsyevjSetMaxSweeps(params[], max_sweeps) +# dh = dense_handle() + +# function bufferSize() +# out = Ref{Cint}(0) +# $bname(dh, itype, jobz, uplo, n, A, lda, B, ldb, W, +# out, params[]) +# return out[] * sizeof($elty) +# end + +# with_workspace(dh.workspace_gpu, bufferSize) do buffer +# return $fname(dh, itype, jobz, uplo, n, A, lda, B, ldb, W, +# buffer, sizeof(buffer) ÷ sizeof($elty), dh.info, params[]) +# end + +# info = @allowscalar dh.info[1] +# chkargsok(BlasInt(info)) + +# cusolverDnDestroySyevjInfo(params[]) + +# if jobz == 'N' +# return W +# elseif jobz == 'V' +# return W, A, B +# end +# end +# end +# end + +# for (jname, bname, fname, elty, relty) in +# ((:syevjBatched!, :cusolverDnSsyevjBatched_bufferSize, :cusolverDnSsyevjBatched, +# :Float32, :Float32), +# (:syevjBatched!, :cusolverDnDsyevjBatched_bufferSize, :cusolverDnDsyevjBatched, +# :Float64, :Float64), +# (:heevjBatched!, :cusolverDnCheevjBatched_bufferSize, :cusolverDnCheevjBatched, +# :ComplexF32, :Float32), +# (:heevjBatched!, :cusolverDnZheevjBatched_bufferSize, :cusolverDnZheevjBatched, +# :ComplexF64, :Float64)) +# @eval begin +# function $jname(jobz::Char, +# uplo::Char, +# A::StridedCuArray{$elty}; +# tol::$relty=eps($relty), +# max_sweeps::Int=100) + +# # Set up information for the solver arguments +# chkuplo(uplo) +# n = checksquare(A) +# lda = max(1, stride(A, 2)) +# batchSize = size(A, 3) +# W = CuArray{$relty}(undef, n, batchSize) +# params = Ref{syevjInfo_t}(C_NULL) + +# dh = dense_handle() +# resize!(dh.info, batchSize) + +# # Initialize the solver parameters +# cusolverDnCreateSyevjInfo(params) +# cusolverDnXsyevjSetTolerance(params[], tol) +# cusolverDnXsyevjSetMaxSweeps(params[], max_sweeps) + +# # Calculate the workspace size +# function bufferSize() +# out = Ref{Cint}(0) +# $bname(dh, jobz, uplo, n, A, lda, W, out, params[], batchSize) +# return out[] * sizeof($elty) +# end + +# # Run the solver +# with_workspace(dh.workspace_gpu, bufferSize) do buffer +# return $fname(dh, jobz, uplo, n, A, lda, W, buffer, +# sizeof(buffer) ÷ sizeof($elty), dh.info, params[], batchSize) +# end + +# # Copy the solver info and delete the device memory +# info = @allowscalar collect(dh.info) + +# # Double check the solver's exit status +# for i in 1:batchSize +# chkargsok(BlasInt(info[i])) +# end + +# cusolverDnDestroySyevjInfo(params[]) + +# # Return eigenvalues (in W) and possibly eigenvectors (in A) +# if jobz == 'N' +# return W +# elseif jobz == 'V' +# return W, A +# end +# end +# end +# end + +# for (fname, elty) in ((:cusolverDnSpotrsBatched, :Float32), +# (:cusolverDnDpotrsBatched, :Float64), +# (:cusolverDnCpotrsBatched, :ComplexF32), +# (:cusolverDnZpotrsBatched, :ComplexF64)) +# @eval begin +# function potrsBatched!(uplo::Char, +# A::Vector{<:StridedCuMatrix{$elty}}, +# B::Vector{<:StridedCuVecOrMat{$elty}}) +# if length(A) != length(B) +# throw(DimensionMismatch("")) +# end +# # Set up information for the solver arguments +# chkuplo(uplo) +# n = checksquare(A[1]) +# if size(B[1], 1) != n +# throw(DimensionMismatch("first dimension of B[i], $(size(B[1],1)), must match second dimension of A, $n")) +# end +# nrhs = size(B[1], 2) +# # cuSOLVER's Remark 1: only nrhs=1 is supported. +# if nrhs != 1 +# throw(ArgumentError("cuSOLVER only supports vectors for B")) +# end +# lda = max(1, stride(A[1], 2)) +# ldb = max(1, stride(B[1], 2)) +# batchSize = length(A) + +# Aptrs = unsafe_batch(A) +# Bptrs = unsafe_batch(B) + +# dh = dense_handle() + +# # Run the solver +# $fname(dh, uplo, n, nrhs, Aptrs, lda, Bptrs, ldb, dh.info, batchSize) + +# # Copy the solver info and delete the device memory +# info = @allowscalar dh.info[1] +# chklapackerror(BlasInt(info)) + +# return B +# end +# end +# end + +# for (fname, elty) in ((:cusolverDnSpotrfBatched, :Float32), +# (:cusolverDnDpotrfBatched, :Float64), +# (:cusolverDnCpotrfBatched, :ComplexF32), +# (:cusolverDnZpotrfBatched, :ComplexF64)) +# @eval begin +# function potrfBatched!(uplo::Char, A::Vector{<:StridedCuMatrix{$elty}}) + +# # Set up information for the solver arguments +# chkuplo(uplo) +# n = checksquare(A[1]) +# lda = max(1, stride(A[1], 2)) +# batchSize = length(A) + +# Aptrs = unsafe_batch(A) + +# dh = dense_handle() +# resize!(dh.info, batchSize) + +# # Run the solver +# $fname(dh, uplo, n, Aptrs, lda, dh.info, batchSize) + +# # Copy the solver info and delete the device memory +# info = @allowscalar collect(dh.info) + +# # Double check the solver's exit status +# for i in 1:batchSize +# chkargsok(BlasInt(info[i])) +# end + +# # info[i] > 0 means the leading minor of order info[i] is not positive definite +# # LinearAlgebra.LAPACK does not throw Exception here +# # to simplify calls to isposdef! and factorize +# return A, info +# end +# end +# end + +# # gesv +# function gesv!(X::CuVecOrMat{T}, A::CuMatrix{T}, B::CuVecOrMat{T}; fallback::Bool=true, +# residual_history::Bool=false, irs_precision::String="AUTO", +# refinement_solver::String="CLASSICAL", +# maxiters::Int=0, maxiters_inner::Int=0, tol::Float64=0.0, +# tol_inner=Float64 = 0.0) where {T<:BlasFloat} +# params = CuSolverIRSParameters() +# info = CuSolverIRSInformation() +# n = checksquare(A) +# nrhs = size(B, 2) +# lda = max(1, stride(A, 2)) +# ldb = max(1, stride(B, 2)) +# ldx = max(1, stride(X, 2)) +# niters = Ref{Cint}() +# dh = dense_handle() + +# if irs_precision == "AUTO" +# (T == Float32) && (irs_precision = "R_32F") +# (T == Float64) && (irs_precision = "R_64F") +# (T == ComplexF32) && (irs_precision = "C_32F") +# (T == ComplexF64) && (irs_precision = "C_64F") +# else +# (T == Float32) && (irs_precision ∈ ("R_32F", "R_16F", "R_16BF", "R_TF32") || +# error("$irs_precision is not supported.")) +# (T == Float64) && +# (irs_precision ∈ ("R_64F", "R_32F", "R_16F", "R_16BF", "R_TF32") || +# error("$irs_precision is not supported.")) +# (T == ComplexF32) && (irs_precision ∈ ("C_32F", "C_16F", "C_16BF", "C_TF32") || +# error("$irs_precision is not supported.")) +# (T == ComplexF64) && +# (irs_precision ∈ ("C_64F", "C_32F", "C_16F", "C_16BF", "C_TF32") || +# error("$irs_precision is not supported.")) +# end +# cusolverDnIRSParamsSetSolverMainPrecision(params, T) +# cusolverDnIRSParamsSetSolverLowestPrecision(params, irs_precision) +# cusolverDnIRSParamsSetRefinementSolver(params, refinement_solver) +# (tol != 0.0) && cusolverDnIRSParamsSetTol(params, tol) +# (tol_inner != 0.0) && cusolverDnIRSParamsSetTolInner(params, tol_inner) +# (maxiters != 0) && cusolverDnIRSParamsSetMaxIters(params, maxiters) +# (maxiters_inner != 0) && cusolverDnIRSParamsSetMaxItersInner(params, maxiters_inner) +# fallback ? cusolverDnIRSParamsEnableFallback(params) : +# cusolverDnIRSParamsDisableFallback(params) +# residual_history && cusolverDnIRSInfosRequestResidual(info) + +# function bufferSize() +# buffer_size = Ref{Csize_t}(0) +# cusolverDnIRSXgesv_bufferSize(dh, params, n, nrhs, buffer_size) +# return buffer_size[] +# end + +# with_workspace(dh.workspace_gpu, bufferSize) do buffer +# return cusolverDnIRSXgesv(dh, params, info, n, nrhs, A, lda, B, ldb, +# X, ldx, buffer, sizeof(buffer), niters, dh.info) +# end + +# # Copy the solver flag and delete the device memory +# flag = @allowscalar dh.info[1] +# chklapackerror(BlasInt(flag)) + +# return X, info +# end + +# for (jname, bname, fname, elty, relty) in +# ((:syevd!, :cusolverDnSsyevd_bufferSize, :cusolverDnSsyevd, :Float32, :Float32), +# (:syevd!, :cusolverDnDsyevd_bufferSize, :cusolverDnDsyevd, :Float64, :Float64), +# (:heevd!, :cusolverDnCheevd_bufferSize, :cusolverDnCheevd, :ComplexF32, :Float32), +# (:heevd!, :cusolverDnZheevd_bufferSize, :cusolverDnZheevd, :ComplexF64, :Float64)) +# @eval begin +# function $jname(jobz::Char, +# uplo::Char, +# A::StridedCuMatrix{$elty}) +# chkuplo(uplo) +# n = checksquare(A) +# lda = max(1, stride(A, 2)) +# W = CuArray{$relty}(undef, n) +# dh = dense_handle() + +# function bufferSize() +# out = Ref{Cint}(0) +# $bname(dh, jobz, uplo, n, A, lda, W, out) +# return out[] * sizeof($elty) +# end + +# with_workspace(dh.workspace_gpu, bufferSize) do buffer +# return $fname(dh, jobz, uplo, n, A, lda, W, +# buffer, sizeof(buffer) ÷ sizeof($elty), dh.info) +# end + +# info = @allowscalar dh.info[1] +# chkargsok(BlasInt(info)) + +# if jobz == 'N' +# return W +# elseif jobz == 'V' +# return W, A +# end +# end +# end +# end + +end \ No newline at end of file diff --git a/src/MatrixAlgebraKit.jl b/src/MatrixAlgebraKit.jl index 567c1510..5f872bb5 100644 --- a/src/MatrixAlgebraKit.jl +++ b/src/MatrixAlgebraKit.jl @@ -2,12 +2,12 @@ module MatrixAlgebraKit using LinearAlgebra: LinearAlgebra using LinearAlgebra: norm # TODO: eleminate if we use VectorInterface.jl? -using LinearAlgebra: mul!, rmul!, lmul! +using LinearAlgebra: mul!, rmul!, lmul!, adjoint!, rdiv!, ldiv! using LinearAlgebra: sylvester using LinearAlgebra: isposdef, ishermitian using LinearAlgebra: Diagonal, diag, diagind using LinearAlgebra: UpperTriangular, LowerTriangular -using LinearAlgebra: BlasFloat, BlasReal, BlasComplex, BlasInt, triu!, tril!, rdiv!, ldiv! +using LinearAlgebra: BlasFloat, BlasReal, BlasComplex, BlasInt export isisometry, isunitary @@ -29,7 +29,9 @@ export left_orth!, right_orth!, left_null!, right_null! export LAPACK_HouseholderQR, LAPACK_HouseholderLQ, LAPACK_Simple, LAPACK_Expert, LAPACK_QRIteration, LAPACK_Bisection, LAPACK_MultipleRelativelyRobustRepresentations, - LAPACK_DivideAndConquer, LAPACK_Jacobi + LAPACK_DivideAndConquer, LAPACK_Jacobi, + LQViaTransposedQR, + CUSOLVER_HouseholderQR, CUSOLVER_QRIteration, CUSOLVER_SVDPolar, CUSOLVER_Jacobi export truncrank, trunctol, truncabove, TruncationKeepSorted, TruncationKeepFiltered VERSION >= v"1.11.0-DEV.469" && @@ -46,6 +48,7 @@ include("common/matrixproperties.jl") include("yalapack.jl") include("algorithms.jl") +include("interface/decompositions.jl") include("interface/qr.jl") include("interface/lq.jl") include("interface/svd.jl") @@ -55,7 +58,6 @@ include("interface/schur.jl") include("interface/polar.jl") include("interface/orthnull.jl") -include("implementations/decompositions.jl") include("implementations/truncation.jl") include("implementations/qr.jl") include("implementations/lq.jl") diff --git a/src/algorithms.jl b/src/algorithms.jl index af183103..7dbfe46e 100644 --- a/src/algorithms.jl +++ b/src/algorithms.jl @@ -146,7 +146,7 @@ macro algdef(name) return $name{typeof(kw)}(kw) end function Base.show(io::IO, alg::$name) - return _show_alg(io, alg) + return ($_show_alg)(io, alg) end Core.@__doc__ $name diff --git a/src/common/initialization.jl b/src/common/initialization.jl index 6a5cba78..b8bc5faf 100644 --- a/src/common/initialization.jl +++ b/src/common/initialization.jl @@ -1,11 +1,32 @@ # TODO: Consider using zerovector! if using VectorInterface.jl -function zero!(A::AbstractMatrix) +function zero!(A::AbstractArray) A .= zero(eltype(A)) return A end function one!(A::AbstractMatrix) length(A) > 0 || return A - copyto!(A, LinearAlgebra.I) + zero!(A) + diagview(A) .= one(eltype(A)) return A end + +function uppertriangular!(A::AbstractMatrix) + Base.require_one_based_indexing(A) + m, n = size(A) + for i in 1:n + r = (i + 1):m + zero!(view(A, r, i)) + end + return A +end + +function lowertriangular!(A::AbstractMatrix) + Base.require_one_based_indexing(A) + m, n = size(A) + for i in 2:n + r = 1:min(i - 1, m) + zero!(view(A, r, i)) + end + return A +end \ No newline at end of file diff --git a/src/implementations/eig.jl b/src/implementations/eig.jl index 6a48e195..6d0efcca 100644 --- a/src/implementations/eig.jl +++ b/src/implementations/eig.jl @@ -30,14 +30,14 @@ end # Outputs # ------- -function initialize_output(::typeof(eig_full!), A::AbstractMatrix, ::LAPACK_EigAlgorithm) +function initialize_output(::typeof(eig_full!), A::AbstractMatrix, ::AbstractAlgorithm) n = size(A, 1) # square check will happen later Tc = complex(eltype(A)) D = Diagonal(similar(A, Tc, n)) V = similar(A, Tc, (n, n)) return (D, V) end -function initialize_output(::typeof(eig_vals!), A::AbstractMatrix, ::LAPACK_EigAlgorithm) +function initialize_output(::typeof(eig_vals!), A::AbstractMatrix, ::AbstractAlgorithm) n = size(A, 1) # square check will happen later Tc = complex(eltype(A)) D = similar(A, Tc, n) diff --git a/src/implementations/eigh.jl b/src/implementations/eigh.jl index a1c6c779..5178748e 100644 --- a/src/implementations/eigh.jl +++ b/src/implementations/eigh.jl @@ -29,13 +29,13 @@ end # Outputs # ------- -function initialize_output(::typeof(eigh_full!), A::AbstractMatrix, ::LAPACK_EighAlgorithm) +function initialize_output(::typeof(eigh_full!), A::AbstractMatrix, ::AbstractAlgorithm) n = size(A, 1) # square check will happen later D = Diagonal(similar(A, real(eltype(A)), n)) V = similar(A, (n, n)) return (D, V) end -function initialize_output(::typeof(eigh_vals!), A::AbstractMatrix, ::LAPACK_EighAlgorithm) +function initialize_output(::typeof(eigh_vals!), A::AbstractMatrix, ::AbstractAlgorithm) n = size(A, 1) # square check will happen later D = similar(A, real(eltype(A)), n) return D diff --git a/src/implementations/lq.jl b/src/implementations/lq.jl index 165c63b5..6c81ca12 100644 --- a/src/implementations/lq.jl +++ b/src/implementations/lq.jl @@ -42,20 +42,20 @@ end # Outputs # ------- -function initialize_output(::typeof(lq_full!), A::AbstractMatrix, ::LAPACK_HouseholderLQ) +function initialize_output(::typeof(lq_full!), A::AbstractMatrix, ::AbstractAlgorithm) m, n = size(A) L = similar(A, (m, n)) Q = similar(A, (n, n)) return (L, Q) end -function initialize_output(::typeof(lq_compact!), A::AbstractMatrix, ::LAPACK_HouseholderLQ) +function initialize_output(::typeof(lq_compact!), A::AbstractMatrix, ::AbstractAlgorithm) m, n = size(A) minmn = min(m, n) L = similar(A, (m, minmn)) Q = similar(A, (minmn, n)) return (L, Q) end -function initialize_output(::typeof(lq_null!), A::AbstractMatrix, ::LAPACK_HouseholderLQ) +function initialize_output(::typeof(lq_null!), A::AbstractMatrix, ::AbstractAlgorithm) m, n = size(A) minmn = min(m, n) Nᴴ = similar(A, (n - minmn, n)) @@ -71,17 +71,34 @@ function lq_full!(A::AbstractMatrix, LQ, alg::LAPACK_HouseholderLQ) _lapack_lq!(A, L, Q; alg.kwargs...) return L, Q end +function lq_full!(A::AbstractMatrix, LQ, alg::LQViaTransposedQR) + check_input(lq_full!, A, LQ) + L, Q = LQ + lq_via_qr!(A, L, Q, alg.qr_alg) + return L, Q +end function lq_compact!(A::AbstractMatrix, LQ, alg::LAPACK_HouseholderLQ) check_input(lq_compact!, A, LQ) L, Q = LQ _lapack_lq!(A, L, Q; alg.kwargs...) return L, Q end +function lq_compact!(A::AbstractMatrix, LQ, alg::LQViaTransposedQR) + check_input(lq_compact!, A, LQ) + L, Q = LQ + lq_via_qr!(A, L, Q, alg.qr_alg) + return L, Q +end function lq_null!(A::AbstractMatrix, Nᴴ, alg::LAPACK_HouseholderLQ) check_input(lq_null!, A, Nᴴ) _lapack_lq_null!(A, Nᴴ; alg.kwargs...) return Nᴴ end +function lq_null!(A::AbstractMatrix, Nᴴ, alg::LQViaTransposedQR) + check_input(lq_null!, A, Nᴴ) + lq_null_via_qr!(A, Nᴴ, alg.qr_alg) + return Nᴴ +end function _lapack_lq!(A::AbstractMatrix, L::AbstractMatrix, Q::AbstractMatrix; positive=false, @@ -126,7 +143,7 @@ function _lapack_lq!(A::AbstractMatrix, L::AbstractMatrix, Q::AbstractMatrix; end if computeL - L̃ = tril!(view(A, axes(L)...)) + L̃ = lowertriangular!(view(A, axes(L)...)) if positive @inbounds for j in 1:minmn s = conj(sign_safe(L̃[j, j])) @@ -158,3 +175,31 @@ function _lapack_lq_null!(A::AbstractMatrix, Nᴴ::AbstractMatrix; end return Nᴴ end + +# LQ via transposition and QR +function lq_via_qr!(A::AbstractMatrix, L::AbstractMatrix, Q::AbstractMatrix, + qr_alg::AbstractAlgorithm) + m, n = size(A) + minmn = min(m, n) + At = adjoint!(similar(A'), A)::AbstractMatrix + Qt = (A === Q) ? At : similar(Q') + Lt = similar(L') + if size(Q) == (n, n) + Qt, Lt = qr_full!(At, (Qt, Lt), qr_alg) + else + Qt, Lt = qr_compact!(At, (Qt, Lt), qr_alg) + end + adjoint!(Q, Qt) + !isempty(L) && adjoint!(L, Lt) + return L, Q +end + +function lq_null_via_qr!(A::AbstractMatrix, N::AbstractMatrix, qr_alg::AbstractAlgorithm) + m, n = size(A) + minmn = min(m, n) + At = adjoint!(similar(A'), A)::AbstractMatrix + Nt = similar(N') + Nt = qr_null!(At, Nt, qr_alg) + !isempty(N) && adjoint!(N, Nt) + return N +end diff --git a/src/implementations/polar.jl b/src/implementations/polar.jl index 2604aab5..fabb261c 100644 --- a/src/implementations/polar.jl +++ b/src/implementations/polar.jl @@ -30,13 +30,13 @@ end # Outputs # ------- -function initialize_output(::typeof(left_polar!), A::AbstractMatrix, ::PolarViaSVD) +function initialize_output(::typeof(left_polar!), A::AbstractMatrix, ::AbstractAlgorithm) m, n = size(A) W = similar(A) P = similar(A, (n, n)) return (W, P) end -function initialize_output(::typeof(right_polar!), A::AbstractMatrix, ::PolarViaSVD) +function initialize_output(::typeof(right_polar!), A::AbstractMatrix, ::AbstractAlgorithm) m, n = size(A) P = similar(A, (m, m)) Wᴴ = similar(A) diff --git a/src/implementations/qr.jl b/src/implementations/qr.jl index 7e2e13ea..1d30b4b1 100644 --- a/src/implementations/qr.jl +++ b/src/implementations/qr.jl @@ -42,20 +42,20 @@ end # Outputs # ------- -function initialize_output(::typeof(qr_full!), A::AbstractMatrix, ::LAPACK_HouseholderQR) +function initialize_output(::typeof(qr_full!), A::AbstractMatrix, ::AbstractAlgorithm) m, n = size(A) Q = similar(A, (m, m)) R = similar(A, (m, n)) return (Q, R) end -function initialize_output(::typeof(qr_compact!), A::AbstractMatrix, ::LAPACK_HouseholderQR) +function initialize_output(::typeof(qr_compact!), A::AbstractMatrix, ::AbstractAlgorithm) m, n = size(A) minmn = min(m, n) Q = similar(A, (m, minmn)) R = similar(A, (minmn, n)) return (Q, R) end -function initialize_output(::typeof(qr_null!), A::AbstractMatrix, ::LAPACK_HouseholderQR) +function initialize_output(::typeof(qr_null!), A::AbstractMatrix, ::AbstractAlgorithm) m, n = size(A) minmn = min(m, n) N = similar(A, (m, m - minmn)) @@ -130,7 +130,7 @@ function _lapack_qr!(A::AbstractMatrix, Q::AbstractMatrix, R::AbstractMatrix; end if computeR - R̃ = triu!(view(A, axes(R)...)) + R̃ = uppertriangular!(view(A, axes(R)...)) if positive @inbounds for j in n:-1:1 @simd for i in 1:min(minmn, j) diff --git a/src/implementations/schur.jl b/src/implementations/schur.jl index 55a1bdfa..541ae97f 100644 --- a/src/implementations/schur.jl +++ b/src/implementations/schur.jl @@ -28,13 +28,13 @@ end # Outputs # ------- -function initialize_output(::typeof(schur_full!), A::AbstractMatrix, ::LAPACK_EigAlgorithm) +function initialize_output(::typeof(schur_full!), A::AbstractMatrix, ::AbstractAlgorithm) n = size(A, 1) # square check will happen later Z = similar(A, (n, n)) vals = similar(A, complex(eltype(A)), n) return (A, Z, vals) end -function initialize_output(::typeof(schur_vals!), A::AbstractMatrix, ::LAPACK_EigAlgorithm) +function initialize_output(::typeof(schur_vals!), A::AbstractMatrix, ::AbstractAlgorithm) n = size(A, 1) # square check will happen later vals = similar(A, complex(eltype(A)), n) return vals diff --git a/src/implementations/svd.jl b/src/implementations/svd.jl index a7d4c9e5..031494f8 100644 --- a/src/implementations/svd.jl +++ b/src/implementations/svd.jl @@ -44,14 +44,14 @@ end # Outputs # ------- -function initialize_output(::typeof(svd_full!), A::AbstractMatrix, ::LAPACK_SVDAlgorithm) +function initialize_output(::typeof(svd_full!), A::AbstractMatrix, ::AbstractAlgorithm) m, n = size(A) U = similar(A, (m, m)) S = similar(A, real(eltype(A)), (m, n)) # TODO: Rectangular diagonal type? Vᴴ = similar(A, (n, n)) return (U, S, Vᴴ) end -function initialize_output(::typeof(svd_compact!), A::AbstractMatrix, ::LAPACK_SVDAlgorithm) +function initialize_output(::typeof(svd_compact!), A::AbstractMatrix, ::AbstractAlgorithm) m, n = size(A) minmn = min(m, n) U = similar(A, (m, minmn)) @@ -59,7 +59,7 @@ function initialize_output(::typeof(svd_compact!), A::AbstractMatrix, ::LAPACK_S Vᴴ = similar(A, (minmn, n)) return (U, S, Vᴴ) end -function initialize_output(::typeof(svd_vals!), A::AbstractMatrix, ::LAPACK_SVDAlgorithm) +function initialize_output(::typeof(svd_vals!), A::AbstractMatrix, ::AbstractAlgorithm) return similar(A, real(eltype(A)), (min(size(A)...),)) end function initialize_output(::typeof(svd_trunc!), A::AbstractMatrix, alg::TruncatedAlgorithm) diff --git a/src/implementations/decompositions.jl b/src/interface/decompositions.jl similarity index 74% rename from src/implementations/decompositions.jl rename to src/interface/decompositions.jl index d886588a..bff490a9 100644 --- a/src/implementations/decompositions.jl +++ b/src/interface/decompositions.jl @@ -1,8 +1,8 @@ # TODO: module Decompositions? -# ========== -# ALGORITHMS -# ========== +# ================= +# LAPACK ALGORITHMS +# ================= # reference for naming LAPACK algorithms: # https://www.netlib.org/lapack/explore-html/topics.html @@ -112,3 +112,41 @@ const LAPACK_SVDAlgorithm = Union{LAPACK_QRIteration, LAPACK_Bisection, LAPACK_DivideAndConquer, LAPACK_Jacobi} + +# ========================= +# CUSOLVER ALGORITHMS +# ========================= +""" + CUSOLVER_HouseholderQR(; positive = false) + +Algorithm type to denote the standard CUSOLVER algorithm for computing the QR decomposition of +a matrix using Householder reflectors. The keyword `positive=true` can be used to ensure that +the diagonal elements of `R` are non-negative. +""" +@algdef CUSOLVER_HouseholderQR + +""" + CUSOLVER_QRIteration() + +Algorithm type to denote the CUSOLVER driver for computing the eigenvalue decomposition of a +Hermitian matrix, or the singular value decomposition of a general matrix using the +QR Iteration algorithm. +""" +@algdef CUSOLVER_QRIteration + +""" + CUSOLVER_SVDPolar() + +Algorithm type to denote the CUSOLVER driver for computing the singular value decomposition of +a general matrix by using Halley's iterative algorithm to compute the polar decompositon, +followed by the hermitian eigenvalue decomposition of the positive definite factor. +""" +@algdef CUSOLVER_SVDPolar + +""" + CUSOLVER_Jacobi() + +Algorithm type to denote the CUSOLVER driver for computing the singular value decomposition of +a general matrix using the Jacobi algorithm. +""" +@algdef CUSOLVER_Jacobi diff --git a/src/interface/lq.jl b/src/interface/lq.jl index 6f1ed12f..338302ef 100644 --- a/src/interface/lq.jl +++ b/src/interface/lq.jl @@ -81,3 +81,13 @@ for f in (:lq_full!, :lq_compact!, :lq_null!) return default_lq_algorithm(A; kwargs...) end end + +# Alternative algorithm (necessary for CUDA) +struct LQViaTransposedQR{A<:AbstractAlgorithm} <: AbstractAlgorithm + qr_alg::A +end +function Base.show(io::IO, alg::LQViaTransposedQR) + print(io, "LQViaTransposedQR(") + _show_alg(io, alg.qr_alg) + return print(io, ")") +end diff --git a/test/cuda/lq.jl b/test/cuda/lq.jl new file mode 100644 index 00000000..cc1def1c --- /dev/null +++ b/test/cuda/lq.jl @@ -0,0 +1,117 @@ +using MatrixAlgebraKit +using MatrixAlgebraKit: diagview +using Test +using TestExtras +using StableRNGs +using CUDA + +include("utilities.jl") + +@testset "lq_compact! for T = $T" for T in (Float32, Float64, ComplexF32, ComplexF64) + rng = StableRNG(123) + m = 54 + for n in (37, m, 63) + minmn = min(m, n) + A = CuArray(randn(rng, T, m, n)) + L, Q = @constinferred lq_compact(A) + @test L isa CuMatrix{T} && size(L) == (m, minmn) + @test Q isa CuMatrix{T} && size(Q) == (minmn, n) + @test L * Q ≈ A + @test isapproxone(Q * Q') + Nᴴ = @constinferred lq_null(A) + @test Nᴴ isa CuMatrix{T} && size(Nᴴ) == (n - minmn, n) + @test maximum(abs, A * Nᴴ') < eps(real(T))^(2 / 3) + @test isapproxone(Nᴴ * Nᴴ') + + Ac = similar(A) + L2, Q2 = @constinferred lq_compact!(copy!(Ac, A), (L, Q)) + @test L2 === L + @test Q2 === Q + Nᴴ2 = @constinferred lq_null!(copy!(Ac, A), Nᴴ) + @test Nᴴ2 === Nᴴ + + # noL + noL = similar(A, 0, minmn) + Q2 = similar(Q) + lq_compact!(copy!(Ac, A), (noL, Q2)) + @test Q == Q2 + + # positive + lq_compact!(copy!(Ac, A), (L, Q); positive=true) + @test L * Q ≈ A + @test isapproxone(Q * Q') + @test all(>=(zero(real(T))), real(diagview(L))) + lq_compact!(copy!(Ac, A), (noL, Q2); positive=true) + @test Q == Q2 + + # explicit blocksize + lq_compact!(copy!(Ac, A), (L, Q); blocksize=1) + @test L * Q ≈ A + @test isapproxone(Q * Q') + lq_compact!(copy!(Ac, A), (noL, Q2); blocksize=1) + @test Q == Q2 + lq_null!(copy!(Ac, A), Nᴴ; blocksize=1) + @test maximum(abs, A * Nᴴ') < eps(real(T))^(2 / 3) + @test isapproxone(Nᴴ * Nᴴ') + if m <= n + lq_compact!(copy!(Q2, A), (noL, Q2); blocksize=1) # in-place Q + @test Q ≈ Q2 + # these do not work because of the in-place Q + @test_throws ArgumentError lq_compact!(copy!(Q2, A), (L, Q2); blocksize=1) + @test_throws ArgumentError lq_compact!(copy!(Q2, A), (noL, Q2); positive=true) + end + # no blocked CUDA + @test_throws ArgumentError lq_compact!(copy!(Q2, A), (L, Q2); blocksize=8) + end +end + +@testset "lq_full! for T = $T" for T in (Float32, Float64, ComplexF32, ComplexF64) + rng = StableRNG(123) + m = 54 + for n in (37, m, 63) + minmn = min(m, n) + A = CuArray(randn(rng, T, m, n)) + L, Q = lq_full(A) + @test L isa CuMatrix{T} && size(L) == (m, n) + @test Q isa CuMatrix{T} && size(Q) == (n, n) + @test L * Q ≈ A + @test isapproxone(Q * Q') + + Ac = similar(A) + L2, Q2 = @constinferred lq_full!(copy!(Ac, A), (L, Q)) + @test L2 === L + @test Q2 === Q + @test L * Q ≈ A + @test isapproxone(Q * Q') + + # noL + noL = similar(A, 0, n) + Q2 = similar(Q) + lq_full!(copy!(Ac, A), (noL, Q2)) + @test Q == Q2 + + # positive + lq_full!(copy!(Ac, A), (L, Q); positive=true) + @test L * Q ≈ A + @test isapproxone(Q * Q') + @test all(>=(zero(real(T))), real(diagview(L))) + lq_full!(copy!(Ac, A), (noL, Q2); positive=true) + @test Q == Q2 + + # explicit blocksize + lq_full!(copy!(Ac, A), (L, Q); blocksize=1) + @test L * Q ≈ A + @test isapproxone(Q * Q') + lq_full!(copy!(Ac, A), (noL, Q2); blocksize=1) + @test Q == Q2 + if n == m + lq_full!(copy!(Q2, A), (noL, Q2); blocksize=1) # in-place Q + @test Q ≈ Q2 + # these do not work because of the in-place Q + @test_throws ArgumentError lq_full!(copy!(Q2, A), (L, Q2); blocksize=1) + @test_throws ArgumentError lq_full!(copy!(Q2, A), (noL, Q2); positive=true) + end + # no blocked CUDA + @test_throws ArgumentError lq_full!(copy!(Ac, A), (L, Q); blocksize=8) + end +end diff --git a/test/cuda/qr.jl b/test/cuda/qr.jl new file mode 100644 index 00000000..4ada4565 --- /dev/null +++ b/test/cuda/qr.jl @@ -0,0 +1,122 @@ +using MatrixAlgebraKit +using MatrixAlgebraKit: diagview +using Test +using TestExtras +using StableRNGs +using CUDA + +include("utilities.jl") + +@testset "qr_compact! and qr_null! for T = $T" for T in (Float32, Float64, ComplexF32, + ComplexF64) + rng = StableRNG(123) + m = 54 + for n in (37, m, 63) + minmn = min(m, n) + A = CuArray(randn(rng, T, m, n)) + Q, R = @constinferred qr_compact(A) + @test Q isa CuMatrix{T} && size(Q) == (m, minmn) + @test R isa CuMatrix{T} && size(R) == (minmn, n) + @test Q * R ≈ A + N = @constinferred qr_null(A) + @test N isa CuMatrix{T} && size(N) == (m, m - minmn) + @test isapproxone(Q' * Q) + @test maximum(abs, A' * N) < eps(real(T))^(2 / 3) + @test isapproxone(N' * N) + + Ac = similar(A) + Q2, R2 = @constinferred qr_compact!(copy!(Ac, A), (Q, R)) + @test Q2 === Q + @test R2 === R + N2 = @constinferred qr_null!(copy!(Ac, A), N) + @test N2 === N + + # noR + Q2 = similar(Q) + noR = similar(A, minmn, 0) + qr_compact!(copy!(Ac, A), (Q2, noR)) + @test Q == Q2 + + # positive + qr_compact!(copy!(Ac, A), (Q, R); positive=true) + @test Q * R ≈ A + @test isapproxone(Q' * Q) + @test all(>=(zero(real(T))), real(diagview(R))) + qr_compact!(copy!(Ac, A), (Q2, noR); positive=true) + @test Q == Q2 + + # explicit blocksize + qr_compact!(copy!(Ac, A), (Q, R); blocksize=1) + @test Q * R ≈ A + @test isapproxone(Q' * Q) + qr_compact!(copy!(Ac, A), (Q2, noR); blocksize=1) + @test Q == Q2 + qr_compact!(copy!(Ac, A), (Q2, noR); blocksize=1) + qr_null!(copy!(Ac, A), N; blocksize=1) + @test maximum(abs, A' * N) < eps(real(T))^(2 / 3) + @test isapproxone(N' * N) + if n <= m + qr_compact!(copy!(Q2, A), (Q2, noR); blocksize=1) # in-place Q + @test Q ≈ Q2 + # these do not work because of the in-place Q + @test_throws ArgumentError qr_compact!(copy!(Q2, A), (Q2, R2)) + @test_throws ArgumentError qr_compact!(copy!(Q2, A), (Q2, noR); positive=true) + end + # no blocked CUDA + @test_throws ArgumentError qr_compact!(copy!(Ac, A), (Q2, R); blocksize=8) + end +end + +@testset "qr_full! for T = $T" for T in (Float32, Float64, ComplexF32, ComplexF64) + rng = StableRNG(123) + m = 63 + for n in (37, m, 63) + minmn = min(m, n) + A = CuArray(randn(rng, T, m, n)) + Q, R = qr_full(A) + @test Q isa CuMatrix{T} && size(Q) == (m, m) + @test R isa CuMatrix{T} && size(R) == (m, n) + @test Q * R ≈ A + @test isapproxone(Q' * Q) + + Ac = similar(A) + Q2 = similar(Q) + noR = similar(A, m, 0) + Q2, R2 = @constinferred qr_full!(copy!(Ac, A), (Q, R)) + @test Q2 === Q + @test R2 === R + @test Q * R ≈ A + @test isapproxone(Q' * Q) + qr_full!(copy!(Ac, A), (Q2, noR)) + @test Q == Q2 + + # noR + noR = similar(A, m, 0) + Q2 = similar(Q) + qr_full!(copy!(Ac, A), (Q2, noR)) + @test Q == Q2 + + # positive + qr_full!(copy!(Ac, A), (Q, R); positive=true) + @test Q * R ≈ A + @test isapproxone(Q' * Q) + @test all(>=(zero(real(T))), real(diagview(R))) + qr_full!(copy!(Ac, A), (Q2, noR); positive=true) + @test Q == Q2 + + # explicit blocksize + qr_full!(copy!(Ac, A), (Q, R); blocksize=1) + @test Q * R ≈ A + @test isapproxone(Q' * Q) + qr_full!(copy!(Ac, A), (Q2, noR); blocksize=1) + @test Q == Q2 + if n == m + qr_full!(copy!(Q2, A), (Q2, noR); blocksize=1) # in-place Q + @test Q ≈ Q2 + @test_throws ArgumentError qr_full!(copy!(Q2, A), (Q2, R2)) + @test_throws ArgumentError qr_full!(copy!(Q2, A), (Q2, noR); positive=true) + end + # no blocked CUDA + @test_throws ArgumentError qr_full!(copy!(Ac, A), (Q, R); blocksize=8) + end +end diff --git a/test/cuda/svd.jl b/test/cuda/svd.jl new file mode 100644 index 00000000..66098c4b --- /dev/null +++ b/test/cuda/svd.jl @@ -0,0 +1,120 @@ +using MatrixAlgebraKit +using MatrixAlgebraKit: diagview +using LinearAlgebra: Diagonal, isposdef +using Test +using TestExtras +using StableRNGs +using CUDA + +include("utilities.jl") + +@testset "svd_compact! for T = $T" for T in (Float32, Float64, ComplexF32, ComplexF64) + rng = StableRNG(123) + m = 54 + @testset "size ($m, $n)" for n in (37, m, 63) + k = min(m, n) + algs = (CUSOLVER_QRIteration(), CUSOLVER_SVDPolar(), CUSOLVER_Jacobi()) + @testset "algorithm $alg" for alg in algs + n > m && alg isa CUSOLVER_QRIteration && continue # not supported + minmn = min(m, n) + A = CuArray(randn(rng, T, m, n)) + + U, S, Vᴴ = svd_compact(A; alg) + @test U isa CuMatrix{T} && size(U) == (m, minmn) + @test S isa Diagonal{real(T),<:CuVector} && size(S) == (minmn, minmn) + @test Vᴴ isa CuMatrix{T} && size(Vᴴ) == (minmn, n) + @test U * S * Vᴴ ≈ A + @test isapproxone(U' * U) + @test isapproxone(Vᴴ * Vᴴ') + @test isposdef(S) + + Ac = similar(A) + U2, S2, V2ᴴ = @constinferred svd_compact!(copy!(Ac, A), (U, S, Vᴴ), alg) + @test U2 === U + @test S2 === S + @test V2ᴴ === Vᴴ + @test U * S * Vᴴ ≈ A + @test isapproxone(U' * U) + @test isapproxone(Vᴴ * Vᴴ') + @test isposdef(S) + + Sd = svd_vals(A, alg) + @test CuArray(diagview(S)) ≈ Sd + # CuArray is necessary because norm of CuArray view with non-unit step is broken + end + end +end + +@testset "svd_full! for T = $T" for T in (Float32, Float64, ComplexF32, ComplexF64) + rng = StableRNG(123) + m = 54 + @testset "size ($m, $n)" for n in (37, m, 63) + algs = (CUSOLVER_QRIteration(), CUSOLVER_SVDPolar(), CUSOLVER_Jacobi()) + @testset "algorithm $alg" for alg in algs + n > m && alg isa CUSOLVER_QRIteration && continue # not supported + A = CuArray(randn(rng, T, m, n)) + U, S, Vᴴ = svd_full(A; alg) + @test U isa CuMatrix{T} && size(U) == (m, m) + @test S isa CuMatrix{real(T)} && size(S) == (m, n) + @test Vᴴ isa CuMatrix{T} && size(Vᴴ) == (n, n) + @test U * S * Vᴴ ≈ A + @test isapproxone(U' * U) + @test isapproxone(U * U') + @test isapproxone(Vᴴ * Vᴴ') + @test isapproxone(Vᴴ' * Vᴴ) + @test all(isposdef, diagview(S)) + + Ac = similar(A) + U2, S2, V2ᴴ = @constinferred svd_full!(copy!(Ac, A), (U, S, Vᴴ), alg) + @test U2 === U + @test S2 === S + @test V2ᴴ === Vᴴ + @test U * S * Vᴴ ≈ A + @test isapproxone(U' * U) + @test isapproxone(U * U') + @test isapproxone(Vᴴ * Vᴴ') + @test isapproxone(Vᴴ' * Vᴴ) + @test all(isposdef, diagview(S)) + + Sc = similar(A, real(T), min(m, n)) + Sc2 = svd_vals!(copy!(Ac, A), Sc, alg) + @test Sc === Sc2 + @test CuArray(diagview(S)) ≈ Sc + # CuArray is necessary because norm of CuArray view with non-unit step is broken + end + end +end + +# @testset "svd_trunc! for T = $T" for T in (Float32, Float64, ComplexF32, ComplexF64) +# rng = StableRNG(123) +# m = 54 +# if LinearAlgebra.LAPACK.version() < v"3.12.0" +# algs = (LAPACK_DivideAndConquer(), LAPACK_QRIteration(), LAPACK_Bisection()) +# else +# algs = (LAPACK_DivideAndConquer(), LAPACK_QRIteration(), LAPACK_Bisection(), +# LAPACK_Jacobi()) +# end + +# @testset "size ($m, $n)" for n in (37, m, 63) +# @testset "algorithm $alg" for alg in algs +# n > m && alg isa LAPACK_Jacobi && continue # not supported +# A = randn(rng, T, m, n) +# S₀ = svd_vals(A) +# minmn = min(m, n) +# r = minmn - 2 + +# U1, S1, V1ᴴ = @constinferred svd_trunc(A; alg, trunc=truncrank(r)) +# @test length(S1.diag) == r +# @test LinearAlgebra.opnorm(A - U1 * S1 * V1ᴴ) ≈ S₀[r + 1] + +# s = 1 + sqrt(eps(real(T))) +# trunc2 = trunctol(s * S₀[r + 1]) + +# U2, S2, V2ᴴ = @constinferred svd_trunc(A; alg, trunc=trunctol(s * S₀[r + 1])) +# @test length(S2.diag) == r +# @test U1 ≈ U2 +# @test S1 ≈ S2 +# @test V1ᴴ ≈ V2ᴴ +# end +# end +# end diff --git a/test/cuda/utilities.jl b/test/cuda/utilities.jl new file mode 100644 index 00000000..61518b55 --- /dev/null +++ b/test/cuda/utilities.jl @@ -0,0 +1,3 @@ +function isapproxone(A) + return (size(A, 1) == size(A, 2)) && (A ≈ MatrixAlgebraKit.one!(similar(A))) +end diff --git a/test/lq.jl b/test/lq.jl index f12ea99a..581f1782 100644 --- a/test/lq.jl +++ b/test/lq.jl @@ -3,6 +3,7 @@ using Test using TestExtras using StableRNGs using LinearAlgebra: diag, I +using MatrixAlgebraKit: LQViaTransposedQR, LAPACK_HouseholderQR @testset "lq_compact! for T = $T" for T in (Float32, Float64, ComplexF32, ComplexF64) rng = StableRNG(123) @@ -32,6 +33,19 @@ using LinearAlgebra: diag, I lq_compact!(copy!(Ac, A), (noL, Q2)) @test Q == Q2 + # Transposed QR algorithm + qr_alg = LAPACK_HouseholderQR() + lq_alg = LQViaTransposedQR(qr_alg) + L2, Q2 = @constinferred lq_compact!(copy!(Ac, A), (L, Q), lq_alg) + @test L2 === L + @test Q2 === Q + Nᴴ2 = @constinferred lq_null!(copy!(Ac, A), Nᴴ, lq_alg) + @test Nᴴ2 === Nᴴ + noL = similar(A, 0, minmn) + Q2 = similar(Q) + lq_compact!(copy!(Ac, A), (noL, Q2), lq_alg) + @test Q == Q2 + # unblocked algorithm lq_compact!(copy!(Ac, A), (L, Q); blocksize=1) @test L * Q ≈ A @@ -56,8 +70,22 @@ using LinearAlgebra: diag, I lq_null!(copy!(Ac, A), Nᴴ; blocksize=8) @test maximum(abs, A * Nᴴ') < eps(real(T))^(2 / 3) @test isisometry(Nᴴ; side=:right) + @test Nᴴ * Nᴴ' ≈ I + + qr_alg = LAPACK_HouseholderQR(; blocksize=1) + lq_alg = LQViaTransposedQR(qr_alg) + lq_compact!(copy!(Ac, A), (L, Q), lq_alg) + @test L * Q ≈ A + @test Q * Q' ≈ I + lq_compact!(copy!(Ac, A), (noL, Q2), lq_alg) + @test Q == Q2 + lq_null!(copy!(Ac, A), Nᴴ, lq_alg) + @test maximum(abs, A * Nᴴ') < eps(real(T))^(2 / 3) + @test Nᴴ * Nᴴ' ≈ I + # pivoted @test_throws ArgumentError lq_compact!(copy!(Ac, A), (L, Q); pivoted=true) + # positive lq_compact!(copy!(Ac, A), (L, Q); positive=true) @test L * Q ≈ A @@ -65,6 +93,7 @@ using LinearAlgebra: diag, I @test all(>=(zero(real(T))), real(diag(L))) lq_compact!(copy!(Ac, A), (noL, Q2); positive=true) @test Q == Q2 + # positive and blocksize 1 lq_compact!(copy!(Ac, A), (L, Q); positive=true, blocksize=1) @test L * Q ≈ A @@ -72,6 +101,14 @@ using LinearAlgebra: diag, I @test all(>=(zero(real(T))), real(diag(L))) lq_compact!(copy!(Ac, A), (noL, Q2); positive=true, blocksize=1) @test Q == Q2 + qr_alg = LAPACK_HouseholderQR(; positive=true, blocksize=1) + lq_alg = LQViaTransposedQR(qr_alg) + lq_compact!(copy!(Ac, A), (L, Q), lq_alg) + @test L * Q ≈ A + @test Q * Q' ≈ I + @test all(>=(zero(real(T))), real(diag(L))) + lq_compact!(copy!(Ac, A), (noL, Q2), lq_alg) + @test Q == Q2 end end @@ -99,6 +136,19 @@ end lq_full!(copy!(Ac, A), (noL, Q2)) @test Q == Q2 + # Transposed QR algorithm + qr_alg = LAPACK_HouseholderQR() + lq_alg = LQViaTransposedQR(qr_alg) + L2, Q2 = @constinferred lq_full!(copy!(Ac, A), (L, Q), lq_alg) + @test L2 === L + @test Q2 === Q + @test L * Q ≈ A + @test Q * Q' ≈ I + noL = similar(A, 0, n) + Q2 = similar(Q) + lq_full!(copy!(Ac, A), (noL, Q2), lq_alg) + @test Q == Q2 + # unblocked algorithm lq_full!(copy!(Ac, A), (L, Q); blocksize=1) @test L * Q ≈ A @@ -109,7 +159,19 @@ end lq_full!(copy!(Q2, A), (noL, Q2); blocksize=1) # in-place Q @test Q ≈ Q2 end - # # other blocking + qr_alg = LAPACK_HouseholderQR(; blocksize=1) + lq_alg = LQViaTransposedQR(qr_alg) + lq_full!(copy!(Ac, A), (L, Q), lq_alg) + @test L * Q ≈ A + @test Q * Q' ≈ I + lq_full!(copy!(Ac, A), (noL, Q2), lq_alg) + @test Q == Q2 + if n == m + lq_full!(copy!(Q2, A), (noL, Q2), lq_alg) # in-place Q + @test Q ≈ Q2 + end + + # other blocking lq_full!(copy!(Ac, A), (L, Q); blocksize=18) @test L * Q ≈ A @test isunitary(Q) @@ -124,6 +186,16 @@ end @test all(>=(zero(real(T))), real(diag(L))) lq_full!(copy!(Ac, A), (noL, Q2); positive=true) @test Q == Q2 + + qr_alg = LAPACK_HouseholderQR(; positive=true) + lq_alg = LQViaTransposedQR(qr_alg) + lq_full!(copy!(Ac, A), (L, Q), lq_alg) + @test L * Q ≈ A + @test Q * Q' ≈ I + @test all(>=(zero(real(T))), real(diag(L))) + lq_full!(copy!(Ac, A), (noL, Q2), lq_alg) + @test Q == Q2 + # positive and blocksize 1 lq_full!(copy!(Ac, A), (L, Q); positive=true, blocksize=1) @test L * Q ≈ A diff --git a/test/runtests.jl b/test/runtests.jl index 5f0d2991..13f33f09 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,46 +1,64 @@ using SafeTestsets -@safetestset "Algorithms" begin - include("algorithms.jl") -end -@safetestset "Truncate" begin - include("truncate.jl") -end -@safetestset "QR / LQ Decomposition" begin - include("qr.jl") - include("lq.jl") -end -@safetestset "Singular Value Decomposition" begin - include("svd.jl") -end -@safetestset "Hermitian Eigenvalue Decomposition" begin - include("eigh.jl") -end -@safetestset "General Eigenvalue Decomposition" begin - include("eig.jl") -end -@safetestset "Schur Decomposition" begin - include("schur.jl") -end -@safetestset "Polar Decomposition" begin - include("polar.jl") -end -@safetestset "Image and Null Space" begin - include("orthnull.jl") -end -@safetestset "ChainRules" begin - include("chainrules.jl") +# don't run all tests on GPU, only the GPU +# specific ones +is_buildkite = get(ENV, "BUILDKITE", "false") == "true" +if !is_buildkite + @safetestset "Algorithms" begin + include("algorithms.jl") + end + @safetestset "Truncate" begin + include("truncate.jl") + end + @safetestset "QR / LQ Decomposition" begin + include("qr.jl") + include("lq.jl") + end + @safetestset "Singular Value Decomposition" begin + include("svd.jl") + end + @safetestset "Hermitian Eigenvalue Decomposition" begin + include("eigh.jl") + end + @safetestset "General Eigenvalue Decomposition" begin + include("eig.jl") + end + @safetestset "Schur Decomposition" begin + include("schur.jl") + end + @safetestset "Polar Decomposition" begin + include("polar.jl") + end + @safetestset "Image and Null Space" begin + include("orthnull.jl") + end + @safetestset "ChainRules" begin + include("chainrules.jl") + end + @safetestset "MatrixAlgebraKit.jl" begin + @safetestset "Code quality (Aqua.jl)" begin + using MatrixAlgebraKit + using Aqua + Aqua.test_all(MatrixAlgebraKit) + end + @safetestset "Code linting (JET.jl)" begin + using MatrixAlgebraKit + using JET + JET.test_package(MatrixAlgebraKit; target_defined_modules=true) + end + end end -@safetestset "MatrixAlgebraKit.jl" begin - @safetestset "Code quality (Aqua.jl)" begin - using MatrixAlgebraKit - using Aqua - Aqua.test_all(MatrixAlgebraKit) +using CUDA +if CUDA.functional() + @safetestset "CUDA QR" begin + include("cuda/qr.jl") end - @safetestset "Code linting (JET.jl)" begin - using MatrixAlgebraKit - using JET - JET.test_package(MatrixAlgebraKit; target_defined_modules=true) + @safetestset "CUDA LQ" begin + include("cuda/lq.jl") + end + @safetestset "CUDA SVD" begin + include("cuda/svd.jl") end end +