diff --git a/ext/MatrixAlgebraKitCUDAExt/MatrixAlgebraKitCUDAExt.jl b/ext/MatrixAlgebraKitCUDAExt/MatrixAlgebraKitCUDAExt.jl index d51560e4..c42e935b 100644 --- a/ext/MatrixAlgebraKitCUDAExt/MatrixAlgebraKitCUDAExt.jl +++ b/ext/MatrixAlgebraKitCUDAExt/MatrixAlgebraKitCUDAExt.jl @@ -5,8 +5,8 @@ using MatrixAlgebraKit: @algdef, Algorithm, check_input using MatrixAlgebraKit: one!, zero!, uppertriangular!, lowertriangular! using MatrixAlgebraKit: diagview, sign_safe using MatrixAlgebraKit: LQViaTransposedQR -using MatrixAlgebraKit: default_qr_algorithm, default_lq_algorithm, default_svd_algorithm -import MatrixAlgebraKit: _gpu_geqrf!, _gpu_ungqr!, _gpu_unmqr!, _gpu_gesvd!, _gpu_Xgesvdp!, _gpu_Xgesvdr!, _gpu_gesvdj! +using MatrixAlgebraKit: default_qr_algorithm, default_lq_algorithm, default_svd_algorithm, default_eig_algorithm +import MatrixAlgebraKit: _gpu_geqrf!, _gpu_ungqr!, _gpu_unmqr!, _gpu_gesvd!, _gpu_Xgesvdp!, _gpu_Xgesvdr!, _gpu_gesvdj!, _gpu_geev! using CUDA using LinearAlgebra using LinearAlgebra: BlasFloat @@ -23,8 +23,12 @@ end function MatrixAlgebraKit.default_svd_algorithm(::Type{T}; kwargs...) where {T<:StridedCuMatrix} return CUSOLVER_QRIteration(; kwargs...) end +function MatrixAlgebraKit.default_eig_algorithm(::Type{T}; kwargs...) where {T<:StridedCuMatrix} + return CUSOLVER_Simple(; kwargs...) +end +_gpu_geev!(A::StridedCuMatrix, D::StridedCuVector, V::StridedCuMatrix) = YACUSOLVER.Xgeev!(A, D, V) _gpu_geqrf!(A::StridedCuMatrix) = YACUSOLVER.geqrf!(A) _gpu_ungqr!(A::StridedCuMatrix, τ::StridedCuVector) = YACUSOLVER.ungqr!(A, τ) _gpu_unmqr!(side::AbstractChar, trans::AbstractChar, A::StridedCuMatrix, τ::StridedCuVector, C::StridedCuVecOrMat) = YACUSOLVER.unmqr!(side, trans, A, τ, C) diff --git a/ext/MatrixAlgebraKitCUDAExt/yacusolver.jl b/ext/MatrixAlgebraKitCUDAExt/yacusolver.jl index d1838536..427eedb1 100644 --- a/ext/MatrixAlgebraKitCUDAExt/yacusolver.jl +++ b/ext/MatrixAlgebraKitCUDAExt/yacusolver.jl @@ -5,7 +5,7 @@ using LinearAlgebra: BlasInt, BlasFloat, checksquare, chkstride1, require_one_ba using LinearAlgebra.LAPACK: chkargsok, chklapackerror, chktrans, chkside, chkdiag, chkuplo using CUDA -using CUDA: @allowscalar +using CUDA: @allowscalar, i32 using CUDA.CUSOLVER # QR methods are implemented with full access to allocated arrays, so we do not need to redo this: @@ -306,6 +306,73 @@ function Xgesvdr!(A::StridedCuMatrix{T}, return S, U, Vᴴ end +# Wrapper for general eigensolver +for (celty, elty) in ((:ComplexF32, :Float32), (:ComplexF64, :Float64), (:ComplexF32, :ComplexF32), (:ComplexF64, :ComplexF64)) + @eval begin + function Xgeev!(A::StridedCuMatrix{$elty}, D::StridedCuVector{$celty}, V::StridedCuMatrix{$celty}) + require_one_based_indexing(A, V, D) + chkstride1(A, V, D) + n = checksquare(A) + # TODO GPU appropriate version + #chkfinite(A) # balancing routines don't support NaNs and Infs + n == length(D) || throw(DimensionMismatch("length mismatch between A and D")) + if length(V) == 0 + jobvr = 'N' + elseif length(V) == n*n + jobvr = 'V' + else + throw(DimensionMismatch("size of VR must match size of A")) + end + jobvl = 'N' # required by API for now (https://docs.nvidia.com/cuda/cusolver/index.html#cusolverdnxgeev) + #=if length(VL) == 0 + jobvl = 'N' + elseif length(VL) == n*n + jobvl = 'V' + else + throw(DimensionMismatch("size of VL must match size of A")) + end=# + VL = similar(A, n, 0) + lda = max(1, stride(A, 2)) + ldvl = max(1, stride(VL, 2)) + params = CUSOLVER.CuSolverParameters() + dh = CUSOLVER.dense_handle() + + if $elty <: Real + D2 = reinterpret($elty, D) + # reuse memory, we will have to reorder afterwards to bring real and imaginary + # components in the order as required for the Complex type + VR = reinterpret($elty, V) + else + D2 = D + VR = V + end + ldvr = max(1, stride(VR, 2)) + function bufferSize() + out_cpu = Ref{Csize_t}(0) + out_gpu = Ref{Csize_t}(0) + CUSOLVER.cusolverDnXgeev_bufferSize(dh, params, jobvl, jobvr, n, $elty, A, + lda, $elty, D2, $elty, VL, ldvl, $elty, VR, ldvr, + $elty, out_gpu, out_cpu) + out_gpu[], out_cpu[] + end + CUDA.with_workspaces(dh.workspace_gpu, dh.workspace_cpu, bufferSize()...) do buffer_gpu, buffer_cpu + CUSOLVER.cusolverDnXgeev(dh, params, jobvl, jobvr, n, $elty, A, lda, $elty, + D2, $elty, VL, ldvl, $elty, VR, ldvr, $elty, buffer_gpu, + sizeof(buffer_gpu), buffer_cpu, sizeof(buffer_cpu), dh.info) + end + flag = @allowscalar dh.info[1] + CUSOLVER.chkargsok(BlasInt(flag)) + if eltype(A) <: Real + work = CuVector{$elty}(undef, n) + DR = view(D2, 1:n) + DI = view(D2, (n + 1):(2n)) + _reorder_realeigendecomposition!(D, DR, DI, work, VR, jobvr) + end + return D, V + end + end +end + # for (jname, bname, fname, elty, relty) in # ((:sygvd!, :cusolverDnSsygvd_bufferSize, :cusolverDnSsygvd, :Float32, :Float32), # (:sygvd!, :cusolverDnDsygvd_bufferSize, :cusolverDnDsygvd, :Float64, :Float64), @@ -650,4 +717,57 @@ end # end # end +# device code is unreachable by coverage right now +# COV_EXCL_START +# TODO use a shmem array here +function _reorder_kernel_real(real_ev_ixs, VR::CuDeviceArray{T}, n::Int) where {T} + grid_idx = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x + @inbounds if grid_idx <= length(real_ev_ixs) + i = real_ev_ixs[grid_idx] + for j in n:-1:1 + VR[2 * j, i] = zero(T) + VR[2 * j - 1, i] = VR[j, i] + end + end + return +end + +function _reorder_kernel_complex(complex_ev_ixs, VR::CuDeviceArray{T}, n::Int) where {T} + grid_idx = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x + @inbounds if grid_idx <= length(complex_ev_ixs) + i = complex_ev_ixs[grid_idx] + for j in n:-1:1 + VR[2 * j, i] = VR[j, i + 1] + VR[2 * j - 1, i] = VR[j, i] + VR[2 * j, i + 1] = -VR[j, i + 1] + VR[2 * j - 1, i + 1] = VR[j, i] + end + end + return +end +# COV_EXCL_STOP + +function _reorder_realeigendecomposition!(W, WR, WI, work, VR, jobvr) +# first reorder eigenvalues and recycle work as temporary buffer to efficiently implement the permutation + copy!(work, WI) + n = size(W, 1) + @. W[1:n] = WR[1:n] + im * work[1:n] + T = eltype(WR) + if jobvr == 'V' # also reorganise vectors + real_ev_ixs = findall(isreal, W) + _cmplx_ev_ixs = findall(!isreal, W) # these come in pairs, choose only the first of each pair + complex_ev_ixs = view(_cmplx_ev_ixs, 1:2:length(_cmplx_ev_ixs)) + if !isempty(real_ev_ixs) + real_threads = 128 + real_blocks = max(1, div(length(real_ev_ixs), real_threads)) + @cuda threads=real_threads blocks=real_blocks _reorder_kernel_real(real_ev_ixs, VR, n) + end + if !isempty(complex_ev_ixs) + complex_threads = 128 + complex_blocks = max(1, div(length(complex_ev_ixs), complex_threads)) + @cuda threads=complex_threads blocks=complex_blocks _reorder_kernel_complex(complex_ev_ixs, VR, n) + end + end +end + end diff --git a/src/MatrixAlgebraKit.jl b/src/MatrixAlgebraKit.jl index d074a16e..035caf11 100644 --- a/src/MatrixAlgebraKit.jl +++ b/src/MatrixAlgebraKit.jl @@ -33,6 +33,7 @@ export LAPACK_HouseholderQR, LAPACK_HouseholderLQ, LAPACK_QRIteration, LAPACK_Bisection, LAPACK_MultipleRelativelyRobustRepresentations, LAPACK_DivideAndConquer, LAPACK_Jacobi, LQViaTransposedQR, + CUSOLVER_Simple, CUSOLVER_HouseholderQR, CUSOLVER_QRIteration, CUSOLVER_SVDPolar, CUSOLVER_Jacobi, CUSOLVER_Randomized, ROCSOLVER_HouseholderQR, ROCSOLVER_QRIteration, ROCSOLVER_Jacobi export truncrank, trunctol, truncabove, TruncationKeepSorted, TruncationKeepFiltered diff --git a/src/implementations/eig.jl b/src/implementations/eig.jl index 51b29145..515e7386 100644 --- a/src/implementations/eig.jl +++ b/src/implementations/eig.jl @@ -82,3 +82,29 @@ function eig_trunc!(A::AbstractMatrix, DV, alg::TruncatedAlgorithm) D, V = eig_full!(A, DV, alg.alg) return truncate!(eig_trunc!, (D, V), alg.trunc) end + +_gpu_geev!(A::AbstractMatrix, D, V) = throw(MethodError(_gpu_geev!, (A, D, V))) + +function eig_full!(A::AbstractMatrix, DV, alg::GPU_EigAlgorithm) + check_input(eig_full!, A, DV, alg) + D, V = DV + if alg isa GPU_Simple + isempty(alg.kwargs) || + throw(ArgumentError("GPU_Simple (geev) does not accept any keyword arguments")) + _gpu_geev!(A, D.diag, V) + end + # TODO: make this controllable using a `gaugefix` keyword argument + V = gaugefix!(V) + return D, V +end + +function eig_vals!(A::AbstractMatrix, D, alg::GPU_EigAlgorithm) + check_input(eig_vals!, A, D, alg) + V = similar(A, complex(eltype(A)), (size(A, 1), 0)) + if alg isa GPU_Simple + isempty(alg.kwargs) || + throw(ArgumentError("LAPACK_Simple (geev) does not accept any keyword arguments")) + _gpu_geev!(A, D, V) + end + return D +end diff --git a/src/interface/decompositions.jl b/src/interface/decompositions.jl index e11c6bfa..1310944e 100644 --- a/src/interface/decompositions.jl +++ b/src/interface/decompositions.jl @@ -164,6 +164,15 @@ a general matrix using the randomized SVD algorithm. """ @algdef CUSOLVER_Randomized +""" + CUSOLVER_Simple() + +Algorithm type to denote the simple CUSOLVER driver for computing the non-Hermitian +eigenvalue decomposition of a matrix. +""" +@algdef CUSOLVER_Simple + +const CUSOLVER_EigAlgorithm = Union{CUSOLVER_Simple} # ========================= # ROCSOLVER ALGORITHMS # ========================= @@ -192,3 +201,6 @@ Algorithm type to denote the ROCSOLVER driver for computing the singular value d a general matrix using the Jacobi algorithm. """ @algdef ROCSOLVER_Jacobi + +const GPU_Simple = Union{CUSOLVER_Simple} +const GPU_EigAlgorithm = Union{GPU_Simple} diff --git a/test/cuda/eig.jl b/test/cuda/eig.jl new file mode 100644 index 00000000..2ec8da22 --- /dev/null +++ b/test/cuda/eig.jl @@ -0,0 +1,63 @@ +using MatrixAlgebraKit +using LinearAlgebra: Diagonal +using Test +using TestExtras +using StableRNGs +using CUDA + +include(joinpath("..", "utilities.jl")) + +@testset "eig_full! for T = $T" for T in (Float32, Float64, ComplexF32, ComplexF64) + rng = StableRNG(123) + m = 54 + for alg in (CUSOLVER_Simple(), :CUSOLVER_Simple, CUSOLVER_Simple) + A = CuArray(randn(rng, T, m, m)) + Tc = complex(T) + + D, V = @constinferred eig_full(A; alg=($alg)) + @test eltype(D) == eltype(V) == Tc + @test A * V ≈ V * D + + alg′ = @constinferred MatrixAlgebraKit.select_algorithm(eig_full!, A, $alg) + + Ac = similar(A) + D2, V2 = @constinferred eig_full!(copy!(Ac, A), (D, V), alg′) + @test D2 === D + @test V2 === V + @test A * V ≈ V * D + + Dc = @constinferred eig_vals(A, alg′) + @test eltype(Dc) == Tc + @test parent(D) ≈ Dc + end +end + +#= +@testset "eig_trunc! for T = $T" for T in (Float32, Float64, ComplexF32, ComplexF64) + rng = StableRNG(123) + m = 54 + for alg in (CUSOLVER_Simple(),) + A = CuArray(randn(rng, T, m, m)) + A *= A' # TODO: deal with eigenvalue ordering etc + # eigenvalues are sorted by ascending real component... + D₀ = sort!(eig_vals(A); by=abs, rev=true) + rmin = findfirst(i -> abs(D₀[end - i]) != abs(D₀[end - i - 1]), 1:(m - 2)) + r = length(D₀) - rmin + + D1, V1 = @constinferred eig_trunc(A; alg, trunc=truncrank(r)) + @test length(D1.diag) == r + @test A * V1 ≈ V1 * D1 + + s = 1 + sqrt(eps(real(T))) + trunc = trunctol(s * abs(D₀[r + 1])) + D2, V2 = @constinferred eig_trunc(A; alg, trunc) + @test length(diagview(D2)) == r + @test A * V2 ≈ V2 * D2 + + # trunctol keeps order, truncrank might not + # test for same subspace + @test V1 * ((V1' * V1) \ (V1' * V2)) ≈ V2 + @test V2 * ((V2' * V2) \ (V2' * V1)) ≈ V1 + end +end +=# diff --git a/test/runtests.jl b/test/runtests.jl index d0414e02..9e6ed13e 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -63,6 +63,9 @@ if CUDA.functional() @safetestset "CUDA SVD" begin include("cuda/svd.jl") end + @safetestset "CUDA General Eigenvalue Decomposition" begin + include("cuda/eig.jl") + end end using AMDGPU