Merged
8 changes: 6 additions & 2 deletions ext/MatrixAlgebraKitCUDAExt/MatrixAlgebraKitCUDAExt.jl
@@ -5,8 +5,8 @@ using MatrixAlgebraKit: @algdef, Algorithm, check_input
using MatrixAlgebraKit: one!, zero!, uppertriangular!, lowertriangular!
using MatrixAlgebraKit: diagview, sign_safe
using MatrixAlgebraKit: LQViaTransposedQR
using MatrixAlgebraKit: default_qr_algorithm, default_lq_algorithm, default_svd_algorithm
import MatrixAlgebraKit: _gpu_geqrf!, _gpu_ungqr!, _gpu_unmqr!, _gpu_gesvd!, _gpu_Xgesvdp!, _gpu_Xgesvdr!, _gpu_gesvdj!
using MatrixAlgebraKit: default_qr_algorithm, default_lq_algorithm, default_svd_algorithm, default_eig_algorithm
import MatrixAlgebraKit: _gpu_geqrf!, _gpu_ungqr!, _gpu_unmqr!, _gpu_gesvd!, _gpu_Xgesvdp!, _gpu_Xgesvdr!, _gpu_gesvdj!, _gpu_geev!
using CUDA
using LinearAlgebra
using LinearAlgebra: BlasFloat
@@ -23,8 +23,12 @@ end
function MatrixAlgebraKit.default_svd_algorithm(::Type{T}; kwargs...) where {T<:StridedCuMatrix}
return CUSOLVER_QRIteration(; kwargs...)
end
function MatrixAlgebraKit.default_eig_algorithm(::Type{T}; kwargs...) where {T<:StridedCuMatrix}
return CUSOLVER_Simple(; kwargs...)
end


_gpu_geev!(A::StridedCuMatrix, D::StridedCuVector, V::StridedCuMatrix) = YACUSOLVER.Xgeev!(A, D, V)
_gpu_geqrf!(A::StridedCuMatrix) = YACUSOLVER.geqrf!(A)
_gpu_ungqr!(A::StridedCuMatrix, τ::StridedCuVector) = YACUSOLVER.ungqr!(A, τ)
_gpu_unmqr!(side::AbstractChar, trans::AbstractChar, A::StridedCuMatrix, τ::StridedCuVector, C::StridedCuVecOrMat) = YACUSOLVER.unmqr!(side, trans, A, τ, C)
122 changes: 121 additions & 1 deletion ext/MatrixAlgebraKitCUDAExt/yacusolver.jl
@@ -5,7 +5,7 @@ using LinearAlgebra: BlasInt, BlasFloat, checksquare, chkstride1, require_one_based_indexing
using LinearAlgebra.LAPACK: chkargsok, chklapackerror, chktrans, chkside, chkdiag, chkuplo

using CUDA
using CUDA: @allowscalar
using CUDA: @allowscalar, i32
using CUDA.CUSOLVER

# QR methods are implemented with full access to allocated arrays, so we do not need to redo this:
@@ -306,6 +306,73 @@ function Xgesvdr!(A::StridedCuMatrix{T},
return S, U, Vᴴ
end

# Wrapper for general eigensolver
for (celty, elty) in ((:ComplexF32, :Float32), (:ComplexF64, :Float64), (:ComplexF32, :ComplexF32), (:ComplexF64, :ComplexF64))
@eval begin
function Xgeev!(A::StridedCuMatrix{$elty}, D::StridedCuVector{$celty}, V::StridedCuMatrix{$celty})
require_one_based_indexing(A, V, D)
chkstride1(A, V, D)
n = checksquare(A)
# TODO GPU appropriate version
#chkfinite(A) # balancing routines don't support NaNs and Infs
n == length(D) || throw(DimensionMismatch("length mismatch between A and D"))
if length(V) == 0
jobvr = 'N'
elseif length(V) == n*n
Member: Was there a specific reason for not having size(V) == (n, n)?
jobvr = 'V'
else
throw(DimensionMismatch("size of VR must match size of A"))
Member: I guess we just call it V instead of VR in our function signatures and doc strings.
end
jobvl = 'N' # required by API for now (https://docs.nvidia.com/cuda/cusolver/index.html#cusolverdnxgeev)
#=if length(VL) == 0
jobvl = 'N'
elseif length(VL) == n*n
jobvl = 'V'
else
throw(DimensionMismatch("size of VL must match size of A"))
end=#
VL = similar(A, n, 0)
lda = max(1, stride(A, 2))
ldvl = max(1, stride(VL, 2))
params = CUSOLVER.CuSolverParameters()
dh = CUSOLVER.dense_handle()

if $elty <: Real
D2 = reinterpret($elty, D)
# reuse memory; we will have to reorder afterwards to bring the real and imaginary
# components into the order required by the Complex type
VR = reinterpret($elty, V)
Member: It seems like cusolverDnXgeev supports inputting complex W and VR arguments, and then immediately returns the results as we want them, making the whole reinterpret and _reorder_realeigendecomposition (forced upon us by LAPACK) unnecessary: https://docs.nvidia.com/cuda/cusolver/index.html?highlight=cusolverDnXgeev#cusolverdnxgeev

Member (author): I don't think this is true for the eigenvectors, from the testing I did. For VL:

    "Array of dimension ldvl * n. If jobvl = CUSOLVER_EIG_MODE_VECTOR, the left eigenvectors u(j) are stored one after another in the columns of VL, in the same order as their eigenvalues. If dataTypeVL is complex or the j-th eigenvalue is real, then u(j) = VL(:,j), the j-th column of VL. If dataTypeVL is real and the j-th and (j+1)-st eigenvalues form a complex conjugate pair, then u(j) = VL(:,j) + i*VL(:,j+1) and u(j+1) = VL(:,j) - i*VL(:,j+1). If jobvl = CUSOLVER_EIG_MODE_NOVECTOR, VL is not referenced."

Member (@Jutho, Sep 3, 2025): VL is anyway not supported, but it is the same explanation as for VR. Are we reading this differently? "If dataTypeVL is complex" (so even if A is real) "or the j-th eigenvalue is real" (that doesn't really matter), "then u(j) = VL(:,j), the j-th column of VL."

Member (author): The datatype of VR must be the same as the datatype of A, though, per the table further down.

Member: That is completely ridiculous. Who comes up with writing documentation in such a confusing way? Why would you allow a complex W but then still use the outdated LAPACK scheme for V? It feels like CUSOLVER managed to make the worst aspects of the LAPACK interface even worse.

Member: OK, given what we have, it might still be possible to simplify the implementation a bit by using the complex eigenvalue-vector interface. If possible, I would prefer not having to maintain CUDA kernels in MatrixAlgebraKit (even though this likely never needs to be touched again). For example, I wouldn't actually mind assigning a temporary real VR that is distinct from the V in the arguments, and then simply copying over, but maybe it's possible without. I'm also interested in @lkdvos's opinion. I might give it a try and then we can still decide what to do with this.

Member: I have to say that I have very little feel for the performance implications of any of this, but I can say that I have no faith in cuTENSOR keeping their API stable at all, especially considering that complex numbers seem to be an afterthought for them most of the time anyway. Additionally, this is exclusive to cuTENSOR, so presumably we'd need the kernels anyway for AMD? I would try to make the choices that minimize maintenance in the long term, but I can't really say I know what that means practically 😉.

Member: AMD currently doesn't seem to have a general eigenvalue decomposition, but when they do, who knows what interface they will adopt.

Member (author): Technically this is all CUSOLVER, by the way, not cuTENSOR.

Member (author): Also, FWIW, historically AMD tends to hew very close to the line CUDA adopts, for easy compatibility, since their hip* libraries allow users to "drop in" AMD to replace CUSOLVER/CUBLAS/CUWHATEVER.

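Editor's note: for concreteness, a rough sketch of the simplification discussed above, assuming (as the thread concludes) that cusolverDnXgeev accepts a complex dataTypeW even when A is real, while dataTypeVR must match dataTypeA. Eigenvalues would then arrive directly in the complex vector D, eliminating the reinterpret of D and the eigenvalue half of the reordering; the eigenvectors would still need the LAPACK-style unpacking. The call mirrors the one in this diff but is untested and illustrative only:

    # Hypothetical: pass the complex eigenvalue buffer D straight through;
    # VR stays real because its datatype must match that of A.
    CUSOLVER.cusolverDnXgeev(dh, params, jobvl, jobvr, n, $elty, A, lda,
                             $celty, D, $elty, VL, ldvl, $elty, VR, ldvr,
                             $elty, buffer_gpu, sizeof(buffer_gpu),
                             buffer_cpu, sizeof(buffer_cpu), dh.info)

(The matching cusolverDnXgeev_bufferSize call would advertise $celty for D as well.)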
else
D2 = D
VR = V
end
ldvr = max(1, stride(VR, 2))
function bufferSize()
out_cpu = Ref{Csize_t}(0)
out_gpu = Ref{Csize_t}(0)
CUSOLVER.cusolverDnXgeev_bufferSize(dh, params, jobvl, jobvr, n, $elty, A,
lda, $elty, D2, $elty, VL, ldvl, $elty, VR, ldvr,
$elty, out_gpu, out_cpu)
out_gpu[], out_cpu[]
end
CUDA.with_workspaces(dh.workspace_gpu, dh.workspace_cpu, bufferSize()...) do buffer_gpu, buffer_cpu
CUSOLVER.cusolverDnXgeev(dh, params, jobvl, jobvr, n, $elty, A, lda, $elty,
D2, $elty, VL, ldvl, $elty, VR, ldvr, $elty, buffer_gpu,
sizeof(buffer_gpu), buffer_cpu, sizeof(buffer_cpu), dh.info)
end
flag = @allowscalar dh.info[1]
CUSOLVER.chkargsok(BlasInt(flag))
if eltype(A) <: Real
work = CuVector{$elty}(undef, n)
DR = view(D2, 1:n)
DI = view(D2, (n + 1):(2n))
_reorder_realeigendecomposition!(D, DR, DI, work, VR, jobvr)
end
return D, V
end
end
end

# for (jname, bname, fname, elty, relty) in
# ((:sygvd!, :cusolverDnSsygvd_bufferSize, :cusolverDnSsygvd, :Float32, :Float32),
# (:sygvd!, :cusolverDnDsygvd_bufferSize, :cusolverDnDsygvd, :Float64, :Float64),
@@ -650,4 +717,57 @@ end
# end
# end

# device code is unreachable by coverage right now
# COV_EXCL_START
# TODO use a shmem array here
function _reorder_kernel_real(real_ev_ixs, VR::CuDeviceArray{T}, n::Int) where {T}
grid_idx = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x
@inbounds if grid_idx <= length(real_ev_ixs)
i = real_ev_ixs[grid_idx]
for j in n:-1:1
VR[2 * j, i] = zero(T)
VR[2 * j - 1, i] = VR[j, i]
end
end
return
end

function _reorder_kernel_complex(complex_ev_ixs, VR::CuDeviceArray{T}, n::Int) where {T}
grid_idx = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x
@inbounds if grid_idx <= length(complex_ev_ixs)
i = complex_ev_ixs[grid_idx]
for j in n:-1:1
VR[2 * j, i] = VR[j, i + 1]
VR[2 * j - 1, i] = VR[j, i]
VR[2 * j, i + 1] = -VR[j, i + 1]
VR[2 * j - 1, i + 1] = VR[j, i]
end
end
return
end
# COV_EXCL_STOP
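
Editor's note: the two kernels above implement, in place on the GPU, the LAPACK real-geev packing convention quoted in the review thread. For reference, a minimal CPU sketch of the same unpacking (illustrative, not part of the PR; the function name is hypothetical):

    # Expand LAPACK-packed real geev eigenvectors into complex form.
    # W holds the complex eigenvalues; VRreal is the n-by-n real geev output.
    function unpack_real_geev_vectors(W::Vector{Complex{T}}, VRreal::Matrix{T}) where {T<:Real}
        n = size(VRreal, 1)
        V = zeros(Complex{T}, n, n)
        j = 1
        while j <= n
            if isreal(W[j])
                V[:, j] .= VRreal[:, j]        # real eigenvalue: u(j) = VR(:,j)
                j += 1
            else                               # conjugate pair
                V[:, j] .= VRreal[:, j] .+ im .* VRreal[:, j + 1]  # u(j) = VR(:,j) + i*VR(:,j+1)
                V[:, j + 1] .= conj.(V[:, j])                      # u(j+1) = VR(:,j) - i*VR(:,j+1)
                j += 2
            end
        end
        return V
    end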

function _reorder_realeigendecomposition!(W, WR, WI, work, VR, jobvr)
# first reorder eigenvalues and recycle work as temporary buffer to efficiently implement the permutation
copy!(work, WI)
n = size(W, 1)
@. W[1:n] = WR[1:n] + im * work[1:n]
T = eltype(WR)
if jobvr == 'V' # also reorganise vectors
real_ev_ixs = findall(isreal, W)
_cmplx_ev_ixs = findall(!isreal, W) # these come in pairs, choose only the first of each pair
complex_ev_ixs = view(_cmplx_ev_ixs, 1:2:length(_cmplx_ev_ixs))
if !isempty(real_ev_ixs)
real_threads = 128
Member: Is this something we can hard-code, or should we do some global const _real_threads = Ref(128) kind of thing to make it customizable?

Member (author): We can make it customizable, yes. Maybe settable during init or something?

Member: So it seems that the whole custom-kernels part of the code would be unnecessary if we directly use the support for complex W and V from my previous comment, meaning that there would be much less code to maintain and fewer parameters to configure.

real_blocks = max(1, cld(length(real_ev_ixs), real_threads))
@cuda threads=real_threads blocks=real_blocks _reorder_kernel_real(real_ev_ixs, VR, n)
end
if !isempty(complex_ev_ixs)
complex_threads = 128
Member: Same comment here.

complex_blocks = max(1, cld(length(complex_ev_ixs), complex_threads))
@cuda threads=complex_threads blocks=complex_blocks _reorder_kernel_complex(complex_ev_ixs, VR, n)
end
end
end
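
Editor's note: a minimal sketch of the Ref-based knob floated in the thread above; the name _reorder_threads, the environment variable, and the __init__ hook are all hypothetical (and assume the extension does not already define __init__):

    const _reorder_threads = Ref(128)  # default kernel launch width

    function __init__()
        # allow overriding the launch width without recompiling
        haskey(ENV, "MAK_REORDER_THREADS") &&
            (_reorder_threads[] = parse(Int, ENV["MAK_REORDER_THREADS"]))
    end

    # and at the launch sites:
    #     threads = _reorder_threads[]
    #     blocks = max(1, cld(length(real_ev_ixs), threads))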

end
1 change: 1 addition & 0 deletions src/MatrixAlgebraKit.jl
@@ -33,6 +33,7 @@ export LAPACK_HouseholderQR, LAPACK_HouseholderLQ,
LAPACK_QRIteration, LAPACK_Bisection, LAPACK_MultipleRelativelyRobustRepresentations,
LAPACK_DivideAndConquer, LAPACK_Jacobi,
LQViaTransposedQR,
CUSOLVER_Simple,
CUSOLVER_HouseholderQR, CUSOLVER_QRIteration, CUSOLVER_SVDPolar, CUSOLVER_Jacobi, CUSOLVER_Randomized,
ROCSOLVER_HouseholderQR, ROCSOLVER_QRIteration, ROCSOLVER_Jacobi
export truncrank, trunctol, truncabove, TruncationKeepSorted, TruncationKeepFiltered
26 changes: 26 additions & 0 deletions src/implementations/eig.jl
@@ -82,3 +82,29 @@ function eig_trunc!(A::AbstractMatrix, DV, alg::TruncatedAlgorithm)
D, V = eig_full!(A, DV, alg.alg)
return truncate!(eig_trunc!, (D, V), alg.trunc)
end

_gpu_geev!(A::AbstractMatrix, D, V) = throw(MethodError(_gpu_geev!, (A, D, V)))

function eig_full!(A::AbstractMatrix, DV, alg::GPU_EigAlgorithm)
check_input(eig_full!, A, DV, alg)
D, V = DV
if alg isa GPU_Simple
isempty(alg.kwargs) ||
throw(ArgumentError("GPU_Simple (geev) does not accept any keyword arguments"))
_gpu_geev!(A, D.diag, V)
end
# TODO: make this controllable using a `gaugefix` keyword argument
V = gaugefix!(V)
return D, V
end

function eig_vals!(A::AbstractMatrix, D, alg::GPU_EigAlgorithm)
check_input(eig_vals!, A, D, alg)
V = similar(A, complex(eltype(A)), (size(A, 1), 0))
if alg isa GPU_Simple
isempty(alg.kwargs) ||
throw(ArgumentError("GPU_Simple (geev) does not accept any keyword arguments"))
_gpu_geev!(A, D, V)
end
return D
end
12 changes: 12 additions & 0 deletions src/interface/decompositions.jl
@@ -164,6 +164,15 @@ a general matrix using the randomized SVD algorithm.
"""
@algdef CUSOLVER_Randomized

"""
CUSOLVER_Simple()

Algorithm type to denote the simple CUSOLVER driver for computing the non-Hermitian
eigenvalue decomposition of a matrix.
"""
@algdef CUSOLVER_Simple

const CUSOLVER_EigAlgorithm = Union{CUSOLVER_Simple}
# =========================
# ROCSOLVER ALGORITHMS
# =========================
@@ -192,3 +201,6 @@ Algorithm type to denote the ROCSOLVER driver for computing the singular value decomposition of
a general matrix using the Jacobi algorithm.
"""
@algdef ROCSOLVER_Jacobi

const GPU_Simple = Union{CUSOLVER_Simple}
const GPU_EigAlgorithm = Union{GPU_Simple}
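
Editor's note: a minimal usage sketch of the new algorithm type (assumes a functional CUDA device; mirrors the tests below):

    using CUDA, MatrixAlgebraKit

    A = CuArray(randn(Float64, 64, 64))
    D, V = eig_full(A; alg=CUSOLVER_Simple())  # D is Diagonal with complex entries
    @assert A * V ≈ V * D
    vals = eig_vals(A, CUSOLVER_Simple())      # eigenvalues only, as a CuVector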
63 changes: 63 additions & 0 deletions test/cuda/eig.jl
@@ -0,0 +1,63 @@
using MatrixAlgebraKit
using LinearAlgebra: Diagonal
using Test
using TestExtras
using StableRNGs
using CUDA

include(joinpath("..", "utilities.jl"))

@testset "eig_full! for T = $T" for T in (Float32, Float64, ComplexF32, ComplexF64)
rng = StableRNG(123)
m = 54
for alg in (CUSOLVER_Simple(), :CUSOLVER_Simple, CUSOLVER_Simple)
A = CuArray(randn(rng, T, m, m))
Tc = complex(T)

D, V = @constinferred eig_full(A; alg=($alg))
@test eltype(D) == eltype(V) == Tc
@test A * V ≈ V * D

alg′ = @constinferred MatrixAlgebraKit.select_algorithm(eig_full!, A, $alg)

Ac = similar(A)
D2, V2 = @constinferred eig_full!(copy!(Ac, A), (D, V), alg′)
@test D2 === D
@test V2 === V
@test A * V ≈ V * D

Dc = @constinferred eig_vals(A, alg′)
@test eltype(Dc) == Tc
@test parent(D) ≈ Dc
end
end

#=
@testset "eig_trunc! for T = $T" for T in (Float32, Float64, ComplexF32, ComplexF64)
rng = StableRNG(123)
m = 54
for alg in (CUSOLVER_Simple(),)
A = CuArray(randn(rng, T, m, m))
A *= A' # TODO: deal with eigenvalue ordering etc
# eigenvalues are sorted by ascending real component...
D₀ = sort!(eig_vals(A); by=abs, rev=true)
rmin = findfirst(i -> abs(D₀[end - i]) != abs(D₀[end - i - 1]), 1:(m - 2))
r = length(D₀) - rmin

D1, V1 = @constinferred eig_trunc(A; alg, trunc=truncrank(r))
@test length(D1.diag) == r
@test A * V1 ≈ V1 * D1

s = 1 + sqrt(eps(real(T)))
trunc = trunctol(s * abs(D₀[r + 1]))
D2, V2 = @constinferred eig_trunc(A; alg, trunc)
@test length(diagview(D2)) == r
@test A * V2 ≈ V2 * D2

# trunctol keeps order, truncrank might not
# test for same subspace
@test V1 * ((V1' * V1) \ (V1' * V2)) ≈ V2
@test V2 * ((V2' * V2) \ (V2' * V1)) ≈ V1
end
end
=#
3 changes: 3 additions & 0 deletions test/runtests.jl
@@ -63,6 +63,9 @@ if CUDA.functional()
@safetestset "CUDA SVD" begin
include("cuda/svd.jl")
end
@safetestset "CUDA General Eigenvalue Decomposition" begin
include("cuda/eig.jl")
end
end

using AMDGPU