Initial attempt to wrap CUSOLVER.Xgeev #47
@@ -5,7 +5,7 @@ using LinearAlgebra: BlasInt, BlasFloat, checksquare, chkstride1, require_one_ba
using LinearAlgebra.LAPACK: chkargsok, chklapackerror, chktrans, chkside, chkdiag, chkuplo

using CUDA
-using CUDA: @allowscalar
+using CUDA: @allowscalar, i32
using CUDA.CUSOLVER

# QR methods are implemented with full access to allocated arrays, so we do not need to redo this:
@@ -306,6 +306,73 @@ function Xgesvdr!(A::StridedCuMatrix{T},
    return S, U, Vᴴ
end

# Wrapper for general eigensolver
for (celty, elty) in ((:ComplexF32, :Float32), (:ComplexF64, :Float64), (:ComplexF32, :ComplexF32), (:ComplexF64, :ComplexF64))
    @eval begin
        function Xgeev!(A::StridedCuMatrix{$elty}, D::StridedCuVector{$celty}, V::StridedCuMatrix{$celty})
            require_one_based_indexing(A, V, D)
            chkstride1(A, V, D)
            n = checksquare(A)
            # TODO GPU appropriate version
            #chkfinite(A) # balancing routines don't support NaNs and Infs
            n == length(D) || throw(DimensionMismatch("length mismatch between A and D"))
            if length(V) == 0
                jobvr = 'N'
            elseif length(V) == n*n
                jobvr = 'V'
            else
                throw(DimensionMismatch("size of VR must match size of A"))
|
Member: I guess we just call it …
            end
            jobvl = 'N' # required by the API for now (https://docs.nvidia.com/cuda/cusolver/index.html#cusolverdnxgeev)
            #=if length(VL) == 0
                jobvl = 'N'
            elseif length(VL) == n*n
                jobvl = 'V'
            else
                throw(DimensionMismatch("size of VL must match size of A"))
            end=#
            VL = similar(A, n, 0)
            lda = max(1, stride(A, 2))
            ldvl = max(1, stride(VL, 2))
            params = CUSOLVER.CuSolverParameters()
            dh = CUSOLVER.dense_handle()

            if $elty <: Real
                D2 = reinterpret($elty, D)
                # reuse memory; we will have to reorder afterwards to bring the real and imaginary
                # components into the order required by the Complex type
                VR = reinterpret($elty, V)
|
Member: It seems like …

Member (Author): I don't think this is true for the eigenvectors from testing I did. For …

Member: …

Member (Author): The datatype of …

Member: That is completely ridiculous. Who comes up with writing the documentation in such a confusing way? Or why would you allow complex …

Member: OK, given what we have, it might still be possible to simplify the implementation a bit by using the complex eigenvalue vector interface. If possible, I would prefer not having to maintain CUDA kernels in MatrixAlgebraKit (even though this likely never needs to be touched again). For example, I wouldn't actually mind assigning a temporary real …

Member: I have to say that I have very little feeling for the performance implications of any of this, but I can say that I have no faith in cuTENSOR to keep their API stable at all, especially considering that complex numbers seem to be an afterthought for them most of the time anyway.

Member: AMD currently doesn't seem to have a general eigenvalue decomposition, but when they do, who knows what interface they will adopt.

Member (Author): Technically this is all CUSOLVER, btw, not cuTENSOR.

Member (Author): Also, FWIW, historically AMD tends to hew very close to the line CUDA adopts for easy compatibility, since their hip* libraries allow users to "drop in" AMD to replace CUSOLVER/CUBLAS/CUWHATEVER.
            else
                D2 = D
                VR = V
            end
            ldvr = max(1, stride(VR, 2))
            function bufferSize()
                out_cpu = Ref{Csize_t}(0)
                out_gpu = Ref{Csize_t}(0)
                CUSOLVER.cusolverDnXgeev_bufferSize(dh, params, jobvl, jobvr, n, $elty, A,
                                                    lda, $elty, D2, $elty, VL, ldvl, $elty, VR, ldvr,
                                                    $elty, out_gpu, out_cpu)
                out_gpu[], out_cpu[]
            end
            CUDA.with_workspaces(dh.workspace_gpu, dh.workspace_cpu, bufferSize()...) do buffer_gpu, buffer_cpu
                CUSOLVER.cusolverDnXgeev(dh, params, jobvl, jobvr, n, $elty, A, lda, $elty,
                                         D2, $elty, VL, ldvl, $elty, VR, ldvr, $elty, buffer_gpu,
                                         sizeof(buffer_gpu), buffer_cpu, sizeof(buffer_cpu), dh.info)
            end
            flag = @allowscalar dh.info[1]
            CUSOLVER.chkargsok(BlasInt(flag))
            if eltype(A) <: Real
                work = CuVector{$elty}(undef, n)
                DR = view(D2, 1:n)
                DI = view(D2, (n + 1):(2n))
                _reorder_realeigendecomposition!(D, DR, DI, work, VR, jobvr)
            end
            return D, V
        end
    end
end
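A minimal usage sketch of the wrapper above (an illustration, not part of the diff; it assumes `Xgeev!` is in scope, e.g. from this extension module, and that a CUDA device is available):

```julia
using CUDA
using LinearAlgebra: Diagonal

n = 4
A = CuMatrix(rand(Float64, n, n))     # real, generally non-symmetric input
Acopy = copy(A)                       # Xgeev! overwrites its input
D = CuVector{ComplexF64}(undef, n)    # eigenvalues are returned in complex storage
V = CuMatrix{ComplexF64}(undef, n, n) # pass an n×0 matrix instead to request jobvr = 'N'

D, V = Xgeev!(A, D, V)                # fills D and V in place and returns them
Acopy * V ≈ V * Diagonal(D)           # should hold up to floating-point error
```

The `ComplexF64` element types of `D` and `V` follow the `celty`/`elty` pairing in the loop above.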
# for (jname, bname, fname, elty, relty) in
#     ((:sygvd!, :cusolverDnSsygvd_bufferSize, :cusolverDnSsygvd, :Float32, :Float32),
#      (:sygvd!, :cusolverDnDsygvd_bufferSize, :cusolverDnDsygvd, :Float64, :Float64),
@@ -650,4 +717,57 @@ end
# end
# end
# device code is unreachable by coverage right now
# COV_EXCL_START
# TODO use a shmem array here
function _reorder_kernel_real(real_ev_ixs, VR::CuDeviceArray{T}, n::Int) where {T}
    grid_idx = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x
    @inbounds if grid_idx <= length(real_ev_ixs)
        i = real_ev_ixs[grid_idx]
        for j in n:-1:1
            VR[2 * j, i] = zero(T)
            VR[2 * j - 1, i] = VR[j, i]
        end
    end
    return
end

function _reorder_kernel_complex(complex_ev_ixs, VR::CuDeviceArray{T}, n::Int) where {T}
    grid_idx = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x
    @inbounds if grid_idx <= length(complex_ev_ixs)
        i = complex_ev_ixs[grid_idx]
        for j in n:-1:1
            VR[2 * j, i] = VR[j, i + 1]
            VR[2 * j - 1, i] = VR[j, i]
            VR[2 * j, i + 1] = -VR[j, i + 1]
            VR[2 * j - 1, i + 1] = VR[j, i]
        end
    end
    return
end
# COV_EXCL_STOP

function _reorder_realeigendecomposition!(W, WR, WI, work, VR, jobvr)
    # first reorder eigenvalues and recycle work as temporary buffer to efficiently implement the permutation
    copy!(work, WI)
    n = size(W, 1)
    @. W[1:n] = WR[1:n] + im * work[1:n]
    T = eltype(WR)
    if jobvr == 'V' # also reorganise vectors
        real_ev_ixs = findall(isreal, W)
        _cmplx_ev_ixs = findall(!isreal, W) # these come in pairs, choose only the first of each pair
        complex_ev_ixs = view(_cmplx_ev_ixs, 1:2:length(_cmplx_ev_ixs))
        if !isempty(real_ev_ixs)
            real_threads = 128
|
Member: Is this something we can hard-code, or should we do some global …

Member (Author): We can make it customizable, yes. Maybe settable during init or something?

Member: So it seems that the whole custom kernels part of the code would be unnecessary if we directly use the support for complex …
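As a possible answer to the hard-coding question above, the launch size could also be derived from CUDA.jl's occupancy API instead of a fixed 128. This is a sketch only (not code from this PR), reusing the names from the surrounding function:

```julia
# Compile the kernel without launching, ask CUDA for a suitable block size,
# then launch with enough blocks to cover every index.
kernel = @cuda launch=false _reorder_kernel_real(real_ev_ixs, VR, n)
config = launch_configuration(kernel.fun)
threads = min(length(real_ev_ixs), config.threads)
blocks = cld(length(real_ev_ixs), threads)
kernel(real_ev_ixs, VR, n; threads, blocks)
```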
            real_blocks = cld(length(real_ev_ixs), real_threads) # round up so every index is covered
            @cuda threads=real_threads blocks=real_blocks _reorder_kernel_real(real_ev_ixs, VR, n)
        end
        if !isempty(complex_ev_ixs)
            complex_threads = 128

Member: Same comment here.

            complex_blocks = cld(length(complex_ev_ixs), complex_threads) # round up so every index is covered
            @cuda threads=complex_threads blocks=complex_blocks _reorder_kernel_complex(complex_ev_ixs, VR, n)
        end
    end
end
end
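The reordering implemented by `_reorder_realeigendecomposition!` and the two kernels above assumes the LAPACK-style packed layout for real-input `geev` output: a real eigenvalue's eigenvector occupies its column as-is, while a complex-conjugate pair stores the real and imaginary parts of one eigenvector in two consecutive columns. For clarity, here is a plain-CPU sketch of that unpacking (an illustration under that layout assumption, not code from this PR):

```julia
# Unpack LAPACK-style real geev output (WR, WI, packed VR) into complex eigenpairs.
function unpack_real_geev(WR::Vector{T}, WI::Vector{T}, VRp::Matrix{T}) where {T<:Real}
    n = length(WR)
    W = complex.(WR, WI)
    V = Matrix{Complex{T}}(undef, n, n)
    j = 1
    while j <= n
        if WI[j] == 0                   # real eigenvalue: the column is already the eigenvector
            V[:, j] = VRp[:, j]
            j += 1
        else                            # conjugate pair: columns j and j+1 hold Re and Im parts
            V[:, j]     = VRp[:, j] .+ im .* VRp[:, j + 1]
            V[:, j + 1] = VRp[:, j] .- im .* VRp[:, j + 1]
            j += 2
        end
    end
    return W, V
end
```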
@@ -0,0 +1,63 @@
using MatrixAlgebraKit
using LinearAlgebra: Diagonal
using Test
using TestExtras
using StableRNGs
using CUDA

include(joinpath("..", "utilities.jl"))

@testset "eig_full! for T = $T" for T in (Float32, Float64, ComplexF32, ComplexF64)
    rng = StableRNG(123)
    m = 54
    for alg in (CUSOLVER_Simple(), :CUSOLVER_Simple, CUSOLVER_Simple)
        A = CuArray(randn(rng, T, m, m))
        Tc = complex(T)

        D, V = @constinferred eig_full(A; alg=($alg))
        @test eltype(D) == eltype(V) == Tc
        @test A * V ≈ V * D

        alg′ = @constinferred MatrixAlgebraKit.select_algorithm(eig_full!, A, $alg)

        Ac = similar(A)
        D2, V2 = @constinferred eig_full!(copy!(Ac, A), (D, V), alg′)
        @test D2 === D
        @test V2 === V
        @test A * V ≈ V * D

        Dc = @constinferred eig_vals(A, alg′)
        @test eltype(Dc) == Tc
        @test parent(D) ≈ Dc
    end
end

#=
@testset "eig_trunc! for T = $T" for T in (Float32, Float64, ComplexF32, ComplexF64)
    rng = StableRNG(123)
    m = 54
    for alg in (CUSOLVER_Simple(),)
        A = CuArray(randn(rng, T, m, m))
        A *= A' # TODO: deal with eigenvalue ordering etc
        # eigenvalues are sorted by ascending real component...
        D₀ = sort!(eig_vals(A); by=abs, rev=true)
        rmin = findfirst(i -> abs(D₀[end - i]) != abs(D₀[end - i - 1]), 1:(m - 2))
        r = length(D₀) - rmin

        D1, V1 = @constinferred eig_trunc(A; alg, trunc=truncrank(r))
        @test length(D1.diag) == r
        @test A * V1 ≈ V1 * D1

        s = 1 + sqrt(eps(real(T)))
        trunc = trunctol(s * abs(D₀[r + 1]))
        D2, V2 = @constinferred eig_trunc(A; alg, trunc)
        @test length(diagview(D2)) == r
        @test A * V2 ≈ V2 * D2

        # trunctol keeps order, truncrank might not
        # test for same subspace
        @test V1 * ((V1' * V1) \ (V1' * V2)) ≈ V2
        @test V2 * ((V2' * V2) \ (V2' * V1)) ≈ V1
    end
end
=#
Was there a specific reason for not having `size(V) == (n, n)`?
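For concreteness, the check being suggested would presumably read something like the following (an interpretation of the question, not code from the PR, which currently tests `length(V) == n*n`):

```julia
size(V) == (n, n) || throw(DimensionMismatch("size of V must match size of A"))
```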