diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
new file mode 100644
index 00000000..970e7600
--- /dev/null
+++ b/.buildkite/pipeline.yml
@@ -0,0 +1,34 @@
+env:
+  SECRET_CODECOV_TOKEN: "MH6hHjQi7vG2V1Yfotv5/z5Dkx1k5SdyGYlGTFXiQr22XksJgsXaBuvFKUrjC7JwcpBsOVU8103LuMKl3m7VJ35WzHZrOssYycVbdGcb2kloc6xvUOsN2R5BrhCQ4Pii0l6ZeVRjCnZVkcmb0Rf4glGFyfibCrqniry8RLhblsuFKFsijRK4OxiWYEs1IvUulN+ER8tEsEtw4+ZqC5nbLGMSnUG/saPkDQOVIBscvikbKEnBcCXBheGPktF+Y/cy/1Xa+FiBPoZcypwTeAjKG1g0MqyHXjaYekb/7fekaj+hukGaeJSCXxY8KEb2IZCh+Y36Tp6y6qsIp/AdtEnCpQ==;U2FsdGVkX18WQxvGLspPwzC4aDe+U7TXU+itebTbgh8LUkE6GukxxReHYiDZ6IrBiVvSGTVJMquW0c8KsOI1pw=="
+
+steps:
+  - label: "Julia v1"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1"
+      - JuliaCI/julia-test#v1: ~
+      - JuliaCI/julia-coverage#v1:
+          dirs:
+            - src
+            - ext
+    agents:
+      queue: "juliagpu"
+      cuda: "*"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 30
+
+steps:
+  - label: "Julia LTS"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.10" # "lts" isn't valid
+      - JuliaCI/julia-test#v1: ~
+      - JuliaCI/julia-coverage#v1:
+          dirs:
+            - src
+            - ext
+    agents:
+      queue: "juliagpu"
+      cuda: "*"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 30
diff --git a/Project.toml b/Project.toml
index ec99e515..efc996b4 100644
--- a/Project.toml
+++ b/Project.toml
@@ -8,14 +8,17 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 
 [weakdeps]
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 
 [extensions]
 MatrixAlgebraKitChainRulesCoreExt = "ChainRulesCore"
+MatrixAlgebraKitCUDAExt = "CUDA"
 
 [compat]
 Aqua = "0.6, 0.7, 0.8"
 ChainRulesCore = "1"
 ChainRulesTestUtils = "1"
+CUDA = "5"
 JET = "0.9"
 LinearAlgebra = "1"
 SafeTestsets = "0.1"
@@ -36,5 +39,4 @@ TestExtras = "5ed8adda-3752-4e41-b88a-e8b09835ee3a"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [targets]
-test = ["Aqua", "JET", "SafeTestsets", "Test", "TestExtras", "ChainRulesCore",
-        "ChainRulesTestUtils", "StableRNGs", "Zygote"]
+test = ["Aqua", "JET", "SafeTestsets", "Test", "TestExtras","ChainRulesCore", "ChainRulesTestUtils", "StableRNGs", "Zygote", "CUDA"]
diff --git a/ext/MatrixAlgebraKitCUDAExt/MatrixAlgebraKitCUDAExt.jl b/ext/MatrixAlgebraKitCUDAExt/MatrixAlgebraKitCUDAExt.jl
new file mode 100644
index 00000000..01a721f7
--- /dev/null
+++ b/ext/MatrixAlgebraKitCUDAExt/MatrixAlgebraKitCUDAExt.jl
@@ -0,0 +1,28 @@
+module MatrixAlgebraKitCUDAExt
+
+using MatrixAlgebraKit
+using MatrixAlgebraKit: @algdef, Algorithm, check_input
+using MatrixAlgebraKit: one!, zero!, uppertriangular!, lowertriangular!
+using MatrixAlgebraKit: diagview, sign_safe
+using MatrixAlgebraKit: LQViaTransposedQR
+using MatrixAlgebraKit: default_qr_algorithm, default_lq_algorithm, default_svd_algorithm 
+using CUDA
+using LinearAlgebra
+using LinearAlgebra: BlasFloat
+
+include("yacusolver.jl")
+include("implementations/qr.jl")
+include("implementations/svd.jl")
+
+function MatrixAlgebraKit.default_qr_algorithm(::Type{T}; kwargs...) where {T<:StridedCuMatrix}
+    return CUSOLVER_HouseholderQR(; kwargs...)
+end
+function MatrixAlgebraKit.default_lq_algorithm(::Type{T}; kwargs...) where {T<:StridedCuMatrix}
+    qr_alg = CUSOLVER_HouseholderQR(; kwargs...)
+    return LQViaTransposedQR(qr_alg)
+end
+function MatrixAlgebraKit.default_svd_algorithm(::Type{T}; kwargs...) where {T<:StridedCuMatrix}
+    return CUSOLVER_QRIteration(; kwargs...)
+end
+
+end
diff --git a/ext/MatrixAlgebraKitCUDAExt/implementations/qr.jl b/ext/MatrixAlgebraKitCUDAExt/implementations/qr.jl
new file mode 100644
index 00000000..636cf82e
--- /dev/null
+++ b/ext/MatrixAlgebraKitCUDAExt/implementations/qr.jl
@@ -0,0 +1,69 @@
+# CUSOLVER QR implementation
+function MatrixAlgebraKit.qr_full!(A::AbstractMatrix, QR, alg::CUSOLVER_HouseholderQR)
+    check_input(qr_full!, A, QR)
+    Q, R = QR
+    _cusolver_qr!(A, Q, R; alg.kwargs...)
+    return Q, R
+end
+function MatrixAlgebraKit.qr_compact!(A::AbstractMatrix, QR, alg::CUSOLVER_HouseholderQR)
+    check_input(qr_compact!, A, QR)
+    Q, R = QR
+    _cusolver_qr!(A, Q, R; alg.kwargs...)
+    return Q, R
+end
+function MatrixAlgebraKit.qr_null!(A::AbstractMatrix, N, alg::CUSOLVER_HouseholderQR)
+    check_input(qr_null!, A, N)
+    _cusolver_qr_null!(A, N; alg.kwargs...)
+    return N
+end
+
+function _cusolver_qr!(A::AbstractMatrix, Q::AbstractMatrix, R::AbstractMatrix;
+                       positive=false, blocksize=1)
+    blocksize > 1 &&
+        throw(ArgumentError("CUSOLVER does not provide a blocked implementation for a QR decomposition"))
+    m, n = size(A)
+    minmn = min(m, n)
+    computeR = length(R) > 0
+    inplaceQ = Q === A
+    if inplaceQ && (computeR || positive || m < n)
+        throw(ArgumentError("inplace Q only supported if matrix is tall (`m >= n`), R is not required and using `positive=false`"))
+    end
+
+    A, τ = YACUSOLVER.geqrf!(A)
+    if inplaceQ
+        Q = YACUSOLVER.ungqr!(A, τ)
+    else
+        Q = YACUSOLVER.unmqr!('L', 'N', A, τ, one!(Q))
+    end
+    # henceforth, τ is no longer needed and can be reused
+
+    if positive # already fix Q even if we do not need R
+        # TODO: report that `lmul!` and `rmul!` with `Diagonal` don't work with CUDA
+        τ .= sign_safe.(diagview(A))
+        Qf = view(Q, 1:m, 1:minmn) # first minmn columns of Q
+        Qf .= Qf .* transpose(τ)
+    end
+
+    if computeR
+        R̃ = uppertriangular!(view(A, axes(R)...))
+        if positive
+            R̃f = view(R̃, 1:minmn, 1:n) # first minmn rows of R
+            R̃f .= conj.(τ) .* R̃f
+        end
+        copyto!(R, R̃)
+    end
+    return Q, R
+end
+
+function _cusolver_qr_null!(A::AbstractMatrix, N::AbstractMatrix;
+                            positive=false, blocksize=1)
+    blocksize > 1 &&
+        throw(ArgumentError("CUSOLVER does not provide a blocked implementation for a QR decomposition"))
+    m, n = size(A)
+    minmn = min(m, n)
+    fill!(N, zero(eltype(N)))
+    one!(view(N, (minmn + 1):m, 1:(m - minmn)))
+    A, τ = YACUSOLVER.geqrf!(A)
+    N = YACUSOLVER.unmqr!('L', 'N', A, τ, N)
+    return N
+end
diff --git a/ext/MatrixAlgebraKitCUDAExt/implementations/svd.jl b/ext/MatrixAlgebraKitCUDAExt/implementations/svd.jl
new file mode 100644
index 00000000..f9976031
--- /dev/null
+++ b/ext/MatrixAlgebraKitCUDAExt/implementations/svd.jl
@@ -0,0 +1,108 @@
+const CUSOLVER_SVDAlgorithm = Union{CUSOLVER_QRIteration,
+                                    CUSOLVER_SVDPolar,
+                                    CUSOLVER_Jacobi}
+
+# CUSOLVER SVD implementation
+function MatrixAlgebraKit.svd_full!(A::CuMatrix, USVᴴ, alg::CUSOLVER_SVDAlgorithm)
+    check_input(svd_full!, A, USVᴴ)
+    U, S, Vᴴ = USVᴴ
+    fill!(S, zero(eltype(S)))
+    m, n = size(A)
+    minmn = min(m, n)
+    if alg isa CUSOLVER_QRIteration
+        isempty(alg.kwargs) ||
+            throw(ArgumentError("LAPACK_QRIteration does not accept any keyword arguments"))
+        YACUSOLVER.gesvd!(A, view(S, 1:minmn, 1), U, Vᴴ)
+    elseif alg isa CUSOLVER_SVDPolar
+        YACUSOLVER.Xgesvdp!(A, view(S, 1:minmn, 1), U, Vᴴ; alg.kwargs...)
+    elseif alg isa CUSOLVER_Jacobi
+        YACUSOLVER.gesvdj!(A, view(S, 1:minmn, 1), U, Vᴴ; alg.kwargs...)
+        # elseif alg isa LAPACK_Bisection
+        #     throw(ArgumentError("LAPACK_Bisection is not supported for full SVD"))
+        # elseif alg isa LAPACK_Jacobi
+        #     throw(ArgumentError("LAPACK_Bisection is not supported for full SVD"))
+    else
+        throw(ArgumentError("Unsupported SVD algorithm"))
+    end
+    diagview(S) .= view(S, 1:minmn, 1)
+    view(S, 2:minmn, 1) .= zero(eltype(S))
+    # TODO: make this controllable using a `gaugefix` keyword argument
+    for j in 1:max(m, n)
+        if j <= minmn
+            u = view(U, :, j)
+            v = view(Vᴴ, j, :)
+            s = conj(sign(_argmaxabs(u)))
+            u .*= s
+            v .*= conj(s)
+        elseif j <= m
+            u = view(U, :, j)
+            s = conj(sign(_argmaxabs(u)))
+            u .*= s
+        else
+            v = view(Vᴴ, j, :)
+            s = conj(sign(_argmaxabs(v)))
+            v .*= s
+        end
+    end
+    return USVᴴ
+end
+
+function MatrixAlgebraKit.svd_compact!(A::CuMatrix, USVᴴ, alg::CUSOLVER_SVDAlgorithm)
+    check_input(svd_compact!, A, USVᴴ)
+    U, S, Vᴴ = USVᴴ
+    if alg isa CUSOLVER_QRIteration
+        isempty(alg.kwargs) ||
+            throw(ArgumentError("CUSOLVER_QRIteration does not accept any keyword arguments"))
+        YACUSOLVER.gesvd!(A, S.diag, U, Vᴴ)
+    elseif alg isa CUSOLVER_SVDPolar
+        YACUSOLVER.Xgesvdp!(A, S.diag, U, Vᴴ; alg.kwargs...)
+    elseif alg isa CUSOLVER_Jacobi
+        YACUSOLVER.gesvdj!(A, S.diag, U, Vᴴ; alg.kwargs...)
+        # elseif alg isa LAPACK_DivideAndConquer
+        #     isempty(alg.kwargs) ||
+        #         throw(ArgumentError("LAPACK_DivideAndConquer does not accept any keyword arguments"))
+        #     YALAPACK.gesdd!(A, S.diag, U, Vᴴ)
+        # elseif alg isa LAPACK_Bisection
+        #     YALAPACK.gesvdx!(A, S.diag, U, Vᴴ; alg.kwargs...)
+    else
+        throw(ArgumentError("Unsupported SVD algorithm"))
+    end
+    # TODO: make this controllable using a `gaugefix` keyword argument
+    for j in 1:size(U, 2)
+        u = view(U, :, j)
+        v = view(Vᴴ, j, :)
+        s = conj(sign(_argmaxabs(u)))
+        u .*= s
+        v .*= conj(s)
+    end
+    return USVᴴ
+end
+_argmaxabs(x) = reduce(_largest, x; init=zero(eltype(x)))
+_largest(x, y) = abs(x) < abs(y) ? y : x
+
+function MatrixAlgebraKit.svd_vals!(A::CuMatrix, S, alg::CUSOLVER_SVDAlgorithm)
+    check_input(svd_vals!, A, S)
+    U, Vᴴ = similar(A, (0, 0)), similar(A, (0, 0))
+    if alg isa CUSOLVER_QRIteration
+        isempty(alg.kwargs) ||
+            throw(ArgumentError("CUSOLVER_QRIteration does not accept any keyword arguments"))
+        YACUSOLVER.gesvd!(A, S, U, Vᴴ)
+    elseif alg isa CUSOLVER_SVDPolar
+        YACUSOLVER.Xgesvdp!(A, S, U, Vᴴ; alg.kwargs...)
+    elseif alg isa CUSOLVER_Jacobi
+        YACUSOLVER.gesvdj!(A, S, U, Vᴴ; alg.kwargs...)
+        # elseif alg isa LAPACK_DivideAndConquer
+        #     isempty(alg.kwargs) ||
+        #         throw(ArgumentError("LAPACK_DivideAndConquer does not accept any keyword arguments"))
+        #     YALAPACK.gesdd!(A, S, U, Vᴴ)
+        # elseif alg isa LAPACK_Bisection
+        #     YALAPACK.gesvdx!(A, S, U, Vᴴ; alg.kwargs...)
+        # elseif alg isa LAPACK_Jacobi
+        #     isempty(alg.kwargs) ||
+        #         throw(ArgumentError("LAPACK_Jacobi does not accept any keyword arguments"))
+        #     YALAPACK.gesvj!(A, S, U, Vᴴ)
+    else
+        throw(ArgumentError("Unsupported SVD algorithm"))
+    end
+    return S
+end
\ No newline at end of file
diff --git a/ext/MatrixAlgebraKitCUDAExt/yacusolver.jl b/ext/MatrixAlgebraKitCUDAExt/yacusolver.jl
new file mode 100644
index 00000000..b020d756
--- /dev/null
+++ b/ext/MatrixAlgebraKitCUDAExt/yacusolver.jl
@@ -0,0 +1,594 @@
+module YACUSOLVER
+
+using LinearAlgebra
+using LinearAlgebra: BlasInt, BlasFloat, checksquare, chkstride1, require_one_based_indexing
+using LinearAlgebra.LAPACK: chkargsok, chklapackerror, chktrans, chkside, chkdiag, chkuplo
+
+using CUDA
+using CUDA: @allowscalar
+using CUDA.CUSOLVER
+
+# QR methods are implemented with full access to allocated arrays, so we do not need to redo this:
+using CUDA.CUSOLVER: geqrf!, ormqr!, orgqr!
+const unmqr! = ormqr!
+const ungqr! = orgqr!
+
+# Wrapper for SVD via QR Iteration
+for (bname, fname, elty, relty) in
+    ((:cusolverDnSgesvd_bufferSize, :cusolverDnSgesvd, :Float32, :Float32),
+     (:cusolverDnDgesvd_bufferSize, :cusolverDnDgesvd, :Float64, :Float64),
+     (:cusolverDnCgesvd_bufferSize, :cusolverDnCgesvd, :ComplexF32, :Float32),
+     (:cusolverDnZgesvd_bufferSize, :cusolverDnZgesvd, :ComplexF64, :Float64))
+    @eval begin
+        #! format: off
+        function gesvd!(A::StridedCuMatrix{$elty},
+                        S::StridedCuVector{$relty}=similar(A, $relty, min(size(A)...)),
+                        U::StridedCuMatrix{$elty}=similar(A, $elty, size(A, 1), min(size(A)...)),
+                        Vᴴ::StridedCuMatrix{$elty}=similar(A, $elty, min(size(A)...), size(A, 2)))
+        #! format: on
+            chkstride1(A, U, Vᴴ, S)
+            m, n = size(A)
+            (m < n) && throw(ArgumentError("CUSOLVER's gesvd requires m ≥ n"))
+            minmn = min(m, n)
+            if length(U) == 0
+                jobu = 'N'
+            else
+                size(U, 1) == m ||
+                    throw(DimensionMismatch("row size mismatch between A and U"))
+                if size(U, 2) == minmn
+                    if U === A
+                        jobu = 'O'
+                    else
+                        jobu = 'S'
+                    end
+                elseif size(U, 2) == m
+                    jobu = 'A'
+                else
+                    throw(DimensionMismatch("invalid column size of U"))
+                end
+            end
+            if length(Vᴴ) == 0
+                jobvt = 'N'
+            else
+                size(Vᴴ, 2) == n ||
+                    throw(DimensionMismatch("column size mismatch between A and Vᴴ"))
+                if size(Vᴴ, 1) == minmn
+                    if Vᴴ === A
+                        jobvt = 'O'
+                    else
+                        jobvt = 'S'
+                    end
+                elseif size(Vᴴ, 1) == n
+                    jobvt = 'A'
+                else
+                    throw(DimensionMismatch("invalid row size of Vᴴ"))
+                end
+            end
+            length(S) == minmn ||
+                throw(DimensionMismatch("length mismatch between A and S"))
+
+            lda = max(1, stride(A, 2))
+            ldu = max(1, stride(U, 2))
+            ldv = max(1, stride(Vᴴ, 2))
+
+            dh = CUSOLVER.dense_handle()
+            function bufferSize()
+                out = Ref{Cint}(0)
+                CUSOLVER.$bname(dh, m, n, out)
+                return out[] * sizeof($elty)
+            end
+            rwork = CuArray{$relty}(undef, min(m, n) - 1)
+            CUDA.with_workspace(dh.workspace_gpu, bufferSize) do buffer
+                return CUSOLVER.$fname(dh, jobu, jobvt, m, n,
+                                       A, lda, S, U, ldu, Vᴴ, ldv,
+                                       buffer, sizeof(buffer) ÷ sizeof($elty), rwork,
+                                       dh.info)
+            end
+            CUDA.unsafe_free!(rwork)
+
+            info = @allowscalar dh.info[1]
+            CUSOLVER.chkargsok(BlasInt(info))
+
+            return (S, U, Vᴴ)
+        end
+    end
+end
+
+function Xgesvdp!(A::StridedCuMatrix{T},
+                  S::StridedCuVector=similar(A, real(T), min(size(A)...)),
+                  U::StridedCuMatrix{T}=similar(A, T, size(A, 1), min(size(A)...)),
+                  Vᴴ::StridedCuMatrix{T}=similar(A, T, min(size(A)...), size(A, 2));
+                  tol=norm(A) * eps(real(T))) where {T<:BlasFloat}
+    chkstride1(A, U, S, Vᴴ)
+    m, n = size(A)
+    minmn = min(m, n)
+    if length(U) == length(Vᴴ) == 0
+        jobz = 'N'
+        econ = 1
+    else
+        jobz = 'V'
+        size(U, 1) == m ||
+            throw(DimensionMismatch("row size mismatch between A and U"))
+        size(Vᴴ, 2) == n ||
+            throw(DimensionMismatch("column size mismatch between A and Vᴴ"))
+        if size(U, 2) == size(Vᴴ, 1) == minmn
+            econ = 1
+        elseif size(U, 2) == m && size(Vᴴ, 1) == n
+            econ = 0
+        else
+            throw(DimensionMismatch("invalid column size of U or row size of Vᴴ"))
+        end
+    end
+    R = eltype(S)
+    length(S) == minmn ||
+        throw(DimensionMismatch("length mismatch between A and S"))
+    R == real(T) ||
+        throw(ArgumentError("S does not have the matching real `eltype` of A"))
+
+    Ṽ = similar(Vᴴ, (n, n))
+    Ũ = (size(U) == (m, m)) ? U : similar(U, (m, m))
+    lda = max(1, stride(A, 2))
+    ldu = max(1, stride(Ũ, 2))
+    ldv = max(1, stride(Ṽ, 2))
+    h_err_sigma = Ref{Cdouble}(0)
+    params = CUSOLVER.CuSolverParameters()
+    dh = CUSOLVER.dense_handle()
+
+    function bufferSize()
+        out_cpu = Ref{Csize_t}(0)
+        out_gpu = Ref{Csize_t}(0)
+        CUSOLVER.cusolverDnXgesvdp_bufferSize(dh, params, jobz, econ, m, n,
+                                              T, A, lda, R, S, T, Ũ, ldu, T, Ṽ, ldv,
+                                              T, out_gpu, out_cpu)
+
+        return out_gpu[], out_cpu[]
+    end
+    CUSOLVER.with_workspaces(dh.workspace_gpu, dh.workspace_cpu,
+                             bufferSize()...) do buffer_gpu, buffer_cpu
+        return CUSOLVER.cusolverDnXgesvdp(dh, params, jobz, econ, m, n,
+                                          T, A, lda, R, S, T, Ũ, ldu, T, Ṽ, ldv,
+                                          T, buffer_gpu, sizeof(buffer_gpu),
+                                          buffer_cpu, sizeof(buffer_cpu),
+                                          dh.info, h_err_sigma)
+    end
+    err = h_err_sigma[]
+    if err > tol
+        warn("Xgesvdp! did not attained requested tolerance: error = $err > tolerance = $tol")
+    end
+
+    flag = @allowscalar dh.info[1]
+    CUSOLVER.chklapackerror(BlasInt(flag))
+    if Ũ !== U && length(U) > 0
+        U .= view(Ũ, 1:m, 1:size(U, 2))
+    end
+    if length(Vᴴ) > 0
+        Vᴴ .= view(Ṽ', 1:size(Vᴴ, 1), 1:n)
+    end
+    Ũ !== U && CUDA.unsafe_free!(Ũ)
+    CUDA.unsafe_free!(Ṽ)
+
+    return S, U, Vᴴ
+end
+
+# Wrapper for SVD via Jacobi
+for (bname, fname, elty, relty) in
+    ((:cusolverDnSgesvdj_bufferSize, :cusolverDnSgesvdj, :Float32, :Float32),
+     (:cusolverDnDgesvdj_bufferSize, :cusolverDnDgesvdj, :Float64, :Float64),
+     (:cusolverDnCgesvdj_bufferSize, :cusolverDnCgesvdj, :ComplexF32, :Float32),
+     (:cusolverDnZgesvdj_bufferSize, :cusolverDnZgesvdj, :ComplexF64, :Float64))
+    @eval begin
+        #! format: off
+        function gesvdj!(A::StridedCuMatrix{$elty},
+                         S::StridedCuVector{$relty}=similar(A, $relty, min(size(A)...)),
+                         U::StridedCuMatrix{$elty}=similar(A, $elty, size(A, 1), min(size(A)...)),
+                         Vᴴ::StridedCuMatrix{$elty}=similar(A, $elty, min(size(A)...), size(A, 2));
+                         tol::$relty=eps($relty),
+                         max_sweeps::Int=100)
+        #! format: on
+            chkstride1(A, U, Vᴴ, S)
+            m, n = size(A)
+            minmn = min(m, n)
+
+            if length(U) == 0 && length(Vᴴ) == 0
+                jobz = 'N'
+                econ = 0
+            else
+                jobz = 'V'
+                size(U, 1) == m ||
+                    throw(DimensionMismatch("row size mismatch between A and U"))
+                size(Vᴴ, 2) == n ||
+                    throw(DimensionMismatch("column size mismatch between A and Vᴴ"))
+                if size(U, 2) == size(Vᴴ, 1) == minmn
+                    econ = 1
+                elseif size(U, 2) == m && size(Vᴴ, 1) == n
+                    econ = 0
+                else
+                    throw(DimensionMismatch("invalid column size of U or row size of Vᴴ"))
+                end
+            end
+            length(S) == minmn ||
+                throw(DimensionMismatch("length mismatch between A and S"))
+
+            Ṽ = (jobz == 'V') ? similar(Vᴴ') : similar(Vᴴ, (n, minmn))
+            Ũ = (jobz == 'V') ? U : similar(U, (m, minmn))
+            lda = max(1, stride(A, 2))
+            ldu = max(1, stride(Ũ, 2))
+            ldv = max(1, stride(Ṽ, 2))
+
+            params = Ref{CUSOLVER.gesvdjInfo_t}(C_NULL)
+            CUSOLVER.cusolverDnCreateGesvdjInfo(params)
+            CUSOLVER.cusolverDnXgesvdjSetTolerance(params[], tol)
+            CUSOLVER.cusolverDnXgesvdjSetMaxSweeps(params[], max_sweeps)
+            dh = CUSOLVER.dense_handle()
+
+            function bufferSize()
+                out = Ref{Cint}(0)
+                CUSOLVER.$bname(dh, jobz, econ, m, n, A, lda, S, Ũ, ldu, Ṽ, ldv,
+                                out, params[])
+                return out[] * sizeof($elty)
+            end
+
+            CUSOLVER.with_workspace(dh.workspace_gpu, bufferSize) do buffer
+                return CUSOLVER.$fname(dh, jobz, econ, m, n, A, lda, S, Ũ, ldu, Ṽ, ldv,
+                                       buffer, sizeof(buffer) ÷ sizeof($elty), dh.info,
+                                       params[])
+            end
+
+            info = @allowscalar dh.info[1]
+            CUSOLVER.chkargsok(BlasInt(info))
+
+            CUSOLVER.cusolverDnDestroyGesvdjInfo(params[])
+
+            if jobz == 'V'
+                adjoint!(Vᴴ, Ṽ)
+            end
+            return U, S, Vᴴ
+        end
+    end
+end
+
+# for (jname, bname, fname, elty, relty) in
+#     ((:sygvd!, :cusolverDnSsygvd_bufferSize, :cusolverDnSsygvd, :Float32, :Float32),
+#      (:sygvd!, :cusolverDnDsygvd_bufferSize, :cusolverDnDsygvd, :Float64, :Float64),
+#      (:hegvd!, :cusolverDnChegvd_bufferSize, :cusolverDnChegvd, :ComplexF32, :Float32),
+#      (:hegvd!, :cusolverDnZhegvd_bufferSize, :cusolverDnZhegvd, :ComplexF64, :Float64))
+#     @eval begin
+#         function $jname(itype::Int,
+#                         jobz::Char,
+#                         uplo::Char,
+#                         A::StridedCuMatrix{$elty},
+#                         B::StridedCuMatrix{$elty})
+#             chkuplo(uplo)
+#             nA, nB = checksquare(A, B)
+#             if nB != nA
+#                 throw(DimensionMismatch("Dimensions of A ($nA, $nA) and B ($nB, $nB) must match!"))
+#             end
+#             n = nA
+#             lda = max(1, stride(A, 2))
+#             ldb = max(1, stride(B, 2))
+#             W = CuArray{$relty}(undef, n)
+#             dh = dense_handle()
+
+#             function bufferSize()
+#                 out = Ref{Cint}(0)
+#                 $bname(dh, itype, jobz, uplo, n, A, lda, B, ldb, W, out)
+#                 return out[] * sizeof($elty)
+#             end
+
+#             with_workspace(dh.workspace_gpu, bufferSize) do buffer
+#                 return $fname(dh, itype, jobz, uplo, n, A, lda, B, ldb, W,
+#                               buffer, sizeof(buffer) ÷ sizeof($elty), dh.info)
+#             end
+
+#             info = @allowscalar dh.info[1]
+#             chkargsok(BlasInt(info))
+
+#             if jobz == 'N'
+#                 return W
+#             elseif jobz == 'V'
+#                 return W, A, B
+#             end
+#         end
+#     end
+# end
+
+# for (jname, bname, fname, elty, relty) in
+#     ((:sygvj!, :cusolverDnSsygvj_bufferSize, :cusolverDnSsygvj, :Float32, :Float32),
+#      (:sygvj!, :cusolverDnDsygvj_bufferSize, :cusolverDnDsygvj, :Float64, :Float64),
+#      (:hegvj!, :cusolverDnChegvj_bufferSize, :cusolverDnChegvj, :ComplexF32, :Float32),
+#      (:hegvj!, :cusolverDnZhegvj_bufferSize, :cusolverDnZhegvj, :ComplexF64, :Float64))
+#     @eval begin
+#         function $jname(itype::Int,
+#                         jobz::Char,
+#                         uplo::Char,
+#                         A::StridedCuMatrix{$elty},
+#                         B::StridedCuMatrix{$elty};
+#                         tol::$relty=eps($relty),
+#                         max_sweeps::Int=100)
+#             chkuplo(uplo)
+#             nA, nB = checksquare(A, B)
+#             if nB != nA
+#                 throw(DimensionMismatch("Dimensions of A ($nA, $nA) and B ($nB, $nB) must match!"))
+#             end
+#             n = nA
+#             lda = max(1, stride(A, 2))
+#             ldb = max(1, stride(B, 2))
+#             W = CuArray{$relty}(undef, n)
+#             params = Ref{syevjInfo_t}(C_NULL)
+#             cusolverDnCreateSyevjInfo(params)
+#             cusolverDnXsyevjSetTolerance(params[], tol)
+#             cusolverDnXsyevjSetMaxSweeps(params[], max_sweeps)
+#             dh = dense_handle()
+
+#             function bufferSize()
+#                 out = Ref{Cint}(0)
+#                 $bname(dh, itype, jobz, uplo, n, A, lda, B, ldb, W,
+#                        out, params[])
+#                 return out[] * sizeof($elty)
+#             end
+
+#             with_workspace(dh.workspace_gpu, bufferSize) do buffer
+#                 return $fname(dh, itype, jobz, uplo, n, A, lda, B, ldb, W,
+#                               buffer, sizeof(buffer) ÷ sizeof($elty), dh.info, params[])
+#             end
+
+#             info = @allowscalar dh.info[1]
+#             chkargsok(BlasInt(info))
+
+#             cusolverDnDestroySyevjInfo(params[])
+
+#             if jobz == 'N'
+#                 return W
+#             elseif jobz == 'V'
+#                 return W, A, B
+#             end
+#         end
+#     end
+# end
+
+# for (jname, bname, fname, elty, relty) in
+#     ((:syevjBatched!, :cusolverDnSsyevjBatched_bufferSize, :cusolverDnSsyevjBatched,
+#       :Float32, :Float32),
+#      (:syevjBatched!, :cusolverDnDsyevjBatched_bufferSize, :cusolverDnDsyevjBatched,
+#       :Float64, :Float64),
+#      (:heevjBatched!, :cusolverDnCheevjBatched_bufferSize, :cusolverDnCheevjBatched,
+#       :ComplexF32, :Float32),
+#      (:heevjBatched!, :cusolverDnZheevjBatched_bufferSize, :cusolverDnZheevjBatched,
+#       :ComplexF64, :Float64))
+#     @eval begin
+#         function $jname(jobz::Char,
+#                         uplo::Char,
+#                         A::StridedCuArray{$elty};
+#                         tol::$relty=eps($relty),
+#                         max_sweeps::Int=100)
+
+#             # Set up information for the solver arguments
+#             chkuplo(uplo)
+#             n = checksquare(A)
+#             lda = max(1, stride(A, 2))
+#             batchSize = size(A, 3)
+#             W = CuArray{$relty}(undef, n, batchSize)
+#             params = Ref{syevjInfo_t}(C_NULL)
+
+#             dh = dense_handle()
+#             resize!(dh.info, batchSize)
+
+#             # Initialize the solver parameters
+#             cusolverDnCreateSyevjInfo(params)
+#             cusolverDnXsyevjSetTolerance(params[], tol)
+#             cusolverDnXsyevjSetMaxSweeps(params[], max_sweeps)
+
+#             # Calculate the workspace size
+#             function bufferSize()
+#                 out = Ref{Cint}(0)
+#                 $bname(dh, jobz, uplo, n, A, lda, W, out, params[], batchSize)
+#                 return out[] * sizeof($elty)
+#             end
+
+#             # Run the solver
+#             with_workspace(dh.workspace_gpu, bufferSize) do buffer
+#                 return $fname(dh, jobz, uplo, n, A, lda, W, buffer,
+#                               sizeof(buffer) ÷ sizeof($elty), dh.info, params[], batchSize)
+#             end
+
+#             # Copy the solver info and delete the device memory
+#             info = @allowscalar collect(dh.info)
+
+#             # Double check the solver's exit status
+#             for i in 1:batchSize
+#                 chkargsok(BlasInt(info[i]))
+#             end
+
+#             cusolverDnDestroySyevjInfo(params[])
+
+#             # Return eigenvalues (in W) and possibly eigenvectors (in A)
+#             if jobz == 'N'
+#                 return W
+#             elseif jobz == 'V'
+#                 return W, A
+#             end
+#         end
+#     end
+# end
+
+# for (fname, elty) in ((:cusolverDnSpotrsBatched, :Float32),
+#                       (:cusolverDnDpotrsBatched, :Float64),
+#                       (:cusolverDnCpotrsBatched, :ComplexF32),
+#                       (:cusolverDnZpotrsBatched, :ComplexF64))
+#     @eval begin
+#         function potrsBatched!(uplo::Char,
+#                                A::Vector{<:StridedCuMatrix{$elty}},
+#                                B::Vector{<:StridedCuVecOrMat{$elty}})
+#             if length(A) != length(B)
+#                 throw(DimensionMismatch(""))
+#             end
+#             # Set up information for the solver arguments
+#             chkuplo(uplo)
+#             n = checksquare(A[1])
+#             if size(B[1], 1) != n
+#                 throw(DimensionMismatch("first dimension of B[i], $(size(B[1],1)), must match second dimension of A, $n"))
+#             end
+#             nrhs = size(B[1], 2)
+#             # cuSOLVER's Remark 1: only nrhs=1 is supported.
+#             if nrhs != 1
+#                 throw(ArgumentError("cuSOLVER only supports vectors for B"))
+#             end
+#             lda = max(1, stride(A[1], 2))
+#             ldb = max(1, stride(B[1], 2))
+#             batchSize = length(A)
+
+#             Aptrs = unsafe_batch(A)
+#             Bptrs = unsafe_batch(B)
+
+#             dh = dense_handle()
+
+#             # Run the solver
+#             $fname(dh, uplo, n, nrhs, Aptrs, lda, Bptrs, ldb, dh.info, batchSize)
+
+#             # Copy the solver info and delete the device memory
+#             info = @allowscalar dh.info[1]
+#             chklapackerror(BlasInt(info))
+
+#             return B
+#         end
+#     end
+# end
+
+# for (fname, elty) in ((:cusolverDnSpotrfBatched, :Float32),
+#                       (:cusolverDnDpotrfBatched, :Float64),
+#                       (:cusolverDnCpotrfBatched, :ComplexF32),
+#                       (:cusolverDnZpotrfBatched, :ComplexF64))
+#     @eval begin
+#         function potrfBatched!(uplo::Char, A::Vector{<:StridedCuMatrix{$elty}})
+
+#             # Set up information for the solver arguments
+#             chkuplo(uplo)
+#             n = checksquare(A[1])
+#             lda = max(1, stride(A[1], 2))
+#             batchSize = length(A)
+
+#             Aptrs = unsafe_batch(A)
+
+#             dh = dense_handle()
+#             resize!(dh.info, batchSize)
+
+#             # Run the solver
+#             $fname(dh, uplo, n, Aptrs, lda, dh.info, batchSize)
+
+#             # Copy the solver info and delete the device memory
+#             info = @allowscalar collect(dh.info)
+
+#             # Double check the solver's exit status
+#             for i in 1:batchSize
+#                 chkargsok(BlasInt(info[i]))
+#             end
+
+#             # info[i] > 0 means the leading minor of order info[i] is not positive definite
+#             # LinearAlgebra.LAPACK does not throw Exception here
+#             # to simplify calls to isposdef! and factorize
+#             return A, info
+#         end
+#     end
+# end
+
+# # gesv
+# function gesv!(X::CuVecOrMat{T}, A::CuMatrix{T}, B::CuVecOrMat{T}; fallback::Bool=true,
+#                residual_history::Bool=false, irs_precision::String="AUTO",
+#                refinement_solver::String="CLASSICAL",
+#                maxiters::Int=0, maxiters_inner::Int=0, tol::Float64=0.0,
+#                tol_inner=Float64 = 0.0) where {T<:BlasFloat}
+#     params = CuSolverIRSParameters()
+#     info = CuSolverIRSInformation()
+#     n = checksquare(A)
+#     nrhs = size(B, 2)
+#     lda = max(1, stride(A, 2))
+#     ldb = max(1, stride(B, 2))
+#     ldx = max(1, stride(X, 2))
+#     niters = Ref{Cint}()
+#     dh = dense_handle()
+
+#     if irs_precision == "AUTO"
+#         (T == Float32) && (irs_precision = "R_32F")
+#         (T == Float64) && (irs_precision = "R_64F")
+#         (T == ComplexF32) && (irs_precision = "C_32F")
+#         (T == ComplexF64) && (irs_precision = "C_64F")
+#     else
+#         (T == Float32) && (irs_precision ∈ ("R_32F", "R_16F", "R_16BF", "R_TF32") ||
+#                            error("$irs_precision is not supported."))
+#         (T == Float64) &&
+#             (irs_precision ∈ ("R_64F", "R_32F", "R_16F", "R_16BF", "R_TF32") ||
+#              error("$irs_precision is not supported."))
+#         (T == ComplexF32) && (irs_precision ∈ ("C_32F", "C_16F", "C_16BF", "C_TF32") ||
+#                               error("$irs_precision is not supported."))
+#         (T == ComplexF64) &&
+#             (irs_precision ∈ ("C_64F", "C_32F", "C_16F", "C_16BF", "C_TF32") ||
+#              error("$irs_precision is not supported."))
+#     end
+#     cusolverDnIRSParamsSetSolverMainPrecision(params, T)
+#     cusolverDnIRSParamsSetSolverLowestPrecision(params, irs_precision)
+#     cusolverDnIRSParamsSetRefinementSolver(params, refinement_solver)
+#     (tol != 0.0) && cusolverDnIRSParamsSetTol(params, tol)
+#     (tol_inner != 0.0) && cusolverDnIRSParamsSetTolInner(params, tol_inner)
+#     (maxiters != 0) && cusolverDnIRSParamsSetMaxIters(params, maxiters)
+#     (maxiters_inner != 0) && cusolverDnIRSParamsSetMaxItersInner(params, maxiters_inner)
+#     fallback ? cusolverDnIRSParamsEnableFallback(params) :
+#     cusolverDnIRSParamsDisableFallback(params)
+#     residual_history && cusolverDnIRSInfosRequestResidual(info)
+
+#     function bufferSize()
+#         buffer_size = Ref{Csize_t}(0)
+#         cusolverDnIRSXgesv_bufferSize(dh, params, n, nrhs, buffer_size)
+#         return buffer_size[]
+#     end
+
+#     with_workspace(dh.workspace_gpu, bufferSize) do buffer
+#         return cusolverDnIRSXgesv(dh, params, info, n, nrhs, A, lda, B, ldb,
+#                                   X, ldx, buffer, sizeof(buffer), niters, dh.info)
+#     end
+
+#     # Copy the solver flag and delete the device memory
+#     flag = @allowscalar dh.info[1]
+#     chklapackerror(BlasInt(flag))
+
+#     return X, info
+# end
+
+# for (jname, bname, fname, elty, relty) in
+#     ((:syevd!, :cusolverDnSsyevd_bufferSize, :cusolverDnSsyevd, :Float32, :Float32),
+#      (:syevd!, :cusolverDnDsyevd_bufferSize, :cusolverDnDsyevd, :Float64, :Float64),
+#      (:heevd!, :cusolverDnCheevd_bufferSize, :cusolverDnCheevd, :ComplexF32, :Float32),
+#      (:heevd!, :cusolverDnZheevd_bufferSize, :cusolverDnZheevd, :ComplexF64, :Float64))
+#     @eval begin
+#         function $jname(jobz::Char,
+#                         uplo::Char,
+#                         A::StridedCuMatrix{$elty})
+#             chkuplo(uplo)
+#             n = checksquare(A)
+#             lda = max(1, stride(A, 2))
+#             W = CuArray{$relty}(undef, n)
+#             dh = dense_handle()
+
+#             function bufferSize()
+#                 out = Ref{Cint}(0)
+#                 $bname(dh, jobz, uplo, n, A, lda, W, out)
+#                 return out[] * sizeof($elty)
+#             end
+
+#             with_workspace(dh.workspace_gpu, bufferSize) do buffer
+#                 return $fname(dh, jobz, uplo, n, A, lda, W,
+#                               buffer, sizeof(buffer) ÷ sizeof($elty), dh.info)
+#             end
+
+#             info = @allowscalar dh.info[1]
+#             chkargsok(BlasInt(info))
+
+#             if jobz == 'N'
+#                 return W
+#             elseif jobz == 'V'
+#                 return W, A
+#             end
+#         end
+#     end
+# end
+
+end
\ No newline at end of file
diff --git a/src/MatrixAlgebraKit.jl b/src/MatrixAlgebraKit.jl
index 567c1510..5f872bb5 100644
--- a/src/MatrixAlgebraKit.jl
+++ b/src/MatrixAlgebraKit.jl
@@ -2,12 +2,12 @@ module MatrixAlgebraKit
 
 using LinearAlgebra: LinearAlgebra
 using LinearAlgebra: norm # TODO: eleminate if we use VectorInterface.jl?
-using LinearAlgebra: mul!, rmul!, lmul!
+using LinearAlgebra: mul!, rmul!, lmul!, adjoint!, rdiv!, ldiv!
 using LinearAlgebra: sylvester
 using LinearAlgebra: isposdef, ishermitian
 using LinearAlgebra: Diagonal, diag, diagind
 using LinearAlgebra: UpperTriangular, LowerTriangular
-using LinearAlgebra: BlasFloat, BlasReal, BlasComplex, BlasInt, triu!, tril!, rdiv!, ldiv!
+using LinearAlgebra: BlasFloat, BlasReal, BlasComplex, BlasInt
 
 export isisometry, isunitary
 
@@ -29,7 +29,9 @@ export left_orth!, right_orth!, left_null!, right_null!
 export LAPACK_HouseholderQR, LAPACK_HouseholderLQ,
        LAPACK_Simple, LAPACK_Expert,
        LAPACK_QRIteration, LAPACK_Bisection, LAPACK_MultipleRelativelyRobustRepresentations,
-       LAPACK_DivideAndConquer, LAPACK_Jacobi
+       LAPACK_DivideAndConquer, LAPACK_Jacobi,
+       LQViaTransposedQR,
+       CUSOLVER_HouseholderQR, CUSOLVER_QRIteration, CUSOLVER_SVDPolar, CUSOLVER_Jacobi
 export truncrank, trunctol, truncabove, TruncationKeepSorted, TruncationKeepFiltered
 
 VERSION >= v"1.11.0-DEV.469" &&
@@ -46,6 +48,7 @@ include("common/matrixproperties.jl")
 
 include("yalapack.jl")
 include("algorithms.jl")
+include("interface/decompositions.jl")
 include("interface/qr.jl")
 include("interface/lq.jl")
 include("interface/svd.jl")
@@ -55,7 +58,6 @@ include("interface/schur.jl")
 include("interface/polar.jl")
 include("interface/orthnull.jl")
 
-include("implementations/decompositions.jl")
 include("implementations/truncation.jl")
 include("implementations/qr.jl")
 include("implementations/lq.jl")
diff --git a/src/algorithms.jl b/src/algorithms.jl
index af183103..7dbfe46e 100644
--- a/src/algorithms.jl
+++ b/src/algorithms.jl
@@ -146,7 +146,7 @@ macro algdef(name)
                 return $name{typeof(kw)}(kw)
             end
             function Base.show(io::IO, alg::$name)
-                return _show_alg(io, alg)
+                return ($_show_alg)(io, alg)
             end
 
             Core.@__doc__ $name
diff --git a/src/common/initialization.jl b/src/common/initialization.jl
index 6a5cba78..b8bc5faf 100644
--- a/src/common/initialization.jl
+++ b/src/common/initialization.jl
@@ -1,11 +1,32 @@
 # TODO: Consider using zerovector! if using VectorInterface.jl
-function zero!(A::AbstractMatrix)
+function zero!(A::AbstractArray)
     A .= zero(eltype(A))
     return A
 end
 
 function one!(A::AbstractMatrix)
     length(A) > 0 || return A
-    copyto!(A, LinearAlgebra.I)
+    zero!(A)
+    diagview(A) .= one(eltype(A))
     return A
 end
+
+function uppertriangular!(A::AbstractMatrix)
+    Base.require_one_based_indexing(A)
+    m, n = size(A)
+    for i in 1:n
+        r = (i + 1):m
+        zero!(view(A, r, i))
+    end
+    return A
+end
+
+function lowertriangular!(A::AbstractMatrix)
+    Base.require_one_based_indexing(A)
+    m, n = size(A)
+    for i in 2:n
+        r = 1:min(i - 1, m)
+        zero!(view(A, r, i))
+    end
+    return A
+end
\ No newline at end of file
diff --git a/src/implementations/eig.jl b/src/implementations/eig.jl
index 6a48e195..6d0efcca 100644
--- a/src/implementations/eig.jl
+++ b/src/implementations/eig.jl
@@ -30,14 +30,14 @@ end
 
 # Outputs
 # -------
-function initialize_output(::typeof(eig_full!), A::AbstractMatrix, ::LAPACK_EigAlgorithm)
+function initialize_output(::typeof(eig_full!), A::AbstractMatrix, ::AbstractAlgorithm)
     n = size(A, 1) # square check will happen later
     Tc = complex(eltype(A))
     D = Diagonal(similar(A, Tc, n))
     V = similar(A, Tc, (n, n))
     return (D, V)
 end
-function initialize_output(::typeof(eig_vals!), A::AbstractMatrix, ::LAPACK_EigAlgorithm)
+function initialize_output(::typeof(eig_vals!), A::AbstractMatrix, ::AbstractAlgorithm)
     n = size(A, 1) # square check will happen later
     Tc = complex(eltype(A))
     D = similar(A, Tc, n)
diff --git a/src/implementations/eigh.jl b/src/implementations/eigh.jl
index a1c6c779..5178748e 100644
--- a/src/implementations/eigh.jl
+++ b/src/implementations/eigh.jl
@@ -29,13 +29,13 @@ end
 
 # Outputs
 # -------
-function initialize_output(::typeof(eigh_full!), A::AbstractMatrix, ::LAPACK_EighAlgorithm)
+function initialize_output(::typeof(eigh_full!), A::AbstractMatrix, ::AbstractAlgorithm)
     n = size(A, 1) # square check will happen later
     D = Diagonal(similar(A, real(eltype(A)), n))
     V = similar(A, (n, n))
     return (D, V)
 end
-function initialize_output(::typeof(eigh_vals!), A::AbstractMatrix, ::LAPACK_EighAlgorithm)
+function initialize_output(::typeof(eigh_vals!), A::AbstractMatrix, ::AbstractAlgorithm)
     n = size(A, 1) # square check will happen later
     D = similar(A, real(eltype(A)), n)
     return D
diff --git a/src/implementations/lq.jl b/src/implementations/lq.jl
index 165c63b5..6c81ca12 100644
--- a/src/implementations/lq.jl
+++ b/src/implementations/lq.jl
@@ -42,20 +42,20 @@ end
 
 # Outputs
 # -------
-function initialize_output(::typeof(lq_full!), A::AbstractMatrix, ::LAPACK_HouseholderLQ)
+function initialize_output(::typeof(lq_full!), A::AbstractMatrix, ::AbstractAlgorithm)
     m, n = size(A)
     L = similar(A, (m, n))
     Q = similar(A, (n, n))
     return (L, Q)
 end
-function initialize_output(::typeof(lq_compact!), A::AbstractMatrix, ::LAPACK_HouseholderLQ)
+function initialize_output(::typeof(lq_compact!), A::AbstractMatrix, ::AbstractAlgorithm)
     m, n = size(A)
     minmn = min(m, n)
     L = similar(A, (m, minmn))
     Q = similar(A, (minmn, n))
     return (L, Q)
 end
-function initialize_output(::typeof(lq_null!), A::AbstractMatrix, ::LAPACK_HouseholderLQ)
+function initialize_output(::typeof(lq_null!), A::AbstractMatrix, ::AbstractAlgorithm)
     m, n = size(A)
     minmn = min(m, n)
     Nᴴ = similar(A, (n - minmn, n))
@@ -71,17 +71,34 @@ function lq_full!(A::AbstractMatrix, LQ, alg::LAPACK_HouseholderLQ)
     _lapack_lq!(A, L, Q; alg.kwargs...)
     return L, Q
 end
+function lq_full!(A::AbstractMatrix, LQ, alg::LQViaTransposedQR)
+    check_input(lq_full!, A, LQ)
+    L, Q = LQ
+    lq_via_qr!(A, L, Q, alg.qr_alg)
+    return L, Q
+end
 function lq_compact!(A::AbstractMatrix, LQ, alg::LAPACK_HouseholderLQ)
     check_input(lq_compact!, A, LQ)
     L, Q = LQ
     _lapack_lq!(A, L, Q; alg.kwargs...)
     return L, Q
 end
+function lq_compact!(A::AbstractMatrix, LQ, alg::LQViaTransposedQR)
+    check_input(lq_compact!, A, LQ)
+    L, Q = LQ
+    lq_via_qr!(A, L, Q, alg.qr_alg)
+    return L, Q
+end
 function lq_null!(A::AbstractMatrix, Nᴴ, alg::LAPACK_HouseholderLQ)
     check_input(lq_null!, A, Nᴴ)
     _lapack_lq_null!(A, Nᴴ; alg.kwargs...)
     return Nᴴ
 end
+function lq_null!(A::AbstractMatrix, Nᴴ, alg::LQViaTransposedQR)
+    check_input(lq_null!, A, Nᴴ)
+    lq_null_via_qr!(A, Nᴴ, alg.qr_alg)
+    return Nᴴ
+end
 
 function _lapack_lq!(A::AbstractMatrix, L::AbstractMatrix, Q::AbstractMatrix;
                      positive=false,
@@ -126,7 +143,7 @@ function _lapack_lq!(A::AbstractMatrix, L::AbstractMatrix, Q::AbstractMatrix;
     end
 
     if computeL
-        L̃ = tril!(view(A, axes(L)...))
+        L̃ = lowertriangular!(view(A, axes(L)...))
         if positive
             @inbounds for j in 1:minmn
                 s = conj(sign_safe(L̃[j, j]))
@@ -158,3 +175,31 @@ function _lapack_lq_null!(A::AbstractMatrix, Nᴴ::AbstractMatrix;
     end
     return Nᴴ
 end
+
+# LQ via transposition and QR
+function lq_via_qr!(A::AbstractMatrix, L::AbstractMatrix, Q::AbstractMatrix,
+                    qr_alg::AbstractAlgorithm)
+    m, n = size(A)
+    minmn = min(m, n)
+    At = adjoint!(similar(A'), A)::AbstractMatrix
+    Qt = (A === Q) ? At : similar(Q')
+    Lt = similar(L')
+    if size(Q) == (n, n)
+        Qt, Lt = qr_full!(At, (Qt, Lt), qr_alg)
+    else
+        Qt, Lt = qr_compact!(At, (Qt, Lt), qr_alg)
+    end
+    adjoint!(Q, Qt)
+    !isempty(L) && adjoint!(L, Lt)
+    return L, Q
+end
+
+function lq_null_via_qr!(A::AbstractMatrix, N::AbstractMatrix, qr_alg::AbstractAlgorithm)
+    m, n = size(A)
+    minmn = min(m, n)
+    At = adjoint!(similar(A'), A)::AbstractMatrix
+    Nt = similar(N')
+    Nt = qr_null!(At, Nt, qr_alg)
+    !isempty(N) && adjoint!(N, Nt)
+    return N
+end
diff --git a/src/implementations/polar.jl b/src/implementations/polar.jl
index 2604aab5..fabb261c 100644
--- a/src/implementations/polar.jl
+++ b/src/implementations/polar.jl
@@ -30,13 +30,13 @@ end
 
 # Outputs
 # -------
-function initialize_output(::typeof(left_polar!), A::AbstractMatrix, ::PolarViaSVD)
+function initialize_output(::typeof(left_polar!), A::AbstractMatrix, ::AbstractAlgorithm)
     m, n = size(A)
     W = similar(A)
     P = similar(A, (n, n))
     return (W, P)
 end
-function initialize_output(::typeof(right_polar!), A::AbstractMatrix, ::PolarViaSVD)
+function initialize_output(::typeof(right_polar!), A::AbstractMatrix, ::AbstractAlgorithm)
     m, n = size(A)
     P = similar(A, (m, m))
     Wᴴ = similar(A)
diff --git a/src/implementations/qr.jl b/src/implementations/qr.jl
index 7e2e13ea..1d30b4b1 100644
--- a/src/implementations/qr.jl
+++ b/src/implementations/qr.jl
@@ -42,20 +42,20 @@ end
 
 # Outputs
 # -------
-function initialize_output(::typeof(qr_full!), A::AbstractMatrix, ::LAPACK_HouseholderQR)
+function initialize_output(::typeof(qr_full!), A::AbstractMatrix, ::AbstractAlgorithm)
     m, n = size(A)
     Q = similar(A, (m, m))
     R = similar(A, (m, n))
     return (Q, R)
 end
-function initialize_output(::typeof(qr_compact!), A::AbstractMatrix, ::LAPACK_HouseholderQR)
+function initialize_output(::typeof(qr_compact!), A::AbstractMatrix, ::AbstractAlgorithm)
     m, n = size(A)
     minmn = min(m, n)
     Q = similar(A, (m, minmn))
     R = similar(A, (minmn, n))
     return (Q, R)
 end
-function initialize_output(::typeof(qr_null!), A::AbstractMatrix, ::LAPACK_HouseholderQR)
+function initialize_output(::typeof(qr_null!), A::AbstractMatrix, ::AbstractAlgorithm)
     m, n = size(A)
     minmn = min(m, n)
     N = similar(A, (m, m - minmn))
@@ -130,7 +130,7 @@ function _lapack_qr!(A::AbstractMatrix, Q::AbstractMatrix, R::AbstractMatrix;
     end
 
     if computeR
-        R̃ = triu!(view(A, axes(R)...))
+        R̃ = uppertriangular!(view(A, axes(R)...))
         if positive
             @inbounds for j in n:-1:1
                 @simd for i in 1:min(minmn, j)
diff --git a/src/implementations/schur.jl b/src/implementations/schur.jl
index 55a1bdfa..541ae97f 100644
--- a/src/implementations/schur.jl
+++ b/src/implementations/schur.jl
@@ -28,13 +28,13 @@ end
 
 # Outputs
 # -------
-function initialize_output(::typeof(schur_full!), A::AbstractMatrix, ::LAPACK_EigAlgorithm)
+function initialize_output(::typeof(schur_full!), A::AbstractMatrix, ::AbstractAlgorithm)
     n = size(A, 1) # square check will happen later
     Z = similar(A, (n, n))
     vals = similar(A, complex(eltype(A)), n)
     return (A, Z, vals)
 end
-function initialize_output(::typeof(schur_vals!), A::AbstractMatrix, ::LAPACK_EigAlgorithm)
+function initialize_output(::typeof(schur_vals!), A::AbstractMatrix, ::AbstractAlgorithm)
     n = size(A, 1) # square check will happen later
     vals = similar(A, complex(eltype(A)), n)
     return vals
diff --git a/src/implementations/svd.jl b/src/implementations/svd.jl
index a7d4c9e5..031494f8 100644
--- a/src/implementations/svd.jl
+++ b/src/implementations/svd.jl
@@ -44,14 +44,14 @@ end
 
 # Outputs
 # -------
-function initialize_output(::typeof(svd_full!), A::AbstractMatrix, ::LAPACK_SVDAlgorithm)
+function initialize_output(::typeof(svd_full!), A::AbstractMatrix, ::AbstractAlgorithm)
     m, n = size(A)
     U = similar(A, (m, m))
     S = similar(A, real(eltype(A)), (m, n)) # TODO: Rectangular diagonal type?
     Vᴴ = similar(A, (n, n))
     return (U, S, Vᴴ)
 end
-function initialize_output(::typeof(svd_compact!), A::AbstractMatrix, ::LAPACK_SVDAlgorithm)
+function initialize_output(::typeof(svd_compact!), A::AbstractMatrix, ::AbstractAlgorithm)
     m, n = size(A)
     minmn = min(m, n)
     U = similar(A, (m, minmn))
@@ -59,7 +59,7 @@ function initialize_output(::typeof(svd_compact!), A::AbstractMatrix, ::LAPACK_S
     Vᴴ = similar(A, (minmn, n))
     return (U, S, Vᴴ)
 end
-function initialize_output(::typeof(svd_vals!), A::AbstractMatrix, ::LAPACK_SVDAlgorithm)
+function initialize_output(::typeof(svd_vals!), A::AbstractMatrix, ::AbstractAlgorithm)
     return similar(A, real(eltype(A)), (min(size(A)...),))
 end
 function initialize_output(::typeof(svd_trunc!), A::AbstractMatrix, alg::TruncatedAlgorithm)
diff --git a/src/implementations/decompositions.jl b/src/interface/decompositions.jl
similarity index 74%
rename from src/implementations/decompositions.jl
rename to src/interface/decompositions.jl
index d886588a..bff490a9 100644
--- a/src/implementations/decompositions.jl
+++ b/src/interface/decompositions.jl
@@ -1,8 +1,8 @@
 # TODO: module Decompositions?
 
-# ==========
-# ALGORITHMS
-# ==========
+# =================
+# LAPACK ALGORITHMS
+# =================
 
 # reference for naming LAPACK algorithms:
 # https://www.netlib.org/lapack/explore-html/topics.html
@@ -112,3 +112,41 @@ const LAPACK_SVDAlgorithm = Union{LAPACK_QRIteration,
                                   LAPACK_Bisection,
                                   LAPACK_DivideAndConquer,
                                   LAPACK_Jacobi}
+
+# =========================
+# CUSOLVER ALGORITHMS
+# =========================
+"""
+    CUSOLVER_HouseholderQR(; positive = false)
+
+Algorithm type to denote the standard CUSOLVER algorithm for computing the QR decomposition of
+a matrix using Householder reflectors. The keyword `positive=true` can be used to ensure that
+the diagonal elements of `R` are non-negative.
+"""
+@algdef CUSOLVER_HouseholderQR
+
+"""
+    CUSOLVER_QRIteration()
+
+Algorithm type to denote the CUSOLVER driver for computing the eigenvalue decomposition of a
+Hermitian matrix, or the singular value decomposition of a general matrix using the
+QR Iteration algorithm.
+"""
+@algdef CUSOLVER_QRIteration
+
+"""
+    CUSOLVER_SVDPolar()
+
+Algorithm type to denote the CUSOLVER driver for computing the singular value decomposition of
+a general matrix by using Halley's iterative algorithm to compute the polar decompositon,
+followed by the hermitian eigenvalue decomposition of the positive definite factor.
+"""
+@algdef CUSOLVER_SVDPolar
+
+"""
+    CUSOLVER_Jacobi()
+
+Algorithm type to denote the CUSOLVER driver for computing the singular value decomposition of
+a general matrix using the Jacobi algorithm.
+"""
+@algdef CUSOLVER_Jacobi
diff --git a/src/interface/lq.jl b/src/interface/lq.jl
index 6f1ed12f..338302ef 100644
--- a/src/interface/lq.jl
+++ b/src/interface/lq.jl
@@ -81,3 +81,13 @@ for f in (:lq_full!, :lq_compact!, :lq_null!)
         return default_lq_algorithm(A; kwargs...)
     end
 end
+
+# Alternative algorithm (necessary for CUDA)
+struct LQViaTransposedQR{A<:AbstractAlgorithm} <: AbstractAlgorithm
+    qr_alg::A
+end
+function Base.show(io::IO, alg::LQViaTransposedQR)
+    print(io, "LQViaTransposedQR(")
+    _show_alg(io, alg.qr_alg)
+    return print(io, ")")
+end
diff --git a/test/cuda/lq.jl b/test/cuda/lq.jl
new file mode 100644
index 00000000..cc1def1c
--- /dev/null
+++ b/test/cuda/lq.jl
@@ -0,0 +1,117 @@
+using MatrixAlgebraKit
+using MatrixAlgebraKit: diagview
+using Test
+using TestExtras
+using StableRNGs
+using CUDA
+
+include("utilities.jl")
+
+@testset "lq_compact! for T = $T" for T in (Float32, Float64, ComplexF32, ComplexF64)
+    rng = StableRNG(123)
+    m = 54
+    for n in (37, m, 63)
+        minmn = min(m, n)
+        A = CuArray(randn(rng, T, m, n))
+        L, Q = @constinferred lq_compact(A)
+        @test L isa CuMatrix{T} && size(L) == (m, minmn)
+        @test Q isa CuMatrix{T} && size(Q) == (minmn, n)
+        @test L * Q ≈ A
+        @test isapproxone(Q * Q')
+        Nᴴ = @constinferred lq_null(A)
+        @test Nᴴ isa CuMatrix{T} && size(Nᴴ) == (n - minmn, n)
+        @test maximum(abs, A * Nᴴ') < eps(real(T))^(2 / 3)
+        @test isapproxone(Nᴴ * Nᴴ')
+
+        Ac = similar(A)
+        L2, Q2 = @constinferred lq_compact!(copy!(Ac, A), (L, Q))
+        @test L2 === L
+        @test Q2 === Q
+        Nᴴ2 = @constinferred lq_null!(copy!(Ac, A), Nᴴ)
+        @test Nᴴ2 === Nᴴ
+
+        # noL
+        noL = similar(A, 0, minmn)
+        Q2 = similar(Q)
+        lq_compact!(copy!(Ac, A), (noL, Q2))
+        @test Q == Q2
+
+        # positive
+        lq_compact!(copy!(Ac, A), (L, Q); positive=true)
+        @test L * Q ≈ A
+        @test isapproxone(Q * Q')
+        @test all(>=(zero(real(T))), real(diagview(L)))
+        lq_compact!(copy!(Ac, A), (noL, Q2); positive=true)
+        @test Q == Q2
+
+        # explicit blocksize
+        lq_compact!(copy!(Ac, A), (L, Q); blocksize=1)
+        @test L * Q ≈ A
+        @test isapproxone(Q * Q')
+        lq_compact!(copy!(Ac, A), (noL, Q2); blocksize=1)
+        @test Q == Q2
+        lq_null!(copy!(Ac, A), Nᴴ; blocksize=1)
+        @test maximum(abs, A * Nᴴ') < eps(real(T))^(2 / 3)
+        @test isapproxone(Nᴴ * Nᴴ')
+        if m <= n
+            lq_compact!(copy!(Q2, A), (noL, Q2); blocksize=1) # in-place Q
+            @test Q ≈ Q2
+            # these do not work because of the in-place Q
+            @test_throws ArgumentError lq_compact!(copy!(Q2, A), (L, Q2); blocksize=1)
+            @test_throws ArgumentError lq_compact!(copy!(Q2, A), (noL, Q2); positive=true)
+        end
+        # no blocked CUDA
+        @test_throws ArgumentError lq_compact!(copy!(Q2, A), (L, Q2); blocksize=8)
+    end
+end
+
+@testset "lq_full! for T = $T" for T in (Float32, Float64, ComplexF32, ComplexF64)
+    rng = StableRNG(123)
+    m = 54
+    for n in (37, m, 63)
+        minmn = min(m, n)
+        A = CuArray(randn(rng, T, m, n))
+        L, Q = lq_full(A)
+        @test L isa CuMatrix{T} && size(L) == (m, n)
+        @test Q isa CuMatrix{T} && size(Q) == (n, n)
+        @test L * Q ≈ A
+        @test isapproxone(Q * Q')
+
+        Ac = similar(A)
+        L2, Q2 = @constinferred lq_full!(copy!(Ac, A), (L, Q))
+        @test L2 === L
+        @test Q2 === Q
+        @test L * Q ≈ A
+        @test isapproxone(Q * Q')
+
+        # noL
+        noL = similar(A, 0, n)
+        Q2 = similar(Q)
+        lq_full!(copy!(Ac, A), (noL, Q2))
+        @test Q == Q2
+
+        # positive
+        lq_full!(copy!(Ac, A), (L, Q); positive=true)
+        @test L * Q ≈ A
+        @test isapproxone(Q * Q')
+        @test all(>=(zero(real(T))), real(diagview(L)))
+        lq_full!(copy!(Ac, A), (noL, Q2); positive=true)
+        @test Q == Q2
+
+        # explicit blocksize
+        lq_full!(copy!(Ac, A), (L, Q); blocksize=1)
+        @test L * Q ≈ A
+        @test isapproxone(Q * Q')
+        lq_full!(copy!(Ac, A), (noL, Q2); blocksize=1)
+        @test Q == Q2
+        if n == m
+            lq_full!(copy!(Q2, A), (noL, Q2); blocksize=1) # in-place Q
+            @test Q ≈ Q2
+            # these do not work because of the in-place Q
+            @test_throws ArgumentError lq_full!(copy!(Q2, A), (L, Q2); blocksize=1)
+            @test_throws ArgumentError lq_full!(copy!(Q2, A), (noL, Q2); positive=true)
+        end
+        # no blocked CUDA
+        @test_throws ArgumentError lq_full!(copy!(Ac, A), (L, Q); blocksize=8)
+    end
+end
diff --git a/test/cuda/qr.jl b/test/cuda/qr.jl
new file mode 100644
index 00000000..4ada4565
--- /dev/null
+++ b/test/cuda/qr.jl
@@ -0,0 +1,122 @@
+using MatrixAlgebraKit
+using MatrixAlgebraKit: diagview
+using Test
+using TestExtras
+using StableRNGs
+using CUDA
+
+include("utilities.jl")
+
+@testset "qr_compact! and qr_null! for T = $T" for T in (Float32, Float64, ComplexF32,
+                                                         ComplexF64)
+    rng = StableRNG(123)
+    m = 54
+    for n in (37, m, 63)
+        minmn = min(m, n)
+        A = CuArray(randn(rng, T, m, n))
+        Q, R = @constinferred qr_compact(A)
+        @test Q isa CuMatrix{T} && size(Q) == (m, minmn)
+        @test R isa CuMatrix{T} && size(R) == (minmn, n)
+        @test Q * R ≈ A
+        N = @constinferred qr_null(A)
+        @test N isa CuMatrix{T} && size(N) == (m, m - minmn)
+        @test isapproxone(Q' * Q)
+        @test maximum(abs, A' * N) < eps(real(T))^(2 / 3)
+        @test isapproxone(N' * N)
+
+        Ac = similar(A)
+        Q2, R2 = @constinferred qr_compact!(copy!(Ac, A), (Q, R))
+        @test Q2 === Q
+        @test R2 === R
+        N2 = @constinferred qr_null!(copy!(Ac, A), N)
+        @test N2 === N
+
+        # noR
+        Q2 = similar(Q)
+        noR = similar(A, minmn, 0)
+        qr_compact!(copy!(Ac, A), (Q2, noR))
+        @test Q == Q2
+
+        # positive
+        qr_compact!(copy!(Ac, A), (Q, R); positive=true)
+        @test Q * R ≈ A
+        @test isapproxone(Q' * Q)
+        @test all(>=(zero(real(T))), real(diagview(R)))
+        qr_compact!(copy!(Ac, A), (Q2, noR); positive=true)
+        @test Q == Q2
+
+        # explicit blocksize
+        qr_compact!(copy!(Ac, A), (Q, R); blocksize=1)
+        @test Q * R ≈ A
+        @test isapproxone(Q' * Q)
+        qr_compact!(copy!(Ac, A), (Q2, noR); blocksize=1)
+        @test Q == Q2
+        qr_compact!(copy!(Ac, A), (Q2, noR); blocksize=1)
+        qr_null!(copy!(Ac, A), N; blocksize=1)
+        @test maximum(abs, A' * N) < eps(real(T))^(2 / 3)
+        @test isapproxone(N' * N)
+        if n <= m
+            qr_compact!(copy!(Q2, A), (Q2, noR); blocksize=1) # in-place Q
+            @test Q ≈ Q2
+            # these do not work because of the in-place Q
+            @test_throws ArgumentError qr_compact!(copy!(Q2, A), (Q2, R2))
+            @test_throws ArgumentError qr_compact!(copy!(Q2, A), (Q2, noR); positive=true)
+        end
+        # no blocked CUDA
+        @test_throws ArgumentError qr_compact!(copy!(Ac, A), (Q2, R); blocksize=8)
+    end
+end
+
+@testset "qr_full! for T = $T" for T in (Float32, Float64, ComplexF32, ComplexF64)
+    rng = StableRNG(123)
+    m = 63
+    for n in (37, m, 63)
+        minmn = min(m, n)
+        A = CuArray(randn(rng, T, m, n))
+        Q, R = qr_full(A)
+        @test Q isa CuMatrix{T} && size(Q) == (m, m)
+        @test R isa CuMatrix{T} && size(R) == (m, n)
+        @test Q * R ≈ A
+        @test isapproxone(Q' * Q)
+
+        Ac = similar(A)
+        Q2 = similar(Q)
+        noR = similar(A, m, 0)
+        Q2, R2 = @constinferred qr_full!(copy!(Ac, A), (Q, R))
+        @test Q2 === Q
+        @test R2 === R
+        @test Q * R ≈ A
+        @test isapproxone(Q' * Q)
+        qr_full!(copy!(Ac, A), (Q2, noR))
+        @test Q == Q2
+
+        # noR
+        noR = similar(A, m, 0)
+        Q2 = similar(Q)
+        qr_full!(copy!(Ac, A), (Q2, noR))
+        @test Q == Q2
+
+        # positive
+        qr_full!(copy!(Ac, A), (Q, R); positive=true)
+        @test Q * R ≈ A
+        @test isapproxone(Q' * Q)
+        @test all(>=(zero(real(T))), real(diagview(R)))
+        qr_full!(copy!(Ac, A), (Q2, noR); positive=true)
+        @test Q == Q2
+
+        # explicit blocksize
+        qr_full!(copy!(Ac, A), (Q, R); blocksize=1)
+        @test Q * R ≈ A
+        @test isapproxone(Q' * Q)
+        qr_full!(copy!(Ac, A), (Q2, noR); blocksize=1)
+        @test Q == Q2
+        if n == m
+            qr_full!(copy!(Q2, A), (Q2, noR); blocksize=1) # in-place Q
+            @test Q ≈ Q2
+            @test_throws ArgumentError qr_full!(copy!(Q2, A), (Q2, R2))
+            @test_throws ArgumentError qr_full!(copy!(Q2, A), (Q2, noR); positive=true)
+        end
+        # no blocked CUDA
+        @test_throws ArgumentError qr_full!(copy!(Ac, A), (Q, R); blocksize=8)
+    end
+end
diff --git a/test/cuda/svd.jl b/test/cuda/svd.jl
new file mode 100644
index 00000000..66098c4b
--- /dev/null
+++ b/test/cuda/svd.jl
@@ -0,0 +1,120 @@
+using MatrixAlgebraKit
+using MatrixAlgebraKit: diagview
+using LinearAlgebra: Diagonal, isposdef
+using Test
+using TestExtras
+using StableRNGs
+using CUDA
+
+include("utilities.jl")
+
+@testset "svd_compact! for T = $T" for T in (Float32, Float64, ComplexF32, ComplexF64)
+    rng = StableRNG(123)
+    m = 54
+    @testset "size ($m, $n)" for n in (37, m, 63)
+        k = min(m, n)
+        algs = (CUSOLVER_QRIteration(), CUSOLVER_SVDPolar(), CUSOLVER_Jacobi())
+        @testset "algorithm $alg" for alg in algs
+            n > m && alg isa CUSOLVER_QRIteration && continue # not supported
+            minmn = min(m, n)
+            A = CuArray(randn(rng, T, m, n))
+
+            U, S, Vᴴ = svd_compact(A; alg)
+            @test U isa CuMatrix{T} && size(U) == (m, minmn)
+            @test S isa Diagonal{real(T),<:CuVector} && size(S) == (minmn, minmn)
+            @test Vᴴ isa CuMatrix{T} && size(Vᴴ) == (minmn, n)
+            @test U * S * Vᴴ ≈ A
+            @test isapproxone(U' * U)
+            @test isapproxone(Vᴴ * Vᴴ')
+            @test isposdef(S)
+
+            Ac = similar(A)
+            U2, S2, V2ᴴ = @constinferred svd_compact!(copy!(Ac, A), (U, S, Vᴴ), alg)
+            @test U2 === U
+            @test S2 === S
+            @test V2ᴴ === Vᴴ
+            @test U * S * Vᴴ ≈ A
+            @test isapproxone(U' * U)
+            @test isapproxone(Vᴴ * Vᴴ')
+            @test isposdef(S)
+
+            Sd = svd_vals(A, alg)
+            @test CuArray(diagview(S)) ≈ Sd
+            # CuArray is necessary because norm of CuArray view with non-unit step is broken
+        end
+    end
+end
+
+@testset "svd_full! for T = $T" for T in (Float32, Float64, ComplexF32, ComplexF64)
+    rng = StableRNG(123)
+    m = 54
+    @testset "size ($m, $n)" for n in (37, m, 63)
+        algs = (CUSOLVER_QRIteration(), CUSOLVER_SVDPolar(), CUSOLVER_Jacobi())
+        @testset "algorithm $alg" for alg in algs
+            n > m && alg isa CUSOLVER_QRIteration && continue # not supported
+            A = CuArray(randn(rng, T, m, n))
+            U, S, Vᴴ = svd_full(A; alg)
+            @test U isa CuMatrix{T} && size(U) == (m, m)
+            @test S isa CuMatrix{real(T)} && size(S) == (m, n)
+            @test Vᴴ isa CuMatrix{T} && size(Vᴴ) == (n, n)
+            @test U * S * Vᴴ ≈ A
+            @test isapproxone(U' * U)
+            @test isapproxone(U * U')
+            @test isapproxone(Vᴴ * Vᴴ')
+            @test isapproxone(Vᴴ' * Vᴴ)
+            @test all(isposdef, diagview(S))
+
+            Ac = similar(A)
+            U2, S2, V2ᴴ = @constinferred svd_full!(copy!(Ac, A), (U, S, Vᴴ), alg)
+            @test U2 === U
+            @test S2 === S
+            @test V2ᴴ === Vᴴ
+            @test U * S * Vᴴ ≈ A
+            @test isapproxone(U' * U)
+            @test isapproxone(U * U')
+            @test isapproxone(Vᴴ * Vᴴ')
+            @test isapproxone(Vᴴ' * Vᴴ)
+            @test all(isposdef, diagview(S))
+
+            Sc = similar(A, real(T), min(m, n))
+            Sc2 = svd_vals!(copy!(Ac, A), Sc, alg)
+            @test Sc === Sc2
+            @test CuArray(diagview(S)) ≈ Sc
+            # CuArray is necessary because norm of CuArray view with non-unit step is broken
+        end
+    end
+end
+
+# @testset "svd_trunc! for T = $T" for T in (Float32, Float64, ComplexF32, ComplexF64)
+#     rng = StableRNG(123)
+#     m = 54
+#     if LinearAlgebra.LAPACK.version() < v"3.12.0"
+#         algs = (LAPACK_DivideAndConquer(), LAPACK_QRIteration(), LAPACK_Bisection())
+#     else
+#         algs = (LAPACK_DivideAndConquer(), LAPACK_QRIteration(), LAPACK_Bisection(),
+#                 LAPACK_Jacobi())
+#     end
+
+#     @testset "size ($m, $n)" for n in (37, m, 63)
+#         @testset "algorithm $alg" for alg in algs
+#             n > m && alg isa LAPACK_Jacobi && continue # not supported
+#             A = randn(rng, T, m, n)
+#             S₀ = svd_vals(A)
+#             minmn = min(m, n)
+#             r = minmn - 2
+
+#             U1, S1, V1ᴴ = @constinferred svd_trunc(A; alg, trunc=truncrank(r))
+#             @test length(S1.diag) == r
+#             @test LinearAlgebra.opnorm(A - U1 * S1 * V1ᴴ) ≈ S₀[r + 1]
+
+#             s = 1 + sqrt(eps(real(T)))
+#             trunc2 = trunctol(s * S₀[r + 1])
+
+#             U2, S2, V2ᴴ = @constinferred svd_trunc(A; alg, trunc=trunctol(s * S₀[r + 1]))
+#             @test length(S2.diag) == r
+#             @test U1 ≈ U2
+#             @test S1 ≈ S2
+#             @test V1ᴴ ≈ V2ᴴ
+#         end
+#     end
+# end
diff --git a/test/cuda/utilities.jl b/test/cuda/utilities.jl
new file mode 100644
index 00000000..61518b55
--- /dev/null
+++ b/test/cuda/utilities.jl
@@ -0,0 +1,3 @@
+function isapproxone(A)
+    return (size(A, 1) == size(A, 2)) && (A ≈ MatrixAlgebraKit.one!(similar(A)))
+end
diff --git a/test/lq.jl b/test/lq.jl
index f12ea99a..581f1782 100644
--- a/test/lq.jl
+++ b/test/lq.jl
@@ -3,6 +3,7 @@ using Test
 using TestExtras
 using StableRNGs
 using LinearAlgebra: diag, I
+using MatrixAlgebraKit: LQViaTransposedQR, LAPACK_HouseholderQR
 
 @testset "lq_compact! for T = $T" for T in (Float32, Float64, ComplexF32, ComplexF64)
     rng = StableRNG(123)
@@ -32,6 +33,19 @@ using LinearAlgebra: diag, I
         lq_compact!(copy!(Ac, A), (noL, Q2))
         @test Q == Q2
 
+        # Transposed QR algorithm
+        qr_alg = LAPACK_HouseholderQR()
+        lq_alg = LQViaTransposedQR(qr_alg)
+        L2, Q2 = @constinferred lq_compact!(copy!(Ac, A), (L, Q), lq_alg)
+        @test L2 === L
+        @test Q2 === Q
+        Nᴴ2 = @constinferred lq_null!(copy!(Ac, A), Nᴴ, lq_alg)
+        @test Nᴴ2 === Nᴴ
+        noL = similar(A, 0, minmn)
+        Q2 = similar(Q)
+        lq_compact!(copy!(Ac, A), (noL, Q2), lq_alg)
+        @test Q == Q2
+
         # unblocked algorithm
         lq_compact!(copy!(Ac, A), (L, Q); blocksize=1)
         @test L * Q ≈ A
@@ -56,8 +70,22 @@ using LinearAlgebra: diag, I
         lq_null!(copy!(Ac, A), Nᴴ; blocksize=8)
         @test maximum(abs, A * Nᴴ') < eps(real(T))^(2 / 3)
         @test isisometry(Nᴴ; side=:right)
+        @test Nᴴ * Nᴴ' ≈ I
+
+        qr_alg = LAPACK_HouseholderQR(; blocksize=1)
+        lq_alg = LQViaTransposedQR(qr_alg)
+        lq_compact!(copy!(Ac, A), (L, Q), lq_alg)
+        @test L * Q ≈ A
+        @test Q * Q' ≈ I
+        lq_compact!(copy!(Ac, A), (noL, Q2), lq_alg)
+        @test Q == Q2
+        lq_null!(copy!(Ac, A), Nᴴ, lq_alg)
+        @test maximum(abs, A * Nᴴ') < eps(real(T))^(2 / 3)
+        @test Nᴴ * Nᴴ' ≈ I
+
         # pivoted
         @test_throws ArgumentError lq_compact!(copy!(Ac, A), (L, Q); pivoted=true)
+
         # positive
         lq_compact!(copy!(Ac, A), (L, Q); positive=true)
         @test L * Q ≈ A
@@ -65,6 +93,7 @@ using LinearAlgebra: diag, I
         @test all(>=(zero(real(T))), real(diag(L)))
         lq_compact!(copy!(Ac, A), (noL, Q2); positive=true)
         @test Q == Q2
+
         # positive and blocksize 1
         lq_compact!(copy!(Ac, A), (L, Q); positive=true, blocksize=1)
         @test L * Q ≈ A
@@ -72,6 +101,14 @@ using LinearAlgebra: diag, I
         @test all(>=(zero(real(T))), real(diag(L)))
         lq_compact!(copy!(Ac, A), (noL, Q2); positive=true, blocksize=1)
         @test Q == Q2
+        qr_alg = LAPACK_HouseholderQR(; positive=true, blocksize=1)
+        lq_alg = LQViaTransposedQR(qr_alg)
+        lq_compact!(copy!(Ac, A), (L, Q), lq_alg)
+        @test L * Q ≈ A
+        @test Q * Q' ≈ I
+        @test all(>=(zero(real(T))), real(diag(L)))
+        lq_compact!(copy!(Ac, A), (noL, Q2), lq_alg)
+        @test Q == Q2
     end
 end
 
@@ -99,6 +136,19 @@ end
         lq_full!(copy!(Ac, A), (noL, Q2))
         @test Q == Q2
 
+        # Transposed QR algorithm
+        qr_alg = LAPACK_HouseholderQR()
+        lq_alg = LQViaTransposedQR(qr_alg)
+        L2, Q2 = @constinferred lq_full!(copy!(Ac, A), (L, Q), lq_alg)
+        @test L2 === L
+        @test Q2 === Q
+        @test L * Q ≈ A
+        @test Q * Q' ≈ I
+        noL = similar(A, 0, n)
+        Q2 = similar(Q)
+        lq_full!(copy!(Ac, A), (noL, Q2), lq_alg)
+        @test Q == Q2
+
         # unblocked algorithm
         lq_full!(copy!(Ac, A), (L, Q); blocksize=1)
         @test L * Q ≈ A
@@ -109,7 +159,19 @@ end
             lq_full!(copy!(Q2, A), (noL, Q2); blocksize=1) # in-place Q
             @test Q ≈ Q2
         end
-        # # other blocking
+        qr_alg = LAPACK_HouseholderQR(; blocksize=1)
+        lq_alg = LQViaTransposedQR(qr_alg)
+        lq_full!(copy!(Ac, A), (L, Q), lq_alg)
+        @test L * Q ≈ A
+        @test Q * Q' ≈ I
+        lq_full!(copy!(Ac, A), (noL, Q2), lq_alg)
+        @test Q == Q2
+        if n == m
+            lq_full!(copy!(Q2, A), (noL, Q2), lq_alg) # in-place Q
+            @test Q ≈ Q2
+        end
+
+        # other blocking
         lq_full!(copy!(Ac, A), (L, Q); blocksize=18)
         @test L * Q ≈ A
         @test isunitary(Q)
@@ -124,6 +186,16 @@ end
         @test all(>=(zero(real(T))), real(diag(L)))
         lq_full!(copy!(Ac, A), (noL, Q2); positive=true)
         @test Q == Q2
+
+        qr_alg = LAPACK_HouseholderQR(; positive=true)
+        lq_alg = LQViaTransposedQR(qr_alg)
+        lq_full!(copy!(Ac, A), (L, Q), lq_alg)
+        @test L * Q ≈ A
+        @test Q * Q' ≈ I
+        @test all(>=(zero(real(T))), real(diag(L)))
+        lq_full!(copy!(Ac, A), (noL, Q2), lq_alg)
+        @test Q == Q2
+
         # positive and blocksize 1
         lq_full!(copy!(Ac, A), (L, Q); positive=true, blocksize=1)
         @test L * Q ≈ A
diff --git a/test/runtests.jl b/test/runtests.jl
index 5f0d2991..13f33f09 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,46 +1,64 @@
 using SafeTestsets
 
-@safetestset "Algorithms" begin
-    include("algorithms.jl")
-end
-@safetestset "Truncate" begin
-    include("truncate.jl")
-end
-@safetestset "QR / LQ Decomposition" begin
-    include("qr.jl")
-    include("lq.jl")
-end
-@safetestset "Singular Value Decomposition" begin
-    include("svd.jl")
-end
-@safetestset "Hermitian Eigenvalue Decomposition" begin
-    include("eigh.jl")
-end
-@safetestset "General Eigenvalue Decomposition" begin
-    include("eig.jl")
-end
-@safetestset "Schur Decomposition" begin
-    include("schur.jl")
-end
-@safetestset "Polar Decomposition" begin
-    include("polar.jl")
-end
-@safetestset "Image and Null Space" begin
-    include("orthnull.jl")
-end
-@safetestset "ChainRules" begin
-    include("chainrules.jl")
+# don't run all tests on GPU, only the GPU
+# specific ones
+is_buildkite = get(ENV, "BUILDKITE", "false") == "true"
+if !is_buildkite
+    @safetestset "Algorithms" begin
+        include("algorithms.jl")
+    end
+    @safetestset "Truncate" begin
+        include("truncate.jl")
+    end
+    @safetestset "QR / LQ Decomposition" begin
+        include("qr.jl")
+        include("lq.jl")
+    end
+    @safetestset "Singular Value Decomposition" begin
+        include("svd.jl")
+    end
+    @safetestset "Hermitian Eigenvalue Decomposition" begin
+        include("eigh.jl")
+    end
+    @safetestset "General Eigenvalue Decomposition" begin
+        include("eig.jl")
+    end
+    @safetestset "Schur Decomposition" begin
+        include("schur.jl")
+    end
+    @safetestset "Polar Decomposition" begin
+        include("polar.jl")
+    end
+    @safetestset "Image and Null Space" begin
+        include("orthnull.jl")
+    end
+    @safetestset "ChainRules" begin
+        include("chainrules.jl")
+    end
+    @safetestset "MatrixAlgebraKit.jl" begin
+        @safetestset "Code quality (Aqua.jl)" begin
+            using MatrixAlgebraKit
+            using Aqua
+            Aqua.test_all(MatrixAlgebraKit)
+        end
+        @safetestset "Code linting (JET.jl)" begin
+            using MatrixAlgebraKit
+            using JET
+            JET.test_package(MatrixAlgebraKit; target_defined_modules=true)
+        end
+    end
 end
 
-@safetestset "MatrixAlgebraKit.jl" begin
-    @safetestset "Code quality (Aqua.jl)" begin
-        using MatrixAlgebraKit
-        using Aqua
-        Aqua.test_all(MatrixAlgebraKit)
+using CUDA
+if CUDA.functional()
+    @safetestset "CUDA QR" begin
+        include("cuda/qr.jl")
     end
-    @safetestset "Code linting (JET.jl)" begin
-        using MatrixAlgebraKit
-        using JET
-        JET.test_package(MatrixAlgebraKit; target_defined_modules=true)
+    @safetestset "CUDA LQ" begin
+        include("cuda/lq.jl")
+    end
+    @safetestset "CUDA SVD" begin
+        include("cuda/svd.jl")
     end
 end
+