JuliaGPU · tam724 · Jan 1, 2026 · Jan 1, 2026 · Jan 1, 2026
diff --git a/lib/cusparse/generic.jl b/lib/cusparse/generic.jl
@@ -8,7 +8,7 @@ export bmm!
     mv!(transa::SparseChar, alpha::Number, A::CuSparseMatrix, X::CuVector, beta::Number, Y::CuVector, index::SparseChar)
 
 Performs `Y = alpha * op(A) * X + beta * Y`, where `op` can be nothing (`transa = N`),
-tranpose (`transa = T`) or conjugate transpose (`transa = C`).
+transpose (`transa = T`) or conjugate transpose (`transa = C`).
 `X` and `Y` are dense vectors.
 """
 function mv! end
@@ -18,7 +18,7 @@ function mv! end
     mm!(transa::SparseChar, transb::SparseChar, alpha::Number, A::CuMatrix, B::Union{CuSparseMatrixCSC,CuSparseMatrixCSR,CuSparseMatrixCOO}, beta::Number, C::CuMatrix, index::SparseChar)
 
 Performs `C = alpha * op(A) * op(B) + beta * C`, where `op` can be nothing (`transa = N`),
-tranpose (`transa = T`) or conjugate transpose (`transa = C`).
+transpose (`transa = T`) or conjugate transpose (`transa = C`).
 """
 function mm! end
 
@@ -273,7 +273,7 @@ function mv!(transa::SparseChar, alpha::Number, A::Union{CuSparseMatrixCSC{TA},C
 end
 
 function mm!(transa::SparseChar, transb::SparseChar, alpha::Number, A::CuSparseMatrix{T},
-             B::DenseCuMatrix{T}, beta::Number, C::DenseCuMatrix{T}, index::SparseChar, algo::cusparseSpMMAlg_t=CUSPARSE_SPMM_ALG_DEFAULT) where {T}
+             B::Union{DenseCuMatrix{T}, Transpose{T, <:DenseCuMatrix{T}}}, beta::Number, C::Union{DenseCuMatrix{T}, Transpose{T, <:DenseCuMatrix{T}}}, index::SparseChar, algo::cusparseSpMMAlg_t=CUSPARSE_SPMM_ALG_DEFAULT) where {T}
 
     (A isa CuSparseMatrixBSR) && (CUSPARSE.version() < v"12.5.1") && throw(ErrorException("This operation is not supported by the current CUDA version."))
 
@@ -298,12 +298,6 @@ function mm!(transa::SparseChar, transb::SparseChar, alpha::Number, A::CuSparseM
     descB = CuDenseMatrixDescriptor(B)
     descC = CuDenseMatrixDescriptor(C)
 
-    # cusparseDnMatSetStridedBatch(descB, size(B,3), size(B,1)*size(B,2))
-    # cusparseDnMatSetStridedBatch(descB, size(B,3), size(B,1)*size(B,2))
-    # batchsize = length(nonzeros(A)) ÷ nnz(A)
-    # if batchsize > 1
-    #     cusparseCsrSetStridedBatch(obj, batchsize, 0, nnz(A))
-    # end
 
     # Set default buffer for small matrices (1000 chosen arbitrarly)
     # Otherwise tries to allocate 120TB of memory (see #2296)
@@ -327,6 +321,28 @@ function mm!(transa::SparseChar, transb::SparseChar, alpha::Number, A::CuSparseM
     return C
 end
 
+function mm!(transa::SparseChar, transb::SparseChar, alpha::Number, A::DenseCuMatrix{T}, B::CuSparseMatrixCSC{T}, beta::Number, C::DenseCuMatrix{T}, index::SparseChar, algo::cusparseSpMMAlg_t=CUSPARSE_SPMM_ALG_DEFAULT) where T
+    _B = CuSparseMatrixCSR{T}(B.colPtr, B.rowVal, B.nzVal, reverse(B.dims))
+    mm!(transb, transa, alpha, _B, transpose(A), beta, transpose(C), index, algo)
+    return C
+end
+
+function mm!(transa::SparseChar, transb::SparseChar, alpha::Number, A::DenseCuMatrix{T}, B::CuSparseMatrixCSR{T}, beta::Number, C::DenseCuMatrix{T}, index::SparseChar, algo::cusparseSpMMAlg_t=CUSPARSE_SPMM_ALG_DEFAULT) where T
+    _B = CuSparseMatrixCSC{T}(B.rowPtr, B.colVal, B.nzVal, reverse(B.dims))
+    mm!(transb, transa, alpha, _B, transpose(A), beta, transpose(C), index, algo)
+    return C
+end
+
+function mm!(transa::SparseChar, transb::SparseChar, alpha::Number, A::DenseCuMatrix{T}, B::CuSparseMatrixCOO{T}, beta::Number, C::DenseCuMatrix{T}, index::SparseChar, algo::cusparseSpMMAlg_t=CUSPARSE_SPMM_ALG_DEFAULT) where T
+    if T <: Real || transb == 'N' || transb == 'T'
+        mm!((transb == 'N') ? 'T' : 'N', transa, alpha, B, transpose(A), beta, transpose(C), index, algo)
+    else # transb == 'C'
+        _B = CuSparseMatrixCOO{T}(B.rowInd, B.colInd, conj.(B.nzVal), B.dims)
+        mm!('N', transa, alpha, _B, transpose(A), beta, transpose(C), index, algo)
+    end
+    return C
+end
+
 function bmm!(transa::SparseChar, transb::SparseChar, alpha::Number, A::CuSparseArrayCSR{T,Ti,N},
               B::DenseCuArray{T,N}, beta::Number, C::DenseCuArray{T,N}, index::SparseChar, algo::cusparseSpMMAlg_t=CUSPARSE_SPMM_ALG_DEFAULT) where {T,Ti,N}
     Ar = reshape(A, :, :, :)
@@ -401,60 +417,6 @@ function bmm!(transa::SparseChar, transb::SparseChar, alpha::Number, A::CuSparse
     return C
 end
 
-function mm!(transa::SparseChar, transb::SparseChar, alpha::Number, A::DenseCuMatrix{T},
-             B::Union{CuSparseMatrixCSC{T},CuSparseMatrixCSR{T},CuSparseMatrixCOO{T}}, beta::Number,
-             C::DenseCuMatrix{T}, index::SparseChar, algo::cusparseSpMMAlg_t=CUSPARSE_SPMM_ALG_DEFAULT) where {T}
-
-    # Support transa = 'C' and `transb = 'C' for real matrices
-    transa = T <: Real && transa == 'C' ? 'T' : transa
-    transb = T <: Real && transb == 'C' ? 'T' : transb
-
-    # cusparseSpMM can be also used to perform the multiplication of a dense matrix and a sparse matrix by switching the dense matrices layout:
-    # Cc = α * Ac * B  + β * Cc → α * Bᵀ * Ar + β * Cr
-    # Cc = α * Ac * Bᵀ + β * Cc → α * B  * Ar + β * Cr
-    # Cc = α * Ac * Bᴴ + β * Cc → α * B̅  * Ar + β * Cr
-    # where B is a sparse matrix, Ac and Cc indicate column-major layout, while Ar and Cr refer to row-major layout.
-
-    m,k = size(A)
-    n = size(C)[2]
-
-    if transa == 'N' && transb == 'N'
-        chkmmdims(B,C,k,n,m,n)
-    elseif transa == 'N' && transb != 'N'
-        chkmmdims(B,C,n,k,m,n)
-    elseif transa != 'N' && transb == 'N'
-        chkmmdims(B,C,m,n,k,n)
-    elseif transa != 'N' && transb != 'N'
-        chkmmdims(B,C,n,m,k,n)
-    end
-
-    descA = CuDenseMatrixDescriptor(A, transposed=true)
-    descB = CuSparseMatrixDescriptor(B, index, transposed=true)
-    descC = CuDenseMatrixDescriptor(C, transposed=true)
-
-    # Set default buffer for small matrices (1000 chosen arbitrarly)
-    # Otherwise tries to allocate 120TB of memory (see #2296)
-    function bufferSize()
-        out = Ref{Csize_t}(1000)
-        cusparseSpMM_bufferSize(
-            handle(), transb, transa, Ref{T}(alpha), descB, descA, Ref{T}(beta),
-            descC, T, algo, out)
-        return out[]
-    end
-    with_workspace(bufferSize) do buffer
-        # We should find a way to reuse the buffer (issue #1362)
-        if !(B isa CuSparseMatrixCOO)
-            cusparseSpMM_preprocess(
-                handle(), transb, transa, Ref{T}(alpha), descB, descA, Ref{T}(beta),
-                descC, T, algo, buffer)
-        end
-        cusparseSpMM(
-            handle(), transb, transa, Ref{T}(alpha), descB, descA, Ref{T}(beta),
-            descC, T, algo, buffer)
-    end
-    return C
-end
-
 # AB and C must have the same sparsity pattern if β ≠ 0.
 function gemm!(transa::SparseChar, transb::SparseChar, alpha::Number, A::CuSparseMatrixCSR{T}, B::CuSparseMatrixCSR{T},
                beta::Number, C::CuSparseMatrixCSR{T}, index::SparseChar, algo::cusparseSpGEMMAlg_t=CUSPARSE_SPGEMM_DEFAULT) where {T}

diff --git a/lib/cusparse/helpers.jl b/lib/cusparse/helpers.jl
@@ -103,6 +103,8 @@ mutable struct CuDenseMatrixDescriptor
         obj
     end
 
+    CuDenseMatrixDescriptor(At::Transpose{<:Any, <:DenseCuMatrix}) = CuDenseMatrixDescriptor(parent(At), transposed=true) 
+
     function CuDenseMatrixDescriptor(A::DenseCuArray{T, 3}; transposed::Bool=false) where T
         desc_ref = Ref{cusparseDnMatDescr_t}()
         if transposed

diff --git a/lib/cusparse/interfaces.jl b/lib/cusparse/interfaces.jl
@@ -34,18 +34,6 @@ function mv_wrapper(transa::SparseChar, alpha::Number, A::CuSparseMatrix, X::Den
     mv!(transa, alpha, A, X, beta, Y, 'O')
 end
 
-function mm_wrapper(transa::SparseChar, transb::SparseChar, alpha::Number,
-                    A::CuSparseMatrix{T}, B::CuMatrix{T}, beta::Number, C::CuMatrix{T}) where {T}
-    n_A, m_A = (transa != 'N') ? reverse(size(A)) : size(A)
-    n_B, m_B = (transb != 'N') ? reverse(size(B)) : size(B)
-    n_C, m_C = size(C)
-    m_A == n_B || throw(DimensionMismatch())
-    n_A == n_C || throw(DimensionMismatch())
-    m_B == m_C || throw(DimensionMismatch())
-    isempty(B) && return CUDA.zeros(eltype(B), size(A, 1), 0)
-    mm!(transa, transb, alpha, A, B, beta, C, 'O')
-end
-
 LinearAlgebra.dot(x::CuSparseVector{T}, y::DenseCuVector{T}) where {T <: BlasReal} = vv!('N', x, y, 'O')
 LinearAlgebra.dot(x::DenseCuVector{T}, y::CuSparseVector{T}) where {T <: BlasReal} = dot(y, x)
 
@@ -82,7 +70,7 @@ end
 function LinearAlgebra.generic_matmatmul!(C::CuMatrix{T}, tA, tB, A::CuSparseMatrix{T}, B::DenseCuMatrix{T}, alpha::Number, beta::Number) where {T <: Union{Float16, ComplexF16, BlasFloat}}
     tA = tA in ('S', 's', 'H', 'h') ? 'N' : tA
     tB = tB in ('S', 's', 'H', 'h') ? 'N' : tB
-    mm_wrapper(tA, tB, alpha, A, B, beta, C)
+    mm!(tA, tB, alpha, A, B, beta, C, 'O')
 end
 
 for (wrapa, transa, unwrapa) in op_wrappers

diff --git a/test/libraries/cusparse/generic.jl b/test/libraries/cusparse/generic.jl
@@ -83,20 +83,38 @@ for SparseMatrixType in keys(SPMM_ALGOS)
         @testset "mm! $T" for T in [Float32, Float64, ComplexF32, ComplexF64]
             @testset "transa = $transa" for (transa, opa) in [('N', identity), ('T', transpose), ('C', adjoint)]
                 @testset "transb = $transb" for (transb, opb) in [('N', identity), ('T', transpose), ('C', adjoint)]
-                    CUSPARSE.version() < v"12.0" && SparseMatrixType == CuSparseMatrixCSC && T <: Complex && transa == 'C' && continue
-                    algo == CUSPARSE.CUSPARSE_SPMM_CSR_ALG3 && (transa != 'N' || transb != 'N') && continue
+                    CUSPARSE.version() < v"12.5.8" && algo == CUSPARSE.CUSPARSE_SPMM_CSR_ALG3 && (transa != 'N' || transb != 'N') && continue
+                    algo == CUSPARSE.CUSPARSE_SPMM_CSR_ALG3 && transa != 'N' && continue # https://docs.nvidia.com/cuda/cusparse/index.html#cusparsespmm: CSR_ALG3 supports only 'NON_TRANSPOSE'
+                    algo == CUSPARSE.CUSPARSE_SPMM_CSR_ALG3 && transb == 'C' && continue # https://docs.nvidia.com/cuda/cusparse/index.html#cusparsespmm: CSR_ALG3 does not support 'CONJUGATE_TRANSPOSE'
                     (SparseMatrixType == CuSparseMatrixBSR) && (transa != 'N') && continue
                     A = sprand(T, 10, 10, 0.1)
                     B = transb == 'N' ? rand(T, 10, 2) : rand(T, 2, 10)
+                    Bt = collect(transpose(B))
                     C = rand(T, 10, 2)
+                    Ct = collect(transpose(C))
                     dA = SparseMatrixType == CuSparseMatrixBSR ? SparseMatrixType(A,1) : SparseMatrixType(A)
                     dB = CuArray(B)
-                    dC = CuArray(C)
+                    dBt = CuArray(Bt)
 
                     alpha = rand(T)
                     beta = rand(T)
+
+                    dC = CuArray(C)
                     mm!(transa, transb, alpha, dA, dB, beta, dC, 'O', algo)
                     @test alpha * opa(A) * opb(B) + beta * C ≈ collect(dC)
+
+                    dCt = CuArray(Ct)
+                    mm!(transa, transb, alpha, dA, transpose(dBt), beta, transpose(dCt), 'O', algo)
+                    @test alpha * opa(A) * opb(B) + beta * C ≈ transpose(collect(dCt))
+
+                    CUSPARSE.version() < v"12.5.8" && continue
+                    dC = CuArray(C)
+                    mm!(transa, transb, alpha, dA, transpose(dBt), beta, dC, 'O', algo)
+                    @test alpha * opa(A) * opb(B) + beta * C ≈ collect(dC)
+
+                    dCt = CuArray(Ct)
+                    mm!(transa, transb, alpha, dA, dB, beta, transpose(dCt), 'O', algo)
+                    @test alpha * opa(A) * opb(B) + beta * C ≈ transpose(collect(dCt))
                 end
             end
         end
@@ -122,16 +140,14 @@ for SparseMatrixType in keys(SPMM_ALGOS)
         @testset "$T" for T in [Float32, Float64, ComplexF32, ComplexF64]
             @testset "transa = $transa" for (transa, opa) in [('N', identity), ('T', transpose), ('C', adjoint)]
                 @testset "transb = $transb" for (transb, opb) in [('N', identity), ('T', transpose), ('C', adjoint)]
-                    CUSPARSE.version() < v"12.0" && SparseMatrixType == CuSparseMatrixCSR && T <: Complex && transb == 'C' && continue
-                    algo == CUSPARSE.CUSPARSE_SPMM_CSR_ALG3 && (transa != 'N' || transb != 'N') && continue
+                    CUSPARSE.version() < v"12.5.8" && algo == CUSPARSE.CUSPARSE_SPMM_CSR_ALG3 && continue
+                    algo == CUSPARSE.CUSPARSE_SPMM_CSR_ALG3 && transb != 'N' && continue # https://docs.nvidia.com/cuda/cusparse/index.html#cusparsespmm: CSR_ALG3 supports only 'NON_TRANSPOSE'
+                    algo == CUSPARSE.CUSPARSE_SPMM_CSR_ALG3 && transa == 'C' && continue # https://docs.nvidia.com/cuda/cusparse/index.html#cusparsespmm: CSR_ALG3 does not support 'CONJUGATE_TRANSPOSE'
                     A = rand(T, 10, 10)
                     B = transb == 'N' ? sprand(T, 10, 5, 0.5) : sprand(T, 5, 10, 0.5)
                     C = rand(T, 10, 5)
                     dA = CuArray(A)
                     dB = SparseMatrixType(B)
-                    if SparseMatrixType == CuSparseMatrixCOO
-                        dB = sort_coo(dB, 'C')
-                    end
                     dC = CuArray(C)
 
                     alpha = rand(T)
@@ -141,8 +157,8 @@ for SparseMatrixType in keys(SPMM_ALGOS)
 
                     # add tests for very small matrices (see #2296)
                     # skip conjugate transpose - causes errors with 1x1 matrices
-                    # CUSPARSE_SPMM_CSR_ALG3 also fails
-                    (algo == CUSPARSE.CUSPARSE_SPMM_CSR_ALG3 || transa == 'C') && continue
+                    CUSPARSE.version() < v"12.6.3" && algo == CUSPARSE.CUSPARSE_SPMM_CSR_ALG3 && continue
+                    transa == 'C' && continue
                     A = rand(T, 1, 1)
                     B = sprand(T, 1, 1, 1.)
                     C = rand(T, 1, 1)

diff --git a/test/libraries/cusparse/interfaces.jl b/test/libraries/cusparse/interfaces.jl
@@ -74,9 +74,6 @@ nB = 2
                 B  = opb == identity ? sprand(elty, k, n, 0.2) : sprand(elty, n, k, 0.2)
                 for SparseMatrixType in (CuSparseMatrixCSC, CuSparseMatrixCSR, CuSparseMatrixCOO)
                     dB = SparseMatrixType(B)
-                    if SparseMatrixType == CuSparseMatrixCOO
-                        dB = sort_coo(dB, 'C')
-                    end
                     @testset "CuMatrix * $SparseMatrixType" begin
                         @testset "A * B" begin
                             C  = opa(A) * opb(B)