JuliaSparse
diff --git a/‎README.md
Lines changed: 3 additions & 2 deletions b/‎README.md
Lines changed: 3 additions & 2 deletions
diff --git a/‎gen/wrapper.jl
Lines changed: 1 addition & 1 deletion b/‎gen/wrapper.jl
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/MKLSparse.jl
Lines changed: 5 additions & 0 deletions b/‎src/MKLSparse.jl
Lines changed: 5 additions & 0 deletions
diff --git a/‎src/generic.jl
Lines changed: 249 additions & 0 deletions b/‎src/generic.jl
Lines changed: 249 additions & 0 deletions
@@ -2,8 +2,9 @@
 
 [![codecov](https://codecov.io/gh/JuliaSparse/MKLSparse.jl/graph/badge.svg?token=j3KoKBEIt1)](https://codecov.io/gh/JuliaSparse/MKLSparse.jl)
 
-`MKLSparse.jl` is a Julia package to seamlessly use the sparse functionality in MKL to speed up operations on sparse arrays in Julia.
-In order to use `MKLSparse.jl` you do not need to install Intel's MKL library nor build Julia with MKL. `MKLSparse.jl` will automatically download and use the MKL library for you when installed.
+*MKLSparse.jl* is a Julia package to seamlessly use the [sparse BLAS routines from Intel's Math Kernel Library (MKL)](https://www.intel.com/content/www/us/en/docs/onemkl/developer-reference-c)
+to speed up operations on sparse arrays in Julia.
+In order to use *MKLSparse.jl* you do not need to install Intel's MKL library nor build Julia with MKL. *MKLSparse.jl* will automatically download and use the MKL library for you when installed.
 
 ### Matrix multiplications
 
 
@@ -24,7 +24,7 @@ function wrapper(name::String, headers::Vector{String}, optimized::Bool=false)
 
   args = get_default_args()
   push!(args, "-I$include_dir")
-  
+
   ctx = create_context(headers, args, options)
   build!(ctx)
 
 
@@ -13,6 +13,8 @@ function _log_mklsparse_call(fname)
     __mklsparse_calls_count[] += 1
 end
 
+# common MKL definitions from mkl_service.h, see also MKL.jl
+
 @enum Threading begin
     THREADING_INTEL
     THREADING_SEQUENTIAL
@@ -39,13 +41,16 @@ function set_interface_layer(interface::Interface = INTERFACE_LP64)
     return nothing
 end
 
+# initialize the MKL interface upon module initialization
+# NOTE: this sets the interface for all MKL API calls, not just the sparse ones
 function __init__()
     set_interface_layer(INTERFACE_LP64)
 end
 
 # Wrappers generated by Clang.jl
 include("libmklsparse.jl")
 include("types.jl")
+include("utils.jl")
 include("mklsparsematrix.jl")
 
 # TODO: BLAS1
 
@@ -56,6 +56,255 @@ function mm!(transA::Char, alpha::T, A::AbstractSparseMatrix{T}, descr::matrix_d
     return C
 end
 
+# C := op(A) * B, where C is sparse
+function spmm(transA::Char, A::S, B::S) where {S <: AbstractSparseMatrix}
+    check_trans(transA)
+    check_mat_op_sizes(nothing, A, transA, B, 'N')
+    Cout = Ref{sparse_matrix_t}()
+    hA = MKLSparseMatrix(A)
+    hB = MKLSparseMatrix(B)
+    res = mkl_call(Val{:mkl_sparse_spmmI}(), S,
+                   transA, hA, hB, Cout)
+    destroy(hA)
+    destroy(hB)
+    check_status(res)
+    # NOTE: we are guessing what is the storage format of C
+    hC = MKLSparseMatrix{S}(Cout[])
+    C = convert(S, hC)
+    destroy(hC)
+    return C
+end
+
+# C := op(A) * B, where C is dense
+function spmmd!(transa::Char, A::S, B::S,
+                C::StridedMatrix{T};
+                dense_layout::sparse_layout_t = SPARSE_LAYOUT_COLUMN_MAJOR
+) where {S <: AbstractSparseMatrix{T}} where T
+    check_trans(transa)
+    check_mat_op_sizes(C, A, transa, B, 'N')
+    ldC = stride(C, 2)
+    hA = MKLSparseMatrix(A)
+    hB = MKLSparseMatrix(B)
+    res = mkl_call(Val{:mkl_sparse_T_spmmdI}(), S,
+                   transa, hA, hB, dense_layout, C, ldC)
+    destroy(hA)
+    destroy(hB)
+    check_status(res)
+    return C
+end
+
+# C := opA(A) * opB(B), where C is sparse
+function sp2m(transA::Char, A::S, descrA::matrix_descr,
+              transB::Char, B::S, descrB::matrix_descr
+) where S <: AbstractSparseMatrix
+    check_trans(transA)
+    check_trans(transB)
+    check_mat_op_sizes(nothing, A, transA, B, transB)
+    Cout = Ref{sparse_matrix_t}()
+    hA = MKLSparseMatrix(A)
+    hB = MKLSparseMatrix(B)
+    res = mkl_call(Val{:mkl_sparse_sp2mI}(), S,
+                   transA, descrA, hA, transB, descrB, hB,
+                   SPARSE_STAGE_FULL_MULT, Cout)
+    destroy(hA)
+    destroy(hB)
+    check_status(res)
+    # NOTE: we are guessing what is the storage format of C
+    hC = MKLSparseMatrix{S}(Cout[])
+    C = convert(S, hC)
+    destroy(hC)
+    return C
+end
+
+# C := opA(A) * opB(B), where C is sparse, in-place version
+# C should have the correct size and sparsity pattern
+function sp2m!(transA::Char, A::S, descrA::matrix_descr,
+               transB::Char, B::S, descrB::matrix_descr,
+               C::S;
+               check_nzpattern::Bool = true
+) where {S <: AbstractSparseMatrix}
+    check_trans(transA)
+    check_trans(transB)
+    check_mat_op_sizes(C, A, transA, B, transB)
+    hA = MKLSparseMatrix(A)
+    hB = MKLSparseMatrix(B)
+    if check_nzpattern
+        # pre-multiply A * B to get the number of nonzeros per column in the result
+        CptnOut = Ref{sparse_matrix_t}()
+        res = mkl_call(Val{:mkl_sparse_sp2mI}(), S,
+                    transA, descrA, hA, transB, descrB, hB,
+                    SPARSE_STAGE_NNZ_COUNT, CptnOut)
+        check_status(res)
+        hCptn = MKLSparseMatrix{S}(CptnOut[])
+        try
+            # check if C has the same per-column nonzeros as the result
+            MKLSparse.check_nzpattern(C, hCptn)
+        catch e
+            # destroy handles to A and B if the pattern check fails,
+            # otherwise reuse them at the actual multiplication
+            destroy(hA)
+            destroy(hB)
+            rethrow(e)
+        finally
+            destroy(hCptn)
+        end
+        # FIXME rowval not checked
+    end
+    # FIXME the optimal way would be to create the MKLSparse handle to C reusing its arrays
+    # and do SPARSE_STAGE_FINALIZE_MULT to directly write to the C.nzval
+    # but that causes segfaults when the handle is destroyed
+    # (also the partial mkl_sparse_copy(C) workaround to reuse the nz structure segfaults)
+    # see the note stating that external memory is not currently supported:
+    # https://www.intel.com/content/www/us/en/docs/onemkl/developer-reference-c/2024-2/two-stage-algorithm-in-inspect-execute-sp-blas.html
+    #hC = MKLSparseMatrix(C)
+    #mkl_call(Val{:mkl_sparse_set_memory_hintI}(), typeof(C), SPARSE_MEMORY_NONE)
+    #hC_ref = Ref(hC)
+    #res = mkl_call(Val{:mkl_sparse_sp2mI}(), typeof(A),
+    #               transA, descrA, hA, transB, descrB, hB,
+    #               SPARSE_STAGE_FINALIZE_MULT, hC_ref)
+    #@assert hC_ref[] == hC
+    # so instead we do the full multiplication and copy the result into C nzvals
+    hCopy_ref = Ref{sparse_matrix_t}()
+    # don't log if it's the 2nd mkl_call
+    res = mkl_call(Val{:mkl_sparse_sp2mI}(), S,
+                   transA, descrA, hA, transB, descrB, hB,
+                   SPARSE_STAGE_FULL_MULT, hCopy_ref, log = Val(!check_nzpattern))
+    destroy(hA)
+    destroy(hB)
+    #destroy(hC)
+    check_status(res)
+    if hCopy_ref[] != C_NULL
+        hCopy = MKLSparseMatrix{S}(hCopy_ref[])
+        copy!(C, hCopy; check_nzpattern)
+        destroy(hCopy)
+    end
+    return C
+end
+
+# C := alpha * opA(A) * opB(B) + beta * C, where C is dense
+function sp2md!(transA::Char, alpha::T, A::S, descrA::matrix_descr,
+                transB::Char, B::S, descrB::matrix_descr,
+                beta::T, C::StridedMatrix{T};
+                dense_layout::sparse_layout_t = SPARSE_LAYOUT_COLUMN_MAJOR
+) where {S <: AbstractSparseMatrix{T}} where T
+    check_trans(transA)
+    check_trans(transB)
+    check_mat_op_sizes(C, A, transA, B, transB)
+    ldC = stride(C, 2)
+    hA = MKLSparseMatrix(A)
+    hB = MKLSparseMatrix(B)
+    res = mkl_call(Val{:mkl_sparse_T_sp2mdI}(), S,
+                   transA, descrA, hA, transB, descrB, hB,
+                   alpha, beta,
+                   C, dense_layout, ldC)
+    destroy(hA)
+    destroy(hB)
+    check_status(res)
+    return C
+end
+
+# C := A * op(A), or
+# C := op(A) * A, where C is sparse
+# note: only the upper triangular part of C is computed
+function syrk(transA::Char, A::SparseMatrixCSR; copytri::Bool = false)
+    copytri && error("syrk() wrapper does not implement copytri=true")
+    check_trans(transA)
+    Cout = Ref{sparse_matrix_t}()
+    hA = MKLSparseMatrix(A)
+    res = mkl_call(Val{:mkl_sparse_syrkI}(), typeof(A),
+                   transA, hA, Cout)
+    destroy(hA)
+    check_status(res)
+    # NOTE: we are guessing what is the storage format of C
+    hC = MKLSparseMatrix{typeof(A)}(Cout[])
+    C = convert(typeof(A), hC)
+    destroy(hC)
+    return C
+end
+
+# CSC is not supported by SparseMKL directly, so treat A as Aᵀ in CSR format
+# note: only the lower triangular part of C is computed (lower CSC = upper CSR)
+function syrk(transA::Char, A::SparseMatrixCSC{T}; kwargs...) where T
+    C = syrk(dual_opcode(T, transA),
+             convert(SparseMatrixCSR, transpose(A)); kwargs...)
+    return convert(typeof(A), transpose(C))
+end
+
+# C := A * op(A), or
+# C := op(A) * A, where C is dense
+# note: only the upper triangular part of C is computed
+function syrkd!(transA::Char, alpha::T, A::SparseMatrixCSR{T}, beta::T,
+                C::StridedMatrix{T};
+                dense_layout::sparse_layout_t = SPARSE_LAYOUT_COLUMN_MAJOR,
+                copytri::Bool = true
+) where T
+    check_trans(transA)
+    check_mat_op_sizes(C, A, transA, A, transA == 'N' ? 'T' : 'N'; dense_layout)
+    ldC = stride(C, 2)
+    hA = MKLSparseMatrix(A)
+    res = mkl_call(Val{:mkl_sparse_T_syrkdI}(), typeof(A),
+                   transA, hA, alpha, beta, C, dense_layout, ldC)
+    destroy(hA)
+    check_status(res)
+    copytri && fastcopytri!(C, dense_layout == SPARSE_LAYOUT_COLUMN_MAJOR ? 'U' : 'L',
+                            T <: Complex)
+    return C
+end
+
+# CSC is not supported by SparseMKL directly, so treat A as Aᵀ in CSR format
+function syrkd!(transA::Char, alpha::T, A::SparseMatrixCSC{T}, beta::T,
+                C::StridedMatrix{T}; kwarg...
+) where T
+    # since CSC support is implemented by transposing A, the A*A' has to be conjugated
+    # to be correct in the complex case, that produces incorrect results when beta != 0
+    (T <: Complex) && error("syrkd!() wrapper does not support SparseMatrixCSC with complex values")
+    syrkd!(dual_opcode(T, transA), alpha,
+           convert(SparseMatrixCSR, transpose(A)), beta, C; kwarg...)
+end
+
+# C := alpha * op(A) * B * A + beta * C, or
+# C := alpha * A * B * op(A) + beta * C, where C is dense
+# note: only the upper triangular part of C is computed
+function syprd!(transA::Char, alpha::T, A::SparseMatrixCSR{T},
+                B::StridedMatrix{T}, beta::T, C::StridedMatrix{T};
+                dense_layout_B::sparse_layout_t = SPARSE_LAYOUT_COLUMN_MAJOR,
+                dense_layout_C::sparse_layout_t = SPARSE_LAYOUT_COLUMN_MAJOR,
+                copytri::Bool = true
+) where T
+    check_trans(transA)
+    # FIXME dense_layout_B not used
+    check_mat_op_sizes(C, A, transA, B, 'N';
+                       check_result_columns = false, dense_layout = dense_layout_C)
+    check_mat_op_sizes(C, B, 'N', A, transA == 'N' ? 'T' : 'N';
+                       check_result_rows = false, dense_layout = dense_layout_C)
+    ldB = stride(B, 2)
+    ldC = stride(C, 2)
+    hA = MKLSparseMatrix(A)
+    res = mkl_call(Val{:mkl_sparse_T_syprdI}(), typeof(A),
+                   transA, hA, B, dense_layout_B, ldB,
+                   alpha, beta, C, dense_layout_C, ldC)
+    destroy(hA)
+    check_status(res)
+    copytri && fastcopytri!(C, dense_layout_C == SPARSE_LAYOUT_COLUMN_MAJOR ? 'U' : 'L',
+                            T <: Complex)
+    return C
+end
+
+function syprd!(transA::Char, alpha::T, A::SparseMatrixCSC{T},
+                B::StridedMatrix{T}, beta::T, C::StridedMatrix{T};
+                kwargs...
+) where T
+    # since CSC support is implemented by transposing A, the A*A' has to be conjugated
+    # to be correct in the complex case, that produces incorrect results when beta != 0
+    (T <: Complex) && error("syprd!() wrapper does not support SparseMatrixCSC with complex values")
+
+    syprd!(
+        dual_opcode(T, transA), alpha,
+        convert(SparseMatrixCSR, transpose(A)),
+        B, beta, C; kwargs...
+    )
+end
+
 # find y: op(A) * y = alpha * x
 function trsv!(transA::Char, alpha::T, A::AbstractSparseMatrix{T}, descr::matrix_descr,
                x::StridedVector{T}, y::StridedVector{T}