diff --git a/resolve/hykkt/CMakeLists.txt b/resolve/hykkt/CMakeLists.txt index 6a67a927a..7bd3c2b92 100644 --- a/resolve/hykkt/CMakeLists.txt +++ b/resolve/hykkt/CMakeLists.txt @@ -1,3 +1,4 @@ add_subdirectory(permutation) add_subdirectory(ruiz) add_subdirectory(cholesky) +add_subdirectory(spgemm) diff --git a/resolve/hykkt/README.md b/resolve/hykkt/README.md new file mode 100644 index 000000000..00bfd5cfb --- /dev/null +++ b/resolve/hykkt/README.md @@ -0,0 +1,46 @@ +# HyKKT + +## Description +HyKKT (pronounced as _hiked_) is a package for solving systems of equations and unknowns resulting from an iterative solution of an optimization +problem, for example optimal power flow, which uses hardware accelerators (GPUs) efficiently. + +The HyKKT package contains a linear solver tailored for Karush Kuhn Tucker (KKT) linear systems and +deployment on hardware accelerator hardware such as GPUs. The solver requires +all blocks of the $4\times 4$ block system: + +```math +\begin{bmatrix} + H + D_x & 0 & J_c^T & J_d^T \\ + 0 & D_s & 0 & -I \\ + J_c & 0 & 0 & 0 \\ + J_d & -I & 0 & 0 +\end{bmatrix} +\begin{bmatrix} + \Delta x \\ \Delta s \\ \Delta y_c \\ \Delta y_d +\end{bmatrix} = +\begin{bmatrix} + \tilde{r}_x \\ r_s \\ r_c \\ r_d +\end{bmatrix} +``` + +separately and solves the system to a desired +numerical precision exactly via block reduction and conjugate gradient on the +schur complement. Please see the [HyKKT paper](https://www.tandfonline.com/doi/abs/10.1080/10556788.2022.2124990) for mathematical details. + +## Integration within ReSolve + +HyKKT code within ReSolve is not yet fully integrated. The code is currently experimental, but is actively in development. +The main issue is the requirement that the user pass matrix and vector blocks, rather than the constructed KKT system. +This is currently not available within ReSolve as it is specific to HyKKT and KKT matrices. +TODO: +- Work with AMD to fix bug in Cholesky factorization. 
+- Allow the representation of a matrix or vector by blocks within ReSolve. +- Schur Complement Conjugate Gradient within HyKKT. +- Complete HyKKT integration within ReSolve. + + +## Dependencies + +Most HyKKT dependencies are identical to ReSolve. Exceptions are: +- HyKKT does not require KLU +- For AMD builds, HyKKT requires HIP/ROCm >= 6.4 due to a known bug in SpGEMM. diff --git a/resolve/hykkt/spgemm/CMakeLists.txt b/resolve/hykkt/spgemm/CMakeLists.txt new file mode 100644 index 000000000..37cbe654d --- /dev/null +++ b/resolve/hykkt/spgemm/CMakeLists.txt @@ -0,0 +1,56 @@ +#[[ + +@brief Build SpGEMM module for HyKKT solver + +@author Adham Ibrahim + +]] + +# Add source files for the SpGEMM module +set(SPGEMM_SRC + SpGEMM.cpp + SpGEMMCpu.cpp +) +set(SPGEMM_HEADER_INSTALL + SpGEMM.hpp + SpGEMMImpl.hpp + SpGEMMCpu.hpp + SpGEMMCuda.hpp + SpGEMMHip.hpp +) + +set(SPGEMM_CUDA_SRC + SpGEMMCuda.cpp +) + +set(SPGEMM_ROCM_SRC + SpGEMMHip.cpp +) + +# Build shared library ReSolve::hykkt +add_library(resolve_hykkt_spgemm SHARED ${SPGEMM_SRC}) + +target_link_libraries(resolve_hykkt_spgemm PRIVATE resolve_logger ${suitesparse_cholmod}) +target_include_directories(resolve_hykkt_spgemm PUBLIC ${SUITESPARSE_INCLUDE_DIR}) + +if (RESOLVE_USE_CUDA) + target_sources(resolve_hykkt_spgemm PRIVATE ${SPGEMM_CUDA_SRC}) + target_link_libraries(resolve_hykkt_spgemm PUBLIC resolve_backend_cuda) +endif() + +if (RESOLVE_USE_HIP) + target_sources(resolve_hykkt_spgemm PRIVATE ${SPGEMM_ROCM_SRC}) + target_link_libraries(resolve_hykkt_spgemm PUBLIC resolve_backend_hip) +endif() + +# Link to dummy device backend if GPU support is not enabled +if (NOT RESOLVE_USE_GPU) + target_link_libraries(resolve_hykkt_spgemm PUBLIC resolve_backend_cpu) +endif() + +target_include_directories(resolve_hykkt_spgemm INTERFACE + $ + $ +) + +install(FILES ${SPGEMM_HEADER_INSTALL} DESTINATION include/resolve/hykkt) diff --git a/resolve/hykkt/spgemm/SpGEMM.cpp b/resolve/hykkt/spgemm/SpGEMM.cpp new file mode 100644 index 
000000000..1170f919f --- /dev/null +++ b/resolve/hykkt/spgemm/SpGEMM.cpp @@ -0,0 +1,93 @@ +/** + * @file SpGEMM.cpp + * @author Adham Ibrahim (ibrahimas@ornl.gov) + */ + +#include "SpGEMM.hpp" + +#include "SpGEMMCpu.hpp" +#ifdef RESOLVE_USE_CUDA +#include "SpGEMMCuda.hpp" +#elif defined(RESOLVE_USE_HIP) +#include "SpGEMMHip.hpp" +#endif + +namespace ReSolve +{ + using real_type = ReSolve::real_type; + using out = ReSolve::io::Logger; + + namespace hykkt + { + /** + * Constructor for SpGEMM + * @param memspace[in] - Memory space for computation. + * @param alpha[in] - Scalar multiplier for the product. + * @param beta[in] - Scalar multiplier for the sum. + */ + SpGEMM::SpGEMM(memory::MemorySpace memspace, real_type alpha, real_type beta) + : memspace_(memspace) + { + if (memspace_ == memory::HOST) + { + impl_ = new SpGEMMCpu(alpha, beta); + } + else + { +#ifdef RESOLVE_USE_CUDA + impl_ = new SpGEMMCuda(alpha, beta); +#elif defined(RESOLVE_USE_HIP) + impl_ = new SpGEMMHip(alpha, beta); +#else + out::error() << "No GPU support enabled, and memory space set to DEVICE.\n"; + exit(1); +#endif + } + } + + /** + * Destructor for SpGEMM + */ + SpGEMM::~SpGEMM() + { + delete impl_; + } + + /** + * Loads the two matrices for the product + * @param A[in] - Pointer to CSR matrix + * @param B[in] - Pointer to CSR matrix + */ + void SpGEMM::loadProductMatrices(matrix::Csr* A, matrix::Csr* B) + { + impl_->loadProductMatrices(A, B); + } + + /** + * Loads the sum matrix for the operation + * @param D[in] - Pointer to CSR matrix + */ + void SpGEMM::loadSumMatrix(matrix::Csr* D) + { + impl_->loadSumMatrix(D); + } + + /** + * Loads the result matrix + * @param E[in] - Pointer to pointer to CSR matrix + */ + void SpGEMM::loadResultMatrix(matrix::Csr** E_ptr) + { + impl_->loadResultMatrix(E_ptr); + } + + /** + * Computes the result of the SpGEMM operation + * E = alpha * A * B + beta * D + */ + void SpGEMM::compute() + { + impl_->compute(); + } + } // namespace hykkt +} // namespace 
ReSolve diff --git a/resolve/hykkt/spgemm/SpGEMM.hpp b/resolve/hykkt/spgemm/SpGEMM.hpp new file mode 100644 index 000000000..edc3a267a --- /dev/null +++ b/resolve/hykkt/spgemm/SpGEMM.hpp @@ -0,0 +1,37 @@ +/** + * @file SpGEMM.hpp + * @author Adham Ibrahim (ibrahimas@ornl.gov) + */ + +#pragma once +#include "SpGEMMImpl.hpp" +#include +#include +#include + +namespace ReSolve +{ + using real_type = ReSolve::real_type; + + namespace hykkt + { + class SpGEMM + { + public: + // computes E = alpha * A * B + beta * D + SpGEMM(memory::MemorySpace memspace, real_type alpha, real_type beta); + ~SpGEMM(); + + void loadProductMatrices(matrix::Csr* A, matrix::Csr* B); + void loadSumMatrix(matrix::Csr* D); + void loadResultMatrix(matrix::Csr** E_ptr); + + void compute(); + + private: + memory::MemorySpace memspace_; + + SpGEMMImpl* impl_; + }; + } // namespace hykkt +} // namespace ReSolve diff --git a/resolve/hykkt/spgemm/SpGEMMCpu.cpp b/resolve/hykkt/spgemm/SpGEMMCpu.cpp new file mode 100644 index 000000000..8310b80a4 --- /dev/null +++ b/resolve/hykkt/spgemm/SpGEMMCpu.cpp @@ -0,0 +1,139 @@ +/** + * @file SpGEMMCpu.cpp + * @author Adham Ibrahim (ibrahimas@ornl.gov) + * @brief Implementation of SpGEMM using CHOLMOD for CPU + */ + +#include "SpGEMMCpu.hpp" + +namespace ReSolve +{ + using real_type = ReSolve::real_type; + + namespace hykkt + { + /** + * Constructor for SpGEMMCpu + * @param alpha[in] - Scalar multiplier for the product. + * @param beta[in] - Scalar multiplier for the sum. 
+ */ + SpGEMMCpu::SpGEMMCpu(real_type alpha, real_type beta) + : alpha_(alpha), beta_(beta) + { + cholmod_start(&Common_); + + A_ = nullptr; + B_ = nullptr; + D_ = nullptr; + } + + /** + * Destructor for SpGEMMCpu + */ + SpGEMMCpu::~SpGEMMCpu() + { + if (A_) + { + cholmod_free_sparse(&A_, &Common_); + } + if (B_) + { + cholmod_free_sparse(&B_, &Common_); + } + if (D_) + { + cholmod_free_sparse(&D_, &Common_); + } + cholmod_finish(&Common_); + } + + void SpGEMMCpu::loadProductMatrices(matrix::Csr* A, matrix::Csr* B) + { + if (!A_) + { + A_ = allocateCholmodType(A); + } + if (!B_) + { + B_ = allocateCholmodType(B); + } + copyDataToCholmodType(A, A_); + copyDataToCholmodType(B, B_); + } + + void SpGEMMCpu::loadSumMatrix(matrix::Csr* D) + { + if (!D_) + { + D_ = allocateCholmodType(D); + } + copyDataToCholmodType(D, D_); + } + + void SpGEMMCpu::loadResultMatrix(matrix::Csr** E_ptr) + { + E_ptr_ = E_ptr; + } + + /** + * @brief Computes the result of the SpGEMM operation + * + * Does not store partial results for reuse, as is this is not supported + * by the CHOLMOD package. 
+ */ + void SpGEMMCpu::compute() + { + cholmod_sparse* C_chol = cholmod_ssmult(B_, A_, 0, 1, 0, &Common_); + cholmod_sparse* E_chol = cholmod_add(C_chol, D_, &alpha_, &beta_, 1, 0, &Common_); + + if (!(*E_ptr_)) + { + *E_ptr_ = new matrix::Csr((index_type) E_chol->nrow, (index_type) E_chol->ncol, (index_type) E_chol->nzmax); + } + else + { + (*E_ptr_)->destroyMatrixData(memory::HOST); + } + + // Previous data must be de-allocated and new data copied + // Cholmod does not allow for reuse of arrays + (*E_ptr_)->copyDataFrom(static_cast(E_chol->p), + static_cast(E_chol->i), + static_cast(E_chol->x), + memory::HOST, + memory::HOST); + } + + /** + * @brief Allocates a CHOLMOD sparse matrix of the same size as the input CSR matrix + * @param A[in] - Pointer to CSR matrix + * @return Pointer to the allocated CHOLMOD sparse matrix + */ + cholmod_sparse* SpGEMMCpu::allocateCholmodType(matrix::Csr* A) + { + return cholmod_allocate_sparse((size_t) A->getNumRows(), + (size_t) A->getNumColumns(), + (size_t) A->getNnz(), + 1, + 1, + 0, + CHOLMOD_REAL, + &Common_); + } + + /** + * Copies the data from a CSR matrix to a CHOLMOD sparse matrix + * @param A[in] - Pointer to CSR matrix + * @param A_chol[in] - Pointer to CHOLMOD sparse matrix + */ + void SpGEMMCpu::copyDataToCholmodType(matrix::Csr* A, cholmod_sparse* A_chol) + { + mem_.copyArrayHostToHost( + static_cast(A_chol->p), A->getRowData(memory::HOST), A->getNumRows() + 1); + mem_.copyArrayHostToHost( + static_cast(A_chol->i), A->getColData(memory::HOST), A->getNnz()); + mem_.copyArrayHostToHost( + static_cast(A_chol->x), A->getValues(memory::HOST), A->getNnz()); + } + } // namespace hykkt +} // namespace ReSolve diff --git a/resolve/hykkt/spgemm/SpGEMMCpu.hpp b/resolve/hykkt/spgemm/SpGEMMCpu.hpp new file mode 100644 index 000000000..3db705546 --- /dev/null +++ b/resolve/hykkt/spgemm/SpGEMMCpu.hpp @@ -0,0 +1,43 @@ +#pragma once + +#include + +#include "SpGEMMImpl.hpp" + +namespace ReSolve +{ + using real_type = 
ReSolve::real_type; + + namespace hykkt + { + class SpGEMMCpu : public SpGEMMImpl + { + public: + SpGEMMCpu(real_type alpha, real_type beta); + ~SpGEMMCpu(); + + void loadProductMatrices(matrix::Csr* A, matrix::Csr* B); + void loadSumMatrix(matrix::Csr* D); + void loadResultMatrix(matrix::Csr** E_ptr); + + void compute(); + + private: + MemoryHandler mem_; + + real_type alpha_; + real_type beta_; + + cholmod_common Common_; + + cholmod_sparse* A_; + cholmod_sparse* B_; + cholmod_sparse* D_; + + matrix::Csr** E_ptr_; + + cholmod_sparse* allocateCholmodType(matrix::Csr* X); + void copyDataToCholmodType(matrix::Csr* X, cholmod_sparse* X_chol); + }; + } // namespace hykkt +} // namespace ReSolve diff --git a/resolve/hykkt/spgemm/SpGEMMCuda.cpp b/resolve/hykkt/spgemm/SpGEMMCuda.cpp new file mode 100644 index 000000000..e1283a20a --- /dev/null +++ b/resolve/hykkt/spgemm/SpGEMMCuda.cpp @@ -0,0 +1,301 @@ +/** + * @file SpGEMMCuda.cpp + * @author Adham Ibrahim (ibrahimas@ornl.gov) + * @brief Implementation of SpGEMM using cuSPARSE for GPU + */ + +#include "SpGEMMCuda.hpp" + +namespace ReSolve +{ + using real_type = ReSolve::real_type; + using out = ReSolve::io::Logger; + + namespace hykkt + { + SpGEMMCuda::SpGEMMCuda(real_type alpha, real_type beta) + : alpha_(alpha), beta_(beta) + { + cusparseCreate(&handle_); + cusparseSpGEMM_createDescr(&spgemm_desc_); + cusparseCreateMatDescr(&D_descr_); + cusparseSetMatType(D_descr_, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetMatIndexBase(D_descr_, CUSPARSE_INDEX_BASE_ZERO); + } + + SpGEMMCuda::~SpGEMMCuda() + { + cusparseSpGEMM_destroyDescr(spgemm_desc_); + cusparseDestroy(handle_); + } + + void SpGEMMCuda::loadProductMatrices(matrix::Csr* A, matrix::Csr* B) + { + A_descr_ = convertToCusparseType(A); + B_descr_ = convertToCusparseType(B); + + n_ = A->getNumRows(); + + if (!C_descr_) + { + mem_.allocateArrayOnDevice(&C_row_ptr_, (index_type) n_ + 1); + + cusparseCreateCsr(&C_descr_, + n_, + B->getNumColumns(), + 0, // nnz will be 
determined later + C_row_ptr_, + nullptr, + nullptr, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_64F); + } + } + + void SpGEMMCuda::loadSumMatrix(matrix::Csr* D) + { + D_ = D; + } + + void SpGEMMCuda::loadResultMatrix(matrix::Csr** E_ptr) + { + E_ptr_ = E_ptr; + } + + /** + * @brief Computes the result of the SpGEMM operation + * This uses both `cusparseSpGEMMreuse` and `cusparseXcsrgeam` to perform + * the product and sum in separate stages. This is required because + * cusparseSpGEMM does not handle the case where D has a different sparsity + * pattern than the product A * B + */ + void SpGEMMCuda::compute() + { + double beta_product = 0.0; + double alpha_sum = 1.0; + + if (!(*E_ptr_)) + { + size_t temp_buffer_size_1 = 0; + size_t temp_buffer_size_2 = 0; + size_t temp_buffer_size_3 = 0; + + void* temp_buffer_1 = nullptr; + void* temp_buffer_2 = nullptr; + void* temp_buffer_3 = nullptr; + + cusparseSpGEMMreuse_workEstimation(handle_, + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + A_descr_, + B_descr_, + C_descr_, + CUSPARSE_SPGEMM_DEFAULT, + spgemm_desc_, + &temp_buffer_size_1, + nullptr); + + mem_.allocateBufferOnDevice(&temp_buffer_1, temp_buffer_size_1); + + cusparseSpGEMMreuse_workEstimation(handle_, + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + A_descr_, + B_descr_, + C_descr_, + CUSPARSE_SPGEMM_DEFAULT, + spgemm_desc_, + &temp_buffer_size_1, + temp_buffer_1); + + cusparseSpGEMMreuse_nnz(handle_, + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + A_descr_, + B_descr_, + C_descr_, + CUSPARSE_SPGEMM_DEFAULT, + spgemm_desc_, + &temp_buffer_size_2, + nullptr, + &temp_buffer_size_3, + nullptr, + &buffer_size_4_, + nullptr); + + mem_.allocateBufferOnDevice(&temp_buffer_2, temp_buffer_size_2); + mem_.allocateBufferOnDevice(&temp_buffer_3, temp_buffer_size_3); + mem_.allocateBufferOnDevice(&buffer_4_, buffer_size_4_); + + cusparseSpGEMMreuse_nnz(handle_, 
+ CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + A_descr_, + B_descr_, + C_descr_, + CUSPARSE_SPGEMM_DEFAULT, + spgemm_desc_, + &temp_buffer_size_2, + temp_buffer_2, + &temp_buffer_size_3, + temp_buffer_3, + &buffer_size_4_, + buffer_4_); + + mem_.deleteOnDevice(temp_buffer_1); + mem_.deleteOnDevice(temp_buffer_2); + + int64_t C_num_cols = 0; + int64_t C_nnz_ = 0; + cusparseSpMatGetSize(C_descr_, &n_, &C_num_cols, &C_nnz_); + + mem_.allocateArrayOnDevice(&C_col_ind_, (index_type) C_nnz_); + mem_.allocateArrayOnDevice(&C_val_, (index_type) C_nnz_); + + cusparseCsrSetPointers(C_descr_, + C_row_ptr_, + C_col_ind_, + C_val_); + + cusparseSpGEMMreuse_copy(handle_, + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + A_descr_, + B_descr_, + C_descr_, + CUSPARSE_SPGEMM_DEFAULT, + spgemm_desc_, + &buffer_size_5_, + nullptr); + + mem_.allocateBufferOnDevice(&buffer_5_, buffer_size_5_); + + cusparseSpGEMMreuse_copy(handle_, + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + A_descr_, + B_descr_, + C_descr_, + CUSPARSE_SPGEMM_DEFAULT, + spgemm_desc_, + &buffer_size_5_, + buffer_5_); + + mem_.deleteOnDevice(temp_buffer_3); + } + + cusparseSpGEMMreuse_compute(handle_, + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha_, + A_descr_, + B_descr_, + &beta_product, + C_descr_, + CUDA_R_64F, + CUSPARSE_SPGEMM_DEFAULT, + spgemm_desc_); + + if (!(*E_ptr_)) + { + // Begin set up for addition + mem_.allocateArrayOnDevice(&E_row_ptr_, (index_type) n_ + 1); + cusparseSetPointerMode(handle_, CUSPARSE_POINTER_MODE_HOST); + + // pass in D_descr_ as a dummy variable + cusparseDcsrgeam2_bufferSizeExt(handle_, + D_->getNumRows(), + D_->getNumColumns(), + &alpha_sum, + D_descr_, + (int) C_nnz_, + C_val_, + C_row_ptr_, + C_col_ind_, + &beta_, + D_descr_, + D_->getNnz(), + D_->getValues(memory::DEVICE), + D_->getRowData(memory::DEVICE), + D_->getColData(memory::DEVICE), + D_descr_, + E_val_, + 
E_row_ptr_, + E_col_ind_, + &buffer_add_size_); + + mem_.allocateBufferOnDevice(&buffer_add_, buffer_add_size_); + + cusparseXcsrgeam2Nnz(handle_, + D_->getNumRows(), + D_->getNumColumns(), + D_descr_, + (int) C_nnz_, + C_row_ptr_, + C_col_ind_, + D_descr_, + D_->getNnz(), + D_->getRowData(memory::DEVICE), + D_->getColData(memory::DEVICE), + D_descr_, + E_row_ptr_, + &E_nnz_, + buffer_add_); + + mem_.allocateArrayOnDevice(&E_col_ind_, (index_type) E_nnz_); + mem_.allocateArrayOnDevice(&E_val_, (index_type) E_nnz_); + + (*E_ptr_) = new matrix::Csr((index_type) n_, D_->getNumColumns(), (index_type) E_nnz_); + (*E_ptr_)->setDataPointers(E_row_ptr_, E_col_ind_, E_val_, memory::DEVICE); + } + + cusparseDcsrgeam2(handle_, + D_->getNumRows(), + D_->getNumColumns(), + &alpha_sum, + D_descr_, + (int) C_nnz_, + C_val_, + C_row_ptr_, + C_col_ind_, + &beta_, + D_descr_, + D_->getNnz(), + D_->getValues(memory::DEVICE), + D_->getRowData(memory::DEVICE), + D_->getColData(memory::DEVICE), + D_descr_, + E_val_, + E_row_ptr_, + E_col_ind_, + buffer_add_); + + (*E_ptr_)->setUpdated(memory::DEVICE); + } + + /** + * @brief Converts a CSR matrix to a cuSPARSE sparse matrix descriptor + * @param A[in] - Pointer to CSR matrix + * @return cuSPARSE sparse matrix descriptor + */ + cusparseSpMatDescr_t SpGEMMCuda::convertToCusparseType(matrix::Csr* A) + { + cusparseSpMatDescr_t descr; + // TODO: this hardcodes the types but should be based on ReSolve types + cusparseCreateCsr(&descr, + A->getNumRows(), + A->getNumColumns(), + A->getNnz(), + A->getRowData(memory::DEVICE), + A->getColData(memory::DEVICE), + A->getValues(memory::DEVICE), + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_64F); + return descr; + } + } // namespace hykkt +} // namespace ReSolve diff --git a/resolve/hykkt/spgemm/SpGEMMCuda.hpp b/resolve/hykkt/spgemm/SpGEMMCuda.hpp new file mode 100644 index 000000000..8544a39c9 --- /dev/null +++ b/resolve/hykkt/spgemm/SpGEMMCuda.hpp @@ -0,0 +1,73 @@ 
+/** + * @file SpGEMMCuda.hpp + * @author Adham Ibrahim (ibrahimas@ornl.gov) + * @brief Header file for SpGEMM using cuSPARSE for GPU + */ + +#pragma once + +#include + +#include "SpGEMMImpl.hpp" + +namespace ReSolve +{ + using real_type = ReSolve::real_type; + + namespace hykkt + { + class SpGEMMCuda : public SpGEMMImpl + { + public: + SpGEMMCuda(real_type alpha, real_type beta); + ~SpGEMMCuda(); + + void loadProductMatrices(matrix::Csr* A, matrix::Csr* B); + void loadSumMatrix(matrix::Csr* D); + void loadResultMatrix(matrix::Csr** E_ptr); + + void compute(); + + private: + MemoryHandler mem_; + + real_type alpha_; + real_type beta_; + + matrix::Csr* D_ = nullptr; + matrix::Csr** E_ptr_ = nullptr; + + cusparseHandle_t handle_ = nullptr; + cusparseSpGEMMDescr_t spgemm_desc_ = nullptr; + + size_t buffer_size_4_ = 0; + size_t buffer_size_5_ = 0; + void* buffer_4_ = nullptr; + void* buffer_5_ = nullptr; + + size_t buffer_add_size_ = 0; + void* buffer_add_ = nullptr; + + cusparseSpMatDescr_t A_descr_ = nullptr; + cusparseSpMatDescr_t B_descr_ = nullptr; + cusparseSpMatDescr_t C_descr_ = nullptr; + + cusparseMatDescr_t D_descr_ = nullptr; + + int64_t n_ = 0; + int64_t C_nnz_ = 0; + + index_type* C_row_ptr_ = nullptr; + index_type* C_col_ind_ = nullptr; + real_type* C_val_ = nullptr; + + int E_nnz_ = 0; + + index_type* E_row_ptr_ = nullptr; + index_type* E_col_ind_ = nullptr; + real_type* E_val_ = nullptr; + + cusparseSpMatDescr_t convertToCusparseType(matrix::Csr* A); + }; + } // namespace hykkt +} // namespace ReSolve diff --git a/resolve/hykkt/spgemm/SpGEMMHip.cpp b/resolve/hykkt/spgemm/SpGEMMHip.cpp new file mode 100644 index 000000000..8350d43b2 --- /dev/null +++ b/resolve/hykkt/spgemm/SpGEMMHip.cpp @@ -0,0 +1,208 @@ +/** + * @file SpGEMMHip.cpp + * @author Adham Ibrahim (ibrahimas@ornl.gov) + * @brief Implementation of SpGEMM using rocSPARSE for GPU + */ + +#include "SpGEMMHip.hpp" + +namespace ReSolve +{ + using real_type = ReSolve::real_type; + using out = 
ReSolve::io::Logger; + + namespace hykkt + { + SpGEMMHip::SpGEMMHip(real_type alpha, real_type beta) + : alpha_(alpha), beta_(beta) + { + rocsparse_create_handle(&handle_); + } + + SpGEMMHip::~SpGEMMHip() + { + rocsparse_destroy_spmat_descr(A_descr_); + rocsparse_destroy_spmat_descr(B_descr_); + rocsparse_destroy_spmat_descr(D_descr_); + rocsparse_destroy_spmat_descr(E_descr_); + rocsparse_destroy_handle(handle_); + + mem_.deleteOnDevice(buffer_); + } + + void SpGEMMHip::loadProductMatrices(matrix::Csr* A, matrix::Csr* B) + { + if (A_descr_) + { + rocsparse_destroy_spmat_descr(A_descr_); + } + A_descr_ = convertToRocsparseType(A); + + if (B_descr_) + { + rocsparse_destroy_spmat_descr(B_descr_); + } + B_descr_ = convertToRocsparseType(B); + + E_num_rows_ = A->getNumRows(); + E_num_cols_ = B->getNumColumns(); + } + + void SpGEMMHip::loadSumMatrix(matrix::Csr* D) + { + if (D_descr_) + { + rocsparse_destroy_spmat_descr(D_descr_); + } + D_descr_ = convertToRocsparseType(D); + } + + void SpGEMMHip::loadResultMatrix(matrix::Csr** E_ptr) + { + if (!E_ptr_) + { + mem_.allocateArrayOnDevice(&E_row_ptr_, (index_type) E_num_rows_ + 1); + rocsparse_create_csr_descr(&E_descr_, + E_num_rows_, + E_num_cols_, + E_nnz_, + E_row_ptr_, + nullptr, + nullptr, + rocsparse_indextype_i32, + rocsparse_indextype_i32, + rocsparse_index_base_zero, + rocsparse_datatype_f64_r); + } + E_ptr_ = E_ptr; + } + + /** + * @brief Computes the result of the SpGEMM operation + * This uses `rocsparse_spgemm` with the symbolic and numeric stages + * separated to allow for efficient reuse. 
+ */ + void SpGEMMHip::compute() + { + rocsparse_status status; + if (!buffer_) // first computation + { + // Determine buffer size and allocate + status = rocsparse_spgemm(handle_, + rocsparse_operation_none, + rocsparse_operation_none, + &alpha_, + A_descr_, + B_descr_, + &beta_, + D_descr_, + E_descr_, + rocsparse_datatype_f64_r, + rocsparse_spgemm_alg_default, + rocsparse_spgemm_stage_buffer_size, + &buffer_size_, + nullptr); + if (status != rocsparse_status_success) + { + out::error() << "Failed to determine buffer size. Status: " << status << "\n"; + } + mem_.allocateBufferOnDevice(&buffer_, buffer_size_); + + // Determine number of nonzeros in result + status = rocsparse_spgemm(handle_, + rocsparse_operation_none, + rocsparse_operation_none, + &alpha_, + A_descr_, + B_descr_, + &beta_, + D_descr_, + E_descr_, + rocsparse_datatype_f64_r, + rocsparse_spgemm_alg_default, + rocsparse_spgemm_stage_nnz, + &buffer_size_, + buffer_); + if (status != rocsparse_status_success) + { + out::error() << "Failed to determine number of nonzeros. 
Status: " << status << "\n"; + } + + rocsparse_spmat_get_size(E_descr_, &E_num_rows_, &E_num_cols_, &E_nnz_); + + mem_.allocateArrayOnDevice(&E_col_ind_, (index_type) E_nnz_); + mem_.allocateArrayOnDevice(&E_val_, (index_type) E_nnz_); + + rocsparse_csr_set_pointers(E_descr_, E_row_ptr_, E_col_ind_, E_val_); + + *E_ptr_ = new matrix::Csr((index_type) E_num_rows_, (index_type) E_num_cols_, (index_type) E_nnz_); + (*E_ptr_)->setDataPointers(E_row_ptr_, E_col_ind_, E_val_, memory::DEVICE); + + // Fill the column indices of the result, the values will be computed next + status = rocsparse_spgemm(handle_, + rocsparse_operation_none, + rocsparse_operation_none, + &alpha_, + A_descr_, + B_descr_, + &beta_, + D_descr_, + E_descr_, + rocsparse_datatype_f64_r, + rocsparse_spgemm_alg_default, + rocsparse_spgemm_stage_symbolic, + &buffer_size_, + buffer_); + if (status != rocsparse_status_success) + { + out::error() << "Failed to perform symbolic stage. Status: " << status << "\n"; + } + } + + // SpGEMM numeric computation + status = rocsparse_spgemm(handle_, + rocsparse_operation_none, + rocsparse_operation_none, + &alpha_, + A_descr_, + B_descr_, + &beta_, + D_descr_, + E_descr_, + rocsparse_datatype_f64_r, + rocsparse_spgemm_alg_default, + rocsparse_spgemm_stage_numeric, + &buffer_size_, + buffer_); + if (status != rocsparse_status_success) + { + out::error() << "Failed to perform numeric stage. 
Status: " << status << "\n"; + } + + (*E_ptr_)->setUpdated(memory::DEVICE); + } + + /** + * @brief Converts a CSR matrix to a rocSPARSE sparse matrix descriptor + * @param A[in] - Pointer to CSR matrix + * @return rocSPARSE sparse matrix descriptor + */ + rocsparse_spmat_descr SpGEMMHip::convertToRocsparseType(matrix::Csr* A) + { + rocsparse_spmat_descr descr; + // TODO: this hardcodes the types but should be based on ReSolve types + rocsparse_create_csr_descr(&descr, + A->getNumRows(), + A->getNumColumns(), + A->getNnz(), + A->getRowData(memory::DEVICE), + A->getColData(memory::DEVICE), + A->getValues(memory::DEVICE), + rocsparse_indextype_i32, + rocsparse_indextype_i32, + rocsparse_index_base_zero, + rocsparse_datatype_f64_r); + return descr; + } + } // namespace hykkt +} // namespace ReSolve diff --git a/resolve/hykkt/spgemm/SpGEMMHip.hpp b/resolve/hykkt/spgemm/SpGEMMHip.hpp new file mode 100644 index 000000000..00627cd32 --- /dev/null +++ b/resolve/hykkt/spgemm/SpGEMMHip.hpp @@ -0,0 +1,60 @@ +/** + * @file SpGEMMHip.hpp + * @author Adham Ibrahim (ibrahimas@ornl.gov) + * @brief Header file for SpGEMM using rocSPARSE for GPU + */ + +#pragma once + +#include + +#include "SpGEMMImpl.hpp" + +namespace ReSolve +{ + using real_type = ReSolve::real_type; + + namespace hykkt + { + class SpGEMMHip : public SpGEMMImpl + { + public: + SpGEMMHip(real_type alpha, real_type beta); + ~SpGEMMHip(); + + void loadProductMatrices(matrix::Csr* A, matrix::Csr* B); + void loadSumMatrix(matrix::Csr* D); + void loadResultMatrix(matrix::Csr** E_ptr); + + void compute(); + + private: + MemoryHandler mem_; + + real_type alpha_; + real_type beta_; + + rocsparse_handle handle_ = nullptr; + rocsparse_spmat_descr A_descr_ = nullptr; + rocsparse_spmat_descr B_descr_ = nullptr; + rocsparse_spmat_descr D_descr_ = nullptr; + + long E_num_rows_ = 0; + long E_num_cols_ = 0; + long E_nnz_ = 0; + + index_type* E_row_ptr_ = nullptr; + index_type* E_col_ind_ = nullptr; + real_type* E_val_ = nullptr; + 
+ rocsparse_spmat_descr E_descr_ = nullptr; + + matrix::Csr** E_ptr_ = nullptr; + + size_t buffer_size_ = 0; + void* buffer_ = nullptr; + + rocsparse_spmat_descr convertToRocsparseType(matrix::Csr* A); + }; + } // namespace hykkt +} // namespace ReSolve diff --git a/resolve/hykkt/spgemm/SpGEMMImpl.hpp b/resolve/hykkt/spgemm/SpGEMMImpl.hpp new file mode 100644 index 000000000..ee8575dbf --- /dev/null +++ b/resolve/hykkt/spgemm/SpGEMMImpl.hpp @@ -0,0 +1,33 @@ +/** + * @file SpGEMMImpl.hpp + * @author Adham Ibrahim (ibrahimas@ornl.gov) + * @brief Interface for SpGEMM implementations + */ + +#pragma once + +#include +#include +#include +#include + +namespace ReSolve +{ + using real_type = ReSolve::real_type; + + namespace hykkt + { + class SpGEMMImpl + { + public: + SpGEMMImpl() = default; + virtual ~SpGEMMImpl() = default; + + virtual void loadProductMatrices(matrix::Csr* A, matrix::Csr* B) = 0; + virtual void loadSumMatrix(matrix::Csr* D) = 0; + virtual void loadResultMatrix(matrix::Csr** E_ptr) = 0; + + virtual void compute() = 0; + }; + } // namespace hykkt +} // namespace ReSolve diff --git a/tests/unit/hykkt/CMakeLists.txt b/tests/unit/hykkt/CMakeLists.txt index 34296b2b3..161c8f6fa 100644 --- a/tests/unit/hykkt/CMakeLists.txt +++ b/tests/unit/hykkt/CMakeLists.txt @@ -9,11 +9,11 @@ # Build HyKKT tests add_executable(runHykktPermutationTests.exe runHykktPermutationTests.cpp) - add_executable(runHykktRuizScalingTests.exe runHykktRuizScalingTests.cpp) +add_executable(runHykktSpGEMMTests.exe runHykktSpGEMMTests.cpp) target_link_libraries(runHykktPermutationTests.exe PRIVATE resolve_hykkt resolve_matrix) target_link_libraries(runHykktRuizScalingTests.exe PRIVATE resolve_hykkt resolve_hykkt_ruiz resolve_matrix) - +target_link_libraries(runHykktSpGEMMTests.exe PRIVATE resolve_hykkt resolve_hykkt_spgemm resolve_matrix) add_executable(runHykktCholeskyTests.exe runHykktCholeskyTests.cpp) target_link_libraries(runHykktCholeskyTests.exe PRIVATE resolve_hykkt_chol 
resolve_matrix resolve_vector) @@ -21,7 +21,8 @@ target_link_libraries(runHykktCholeskyTests.exe PRIVATE resolve_hykkt_chol resol # Install tests set(installable_tests runHykktPermutationTests.exe runHykktRuizScalingTests.exe - runHykktCholeskyTests.exe) + runHykktCholeskyTests.exe + runHykktSpGEMMTests.exe) install(TARGETS ${installable_tests} RUNTIME DESTINATION bin/resolve/tests/unit) @@ -29,3 +30,4 @@ install(TARGETS ${installable_tests} add_test(NAME hykkt_perm_test COMMAND $) add_test(NAME hykkt_ruiz_test COMMAND $) add_test(NAME hykkt_chol_test COMMAND $) +add_test(NAME hykkt_spgemm_test COMMAND $) diff --git a/tests/unit/hykkt/HykktSpGEMMTests.hpp b/tests/unit/hykkt/HykktSpGEMMTests.hpp new file mode 100644 index 000000000..d10b0c368 --- /dev/null +++ b/tests/unit/hykkt/HykktSpGEMMTests.hpp @@ -0,0 +1,414 @@ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace ReSolve +{ + namespace tests + { + /** + * @brief Tests for class hykkt::SpGEMM + * + */ + class HykktSpGEMMTests : public TestBase + { + public: + HykktSpGEMMTests(memory::MemorySpace memspace) + : memspace_(memspace) + { + } + + virtual ~HykktSpGEMMTests() + { + } + + /** + * @brief Test the solver on a minimal example + * + * @return TestOutcome the outcome of the test + */ + TestOutcome minimalCorrectness() + { + TestStatus status; + std::string testname(__func__); + + hykkt::SpGEMM spgemm(memspace_, 1.0, 1.0); + + matrix::Csr* A = nullptr; + matrix::Csr* B = nullptr; + matrix::Csr* D = nullptr; + + generateExampleData(&A, &B, &D); + + matrix::Csr* E = nullptr; + + spgemm.loadProductMatrices(A, B); + spgemm.loadSumMatrix(D); + spgemm.loadResultMatrix(&E); + spgemm.compute(); + + if (memspace_ == memory::DEVICE) + { + E->syncData(memory::HOST); + } + + status *= verifyResult(E, 1.0); + + delete A; + delete B; + delete D; + delete E; + + return status.report(testname.c_str()); + } + + /** + * @brief 
Generate an n by n example with known solution + * + * Generates lower bidiagonal A, upper bidiagonal B, and D with + * ones on upper diagonal. + * + * @param[in] n - The size of the test + */ + TestOutcome symbolic(index_type n) + { + TestStatus status; + std::string testname(__func__); + testname += ", n = " + std::to_string(n); + + hykkt::SpGEMM spgemm(memspace_, 1.0, 1.0); + + index_type nnz = 2 * n - 1; + + index_type* A_row_ptr = new index_type[n + 1]; + index_type* A_col_ind = new index_type[nnz]; + real_type* A_values = new real_type[nnz]; + + index_type* B_row_ptr = new index_type[n + 1]; + index_type* B_col_ind = new index_type[nnz]; + real_type* B_values = new real_type[nnz]; + + index_type* D_row_ptr = new index_type[n + 1]; + index_type* D_col_ind = new index_type[n - 2]; + real_type* D_values = new real_type[n - 2]; + + A_row_ptr[0] = 0; + A_row_ptr[1] = 1; + A_col_ind[0] = 0; + A_values[0] = 1.0; + + // A = [1 ] + // [2 1 ] + // [ 3 1 ] + // [ ... ] + for (index_type i = 1; i < n; i++) + { + A_col_ind[2 * i - 1] = i - 1; + A_values[2 * i - 1] = i + 1; + + A_col_ind[2 * i] = i; + A_values[2 * i] = 1.0; + + A_row_ptr[i + 1] = 2 + A_row_ptr[i]; + } + + // B = [1 2 ] + // [ 1 3 ] + // [ 1 ] + // [ ... ] + B_row_ptr[0] = 0; + for (index_type i = 0; i < n; i++) + { + B_col_ind[2 * i] = i; + B_values[2 * i] = 1.0; + + if (i < n - 1) + { + B_col_ind[2 * i + 1] = i + 1; + B_values[2 * i + 1] = i + 2; + B_row_ptr[i + 1] = 2 + B_row_ptr[i]; + } + else + { + B_row_ptr[i + 1] = 1 + B_row_ptr[i]; + } + } + + // D = [0 0 1 ] + // [ 0 0 1 ] + // [ 0 0 1] + // [ ... 
] + D_row_ptr[0] = 0; + for (index_type i = 0; i < n - 2; i++) + { + D_col_ind[i] = i + 2; + D_values[i] = 1.0; + D_row_ptr[i + 1] = 1 + D_row_ptr[i]; + } + D_row_ptr[n - 1] = D_row_ptr[n - 2]; + D_row_ptr[n] = D_row_ptr[n - 1]; + + matrix::Csr* A = new matrix::Csr(n, n, nnz); + matrix::Csr* B = new matrix::Csr(n, n, nnz); + matrix::Csr* D = new matrix::Csr(n, n, n - 2); + + A->copyDataFrom(A_row_ptr, A_col_ind, A_values, memory::HOST, memspace_); + B->copyDataFrom(B_row_ptr, B_col_ind, B_values, memory::HOST, memspace_); + D->copyDataFrom(D_row_ptr, D_col_ind, D_values, memory::HOST, memspace_); + + matrix::Csr* E = nullptr; + + spgemm.loadProductMatrices(A, B); + spgemm.loadSumMatrix(D); + spgemm.loadResultMatrix(&E); + spgemm.compute(); + + if (memspace_ == memory::DEVICE) + { + E->syncData(memory::HOST); + } + + double tol = 1e-6; + if (fabs(E->getValues(memory::HOST)[0] - 1.0) > tol) + { + std::cerr << "Test failed: E[0][0] = " << E->getValues(memory::HOST)[0] << ", expected: 1.0\n"; + status *= false; + } + + if (fabs(E->getValues(memory::HOST)[1] - 2.0) > tol) + { + std::cerr << "Test failed: E[0][1] = " << E->getValues(memory::HOST)[1] << ", expected: 2.0\n"; + status *= false; + } + + if (fabs(E->getValues(memory::HOST)[2] - 1.0) > tol) + { + std::cerr << "Test failed: E[0][2] = " << E->getValues(memory::HOST)[2] << ", expected: 1.0\n"; + status *= false; + } + + index_type j = 3; + for (index_type i = 1; i < n; i++) + { + if (fabs(E->getValues(memory::HOST)[j] - (i + 1)) > tol) + { + std::cerr << "Test failed: E[" << i << "][" << i - 1 << "] = " << E->getValues(memory::HOST)[j] << ", expected: " << (i + 1) << "\n"; + status *= false; + } + j++; + + if (fabs(E->getValues(memory::HOST)[j] - (1 + (i + 1) * (i + 1))) > tol) + { + std::cerr << "Test failed: E[" << i << "][" << i << "] = " << E->getValues(memory::HOST)[j] << ", expected: " << (1 + (i + 1) * (i + 1)) << "\n"; + status *= false; + } + j++; + + if (i < n - 1 && fabs(E->getValues(memory::HOST)[j] - 
(i + 2)) > tol) + { + std::cerr << "Test failed: E[" << i << "][" << i + 1 << "] = " << E->getValues(memory::HOST)[j] << ", expected: " << (i + 2) << "\n"; + status *= false; + } + if (i < n - 1) + j++; + + if (i < n - 2 && fabs(E->getValues(memory::HOST)[j] - (1.0)) > tol) + { + std::cerr << "Test failed: E[" << i << "][" << i + 2 << "] = " << E->getValues(memory::HOST)[j] << ", expected: " << (1.0) << "\n"; + status *= false; + } + if (i < n - 2) + j++; + } + + delete[] A_row_ptr; + delete[] A_col_ind; + delete[] A_values; + + delete[] B_row_ptr; + delete[] B_col_ind; + delete[] B_values; + + delete[] D_row_ptr; + delete[] D_col_ind; + delete[] D_values; + + delete A; + delete B; + delete D; + delete E; + + return status.report(testname.c_str()); + } + + TestOutcome reuse() + { + TestStatus status; + std::string testname(__func__); + + hykkt::SpGEMM spgemm(memspace_, 2.0, 2.0); + + matrix::Csr* A = nullptr; + matrix::Csr* B = nullptr; + matrix::Csr* D = nullptr; + + generateExampleData(&A, &B, &D); + + matrix::Csr* E = nullptr; + + spgemm.loadProductMatrices(A, B); + spgemm.loadSumMatrix(D); + spgemm.loadResultMatrix(&E); + spgemm.compute(); + + if (memspace_ == memory::DEVICE) + { + E->syncData(memory::HOST); + + A->syncData(memory::HOST); + D->syncData(memory::HOST); + } + + status *= verifyResult(E, 2.0); + + for (index_type j = 0; j < A->getNnz(); j++) + { + A->getValues(memory::HOST)[j] *= 2.0; + } + for (index_type j = 0; j < D->getNnz(); j++) + { + D->getValues(memory::HOST)[j] *= 2.0; + } + A->setUpdated(memory::HOST); + D->setUpdated(memory::HOST); + + if (memspace_ == memory::DEVICE) + { + A->syncData(memory::DEVICE); + D->syncData(memory::DEVICE); + } + + spgemm.loadProductMatrices(A, B); + spgemm.loadSumMatrix(D); + spgemm.compute(); + + if (memspace_ == memory::DEVICE) + { + E->syncData(memory::HOST); + } + + status *= verifyResult(E, 4.0); + + delete A; + delete B; + delete D; + delete E; + + return status.report(testname.c_str()); + } + + private: 
+ memory::MemorySpace memspace_; + + void generateExampleData(matrix::Csr** A, matrix::Csr** B, matrix::Csr** D) + { + index_type* A_row_ptr = new index_type[4]{0, 1, 3, 5}; + index_type* A_col_ind = new index_type[5]{0, 1, 2, 0, 2}; + real_type* A_values = new real_type[5]{1.0, 2.0, -4.0, 5.0, 3.0}; + + index_type* B_row_ptr = new index_type[4]{0, 1, 3, 5}; + index_type* B_col_ind = new index_type[5]{0, 0, 1, 1, 2}; + real_type* B_values = new real_type[5]{-1.0, -2.0, 3.0, 2.0, 4.0}; + + index_type* D_row_ptr = new index_type[4]{0, 2, 4, 6}; + index_type* D_col_ind = new index_type[6]{0, 2, 0, 1, 1, 2}; + real_type* D_values = new real_type[6]{3.0, -4.0, -2.0, 1.0, 5.0, 7.0}; + + *A = new matrix::Csr(3, 3, 5); + *B = new matrix::Csr(3, 3, 5); + *D = new matrix::Csr(3, 3, 6); + + (*A)->copyDataFrom(A_row_ptr, A_col_ind, A_values, memory::HOST, memspace_); + (*B)->copyDataFrom(B_row_ptr, B_col_ind, B_values, memory::HOST, memspace_); + (*D)->copyDataFrom(D_row_ptr, D_col_ind, D_values, memory::HOST, memspace_); + + delete[] A_row_ptr; + delete[] A_col_ind; + delete[] A_values; + + delete[] B_row_ptr; + delete[] B_col_ind; + delete[] B_values; + + delete[] D_row_ptr; + delete[] D_col_ind; + delete[] D_values; + } + + bool verifyResult(matrix::Csr* E, real_type multiplier) + { + bool is_correct = true; + + index_type* E_row_ptr_expected = new index_type[4]{0, 2, 5, 8}; + index_type* E_col_ind_expected = new index_type[8]{0, 2, 0, 1, 2, 0, 1, 2}; + real_type* E_values_expected = new real_type[8]{2.0, -4.0, -6.0, -1.0, -16, -5.0, 11, 19}; + + for (index_type i = 0; i < 8; i++) + { + E_values_expected[i] *= multiplier; + } + + real_type tol = 1e-8; + for (index_type j = 0; j < E->getNumRows() + 1; j++) + { + if (fabs(E->getRowData(memory::HOST)[j] - E_row_ptr_expected[j]) > tol) + { + std::cout << "Row pointer mismatch at index " << j << ": got " + << E->getRowData(memory::HOST)[j] << ", expected " << E_row_ptr_expected[j] << "\n"; + is_correct = false; + } + } + + for 
(index_type j = 0; j < E->getNnz(); j++) + { + if (fabs(E->getColData(memory::HOST)[j] - E_col_ind_expected[j]) > tol) + { + std::cout << "Column index mismatch at index " << j << ": got " + << E->getColData(memory::HOST)[j] << ", expected " << E_col_ind_expected[j] << "\n"; + is_correct = false; + } + } + + for (index_type j = 0; j < E->getNnz(); j++) + { + if (fabs(E->getValues(memory::HOST)[j] - E_values_expected[j]) > tol) + { + std::cout << "Value mismatch at index " << j << ": got " + << E->getValues(memory::HOST)[j] << ", expected " << E_values_expected[j] << "\n"; + is_correct = false; + } + } + + delete[] E_row_ptr_expected; + delete[] E_col_ind_expected; + delete[] E_values_expected; + + return is_correct; + } + + }; // class HykktSpGEMMTests + } // namespace tests +} // namespace ReSolve diff --git a/tests/unit/hykkt/runHykktSpGEMMTests.cpp b/tests/unit/hykkt/runHykktSpGEMMTests.cpp new file mode 100644 index 000000000..d257110f1 --- /dev/null +++ b/tests/unit/hykkt/runHykktSpGEMMTests.cpp @@ -0,0 +1,60 @@ +/** + * @file runHykktSpGEMMTests.cpp + * @author Adham Ibrahim (ibrahimas@ornl.gov) + * @brief Tests for class hykkt::SpGEMM + * + */ +#include +#include +#include + +#include +#include +#ifdef RESOLVE_USE_CUDA +#include +#endif +#ifdef RESOLVE_USE_HIP +#include +#endif + +#include "HykktSpGEMMTests.hpp" + +/** + * @brief Run tests with a given backend + * + * @param backend - string name of the hardware backend + * @param result - test results + */ +template +void runTests(const std::string& backend, ReSolve::memory::MemorySpace memspace, ReSolve::tests::TestingResults& result) +{ + std::cout << "Running tests on " << backend << " device:\n"; + + WorkspaceType workspace; + workspace.initializeHandles(); + + ReSolve::tests::HykktSpGEMMTests test(memspace); + + result += test.minimalCorrectness(); + result += test.symbolic(10); + result += test.symbolic(1000); + result += test.reuse(); + + std::cout << "\n"; +} + +int main(int, char**) +{ + 
ReSolve::tests::TestingResults result; + runTests("CPU", ReSolve::memory::HOST, result); + +#ifdef RESOLVE_USE_CUDA + runTests("CUDA", ReSolve::memory::DEVICE, result); +#endif + +#ifdef RESOLVE_USE_HIP + runTests("HIP", ReSolve::memory::DEVICE, result); +#endif + + return result.summary(); +}