85 changes: 59 additions & 26 deletions CMakeLists.txt
@@ -7,8 +7,14 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_FLAGS "-O3 -march=native")

option(ENABLE_SANITIZERS "Enable Clang sanitizers" OFF)
option(ENABLE_ROCSPARSE "Enable rocSPARSE" OFF)
option(ENABLE_CUSPARSE "Enable cuSPARSE" OFF)
option(ENABLE_SYCL_REFERENCE "Enable SYCL kernels in reference backend" OFF)

# Vendor backends
option(ENABLE_ONEMKL_SYCL "Enable oneMKL (SYCL) vendor backend" OFF)
option(ENABLE_ARMPL "Enable ArmPL vendor backend" OFF)
option(ENABLE_ROCSPARSE "Enable rocSPARSE vendor backend" OFF)
option(ENABLE_CUSPARSE "Enable cuSPARSE vendor backend" OFF)
option(ENABLE_AOCLSPARSE "Enable AOCL-Sparse vendor backend" OFF)

# Get includes, which declares the `spblas` library
add_subdirectory(include)
@@ -20,9 +26,43 @@ endif()
# Download dependencies
include(FetchContent)

# Enable sanitizers
if (ENABLE_SANITIZERS)
set(SANITIZER_FLAGS "-fsanitize=address,undefined")
target_compile_options(spblas INTERFACE ${SANITIZER_FLAGS} -g -O1 -fno-omit-frame-pointer)
target_link_options(spblas INTERFACE ${SANITIZER_FLAGS})
endif()

# Initialize backend flags
set(SPBLAS_CPU_BACKEND OFF)
set(SPBLAS_GPU_BACKEND OFF)

if (ENABLE_SYCL_REFERENCE)
if (ENABLE_ONEMKL_SYCL OR ENABLE_ARMPL OR ENABLE_ROCSPARSE OR ENABLE_CUSPARSE OR ENABLE_AOCLSPARSE)
message(FATAL_ERROR "SYCL reference backend cannot be enabled together with vendor backends")
endif()

# Check for SYCL support
include(CheckCXXCompilerFlag)
check_cxx_compiler_flag("-fsycl" COMPILER_SUPPORTS_SYCL)

FetchContent_Declare(
sycl_thrust
GIT_REPOSITORY https://github.com/SparseBLAS/sycl-thrust.git
GIT_TAG main)
FetchContent_MakeAvailable(sycl_thrust)

if(COMPILER_SUPPORTS_SYCL)
target_compile_options(spblas INTERFACE -fsycl -fsycl-device-code-split=per_kernel)
# target_compile_options(spblas INTERFACE -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend=nvptx64-nvidia-cuda --offload-arch=sm_90)

else()
message(FATAL_ERROR "Compiler does not support SYCL (-fsycl flag not available)")
endif()

target_compile_definitions(spblas INTERFACE SPBLAS_ENABLE_SYCL_REFERENCE)
endif()

if (ENABLE_ONEMKL_SYCL)
set(SPBLAS_CPU_BACKEND ON)
set(SPBLAS_GPU_BACKEND ON)
@@ -47,6 +87,23 @@ if (ENABLE_ARMPL)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSPBLAS_ENABLE_ARMPL")
endif()

if (ENABLE_ROCSPARSE)
set(SPBLAS_GPU_BACKEND ON)
project(spblas LANGUAGES HIP)
find_package(hip REQUIRED)
find_package(rocsparse REQUIRED)
target_link_libraries(spblas INTERFACE roc::rocsparse hip::host)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSPBLAS_ENABLE_ROCSPARSE")
set(CMAKE_HIP_FLAGS "${CMAKE_CXX_FLAGS}")
endif()

if (ENABLE_CUSPARSE)
set(SPBLAS_GPU_BACKEND ON)
find_package(CUDAToolkit REQUIRED)
target_link_libraries(spblas INTERFACE CUDA::cudart CUDA::cusparse CUDA::cublas)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSPBLAS_ENABLE_CUSPARSE")
endif()

if (ENABLE_AOCLSPARSE)
set(SPBLAS_CPU_BACKEND ON)
if (NOT DEFINED ENV{AOCLSPARSE_DIR})
@@ -77,23 +134,6 @@
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSPBLAS_ENABLE_AOCLSPARSE")
endif()

if (ENABLE_ROCSPARSE)
set(SPBLAS_GPU_BACKEND ON)
project(spblas LANGUAGES HIP)
find_package(hip REQUIRED)
find_package(rocsparse REQUIRED)
target_link_libraries(spblas INTERFACE roc::rocsparse hip::host)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSPBLAS_ENABLE_ROCSPARSE")
set(CMAKE_HIP_FLAGS "${CMAKE_CXX_FLAGS}")
endif()

if (ENABLE_CUSPARSE)
set(SPBLAS_GPU_BACKEND ON)
find_package(CUDAToolkit REQUIRED)
target_link_libraries(spblas INTERFACE CUDA::cudart CUDA::cusparse CUDA::cublas)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSPBLAS_ENABLE_CUSPARSE")
endif()

# If no vendor backend is enabled, enable CPU backend for reference implementation
if (NOT ENABLE_ONEMKL_SYCL AND
NOT ENABLE_ARMPL AND
@@ -108,13 +148,6 @@ if (LOG_LEVEL)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLOG_LEVEL=${LOG_LEVEL}") # SPBLAS_DEBUG | SPBLAS_WARNING | SPBLAS_TRACE | SPBLAS_INFO
endif()

# Enable sanitizers
if (ENABLE_SANITIZERS)
set(SANITIZER_FLAGS "-fsanitize=address,undefined")
target_compile_options(spblas INTERFACE ${SANITIZER_FLAGS} -g -O1 -fno-omit-frame-pointer)
target_link_options(spblas INTERFACE ${SANITIZER_FLAGS})
endif()

# mdspan
FetchContent_Declare(
mdspan
4 changes: 4 additions & 0 deletions examples/CMakeLists.txt
@@ -23,3 +23,7 @@ if (SPBLAS_GPU_BACKEND)
add_subdirectory(rocsparse)
endif()
endif()

if (ENABLE_SYCL_REFERENCE)
add_subdirectory(sycl_reference)
endif()
7 changes: 7 additions & 0 deletions examples/sycl_reference/CMakeLists.txt
@@ -0,0 +1,7 @@

function(add_sycl_example example_name)
add_executable(${example_name} ${example_name}.cpp)
target_link_libraries(${example_name} spblas fmt sycl_thrust)
endfunction()

add_sycl_example(sycl_spmm)
167 changes: 167 additions & 0 deletions examples/sycl_reference/sycl_spmm.cpp
@@ -0,0 +1,167 @@
#include <algorithm>
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <limits>
#include <vector>

#include <spblas/spblas.hpp>

#include <spblas/backend/sycl/spmm_impl.hpp>

#include <thrust/copy.h>
#include <thrust/device_vector.h>

#include <cassert>

#include <cmath>
#include <fmt/core.h>
#include <fmt/ranges.h>

int main(int argc, char** argv) {
using value_t = float;
using index_t = int32_t;
using offset_t = int32_t;
namespace md = spblas::__mdspan;

offset_t nnz_row = 100;

index_t m = 100000;
index_t n = 1;
index_t k = 100000;

char method = 'k';

std::size_t wg_size = 32;

if (argc >= 2) {
m = std::atoll(argv[1]);
}

if (argc >= 3) {
k = std::atoll(argv[2]);
}

if (argc >= 4) {
n = std::atoll(argv[3]);
}

if (argc >= 5) {
nnz_row = std::atoll(argv[4]);
}

if (argc >= 6) {
method = argv[5][0];
}

if (argc >= 7) {
wg_size = std::atoll(argv[6]);
}

assert(method == 'k' || method == 'j');

fmt::print("Multiplying {} x {} matrix with {} nnz/row by {} columns.\n", m,
k, nnz_row, n);
fmt::print("Using method {} with WG size {}\n", method, wg_size);

offset_t nnz_in = m * nnz_row;

auto&& [values, rowptr, colind, shape, nnz] =
spblas::generate_csr<value_t, index_t, offset_t>(m, k, nnz_in);

thrust::device_vector<value_t> d_values(values);
thrust::device_vector<offset_t> d_rowptr(rowptr);
thrust::device_vector<index_t> d_colind(colind);

spblas::csr_view<value_t, index_t, offset_t> a(
d_values.data().get(), d_rowptr.data().get(), d_colind.data().get(),
shape, nnz);

std::vector<value_t> b_values(k * n, 1);
std::vector<value_t> c_values(m * n, 0);

thrust::device_vector<value_t> d_b(b_values);
thrust::device_vector<value_t> d_c(c_values);

md::mdspan b(d_b.data().get(), k, n);
md::mdspan c(d_c.data().get(), m, n);

sycl::queue q(sycl::gpu_selector_v);

if (method == 'k') {
spblas::spmm_wgsplitk(q, a, b, c, wg_size);
} else {
spblas::spmm_wgsplitj(q, a, b, c, wg_size);
}

thrust::copy(d_c.begin(), d_c.end(), c_values.begin());

std::vector<value_t> c_ref(m * n, 0);

spblas::csr_view<value_t, index_t, offset_t> a_view(
values.data(), rowptr.data(), colind.data(), shape, nnz);
md::mdspan b_view(b_values.data(), k, n);
md::mdspan c_view(c_ref.data(), m, n);

spblas::multiply(a_view, b_view, c_view);

// Compare results
const float epsilon = 64 * std::numeric_limits<float>::epsilon();
const float abs_th = std::numeric_limits<float>::min();
bool results_match = true;

for (std::size_t i = 0; i < c_ref.size(); ++i) {
float diff = std::abs(c_ref[i] - c_values[i]);
float norm = std::min(std::abs(c_ref[i]) + std::abs(c_values[i]),
std::numeric_limits<float>::max());
float abs_error = std::max(abs_th, epsilon * norm);

if (diff > abs_error) {
results_match = false;
std::cout << "Mismatch at index " << i << ": "
<< "SYCL result = " << c_values[i]
<< ", Reference = " << c_ref[i] << "\n";
break;
}
}

if (results_match) {
fmt::print("OK!\n");
} else {
fmt::print("Error!\n");
return 1;
}

double gb = 1e-9 * (nnz * sizeof(value_t) + nnz * sizeof(index_t) +
(m + 1) * sizeof(offset_t) + k * n * sizeof(value_t) +
m * n * sizeof(value_t));

Comment on lines +158 to +161

Contributor:
SpMM is one of the few sparse algorithms that has the potential of getting into the compute-bound region instead of just being memory bound, so calculating GFLOPS is also helpful. All the others should just be evaluated against the GB/s memory limits.

Contributor:
It can happen because of the potential reuse of the dense B matrix from cache if we are careful, while streaming the A matrix and limiting accesses to C (along with trying not to cache C at all).

Contributor:
In my opinion, for measuring the peak performance of a kernel, it is a good idea to have a warmup loop with several untimed iterations, then a timed run loop that in aggregate takes on the order of seconds, or at least milliseconds, to run, with the average time per run computed and recorded. This increases the chance of repeatable, stable measurements over time and makes runs much more comparable.

Collaborator (author):
I've made a few updates that do both things: compute GFLOPS in addition to the bandwidth achieved, and do up to a 2-second warmup before timing.
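
A minimal sketch of that measurement scheme is shown below; it is not the exact code added in this PR, and the run_spmm helper, the two-second warmup budget, and the 2 * nnz * n flop count are illustrative assumptions. It reuses q, a, b, c, gb, nnz, n, method, and wg_size from the surrounding example and assumes the spmm calls block until the kernel has completed.

// Sketch only: untimed warmup for roughly two seconds, then one timed run
// reporting both GB/s and GFLOP/s.
auto run_spmm = [&] {
  if (method == 'k') {
    spblas::spmm_wgsplitk(q, a, b, c, wg_size);
  } else {
    spblas::spmm_wgsplitj(q, a, b, c, wg_size);
  }
};

const double warmup_seconds = 2.0; // assumed warmup budget
auto warmup_begin = std::chrono::high_resolution_clock::now();
while (std::chrono::duration<double>(std::chrono::high_resolution_clock::now() -
                                     warmup_begin)
           .count() < warmup_seconds) {
  run_spmm();
}

// SpMM performs roughly 2 * nnz * n flops: one multiply and one add per
// stored entry of A, for each of the n columns of B.
const double gflop = 1e-9 * 2.0 * double(nnz) * double(n);

auto t0 = std::chrono::high_resolution_clock::now();
run_spmm();
auto t1 = std::chrono::high_resolution_clock::now();
double seconds = std::chrono::duration<double>(t1 - t0).count();
fmt::print("{} GB/s, {} GFLOP/s\n", gb / seconds, gflop / seconds);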

std::size_t n_iterations = 10;

std::vector<double> durations;
durations.reserve(n_iterations);

double max_bw = 456; // assumed peak memory bandwidth of the target GPU, in GB/s

for (std::size_t i = 0; i < n_iterations; i++) {
auto begin = std::chrono::high_resolution_clock::now();
if (method == 'k') {
spblas::spmm_wgsplitk(q, a, b, c, wg_size);
} else {
spblas::spmm_wgsplitj(q, a, b, c, wg_size);
}
auto end = std::chrono::high_resolution_clock::now();
double duration = std::chrono::duration<double>(end - begin).count();
double gb_s = gb / duration;

fmt::print("Completed in {} s (achieved {} GB/s)\n", duration, gb_s);

durations.push_back(duration);
}

fmt::print("Durations: {}\n", durations);

std::sort(durations.begin(), durations.end());

double median_duration = durations[durations.size() / 2];

double median_gb_s = gb / median_duration;

fmt::print("Median duration {} ({} GB/s) {}% of peak.\n", median_duration,
median_gb_s, 100 * (median_gb_s / max_bw));

return 0;
}
5 changes: 5 additions & 0 deletions include/spblas/algorithms/algorithms.hpp
@@ -8,6 +8,11 @@
#ifndef SPBLAS_VENDOR_BACKEND
#include <spblas/algorithms/multiply_impl.hpp>
#include <spblas/algorithms/triangular_solve_impl.hpp>

#ifdef SPBLAS_ENABLE_SYCL_REFERENCE
#include <spblas/backend/sycl/multiply_impl.hpp>
#endif

#endif

#include <spblas/algorithms/add.hpp>
3 changes: 3 additions & 0 deletions include/spblas/backend/sycl/multiply_impl.hpp
@@ -0,0 +1,3 @@
#pragma once

#include <spblas/backend/sycl/spmm_impl.hpp>