85 changes: 59 additions & 26 deletions CMakeLists.txt
@@ -7,8 +7,14 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_FLAGS "-O3 -march=native")

option(ENABLE_SANITIZERS "Enable Clang sanitizers" OFF)
option(ENABLE_ROCSPARSE "Enable rocSPARSE" OFF)
option(ENABLE_CUSPARSE "Enable cuSPARSE" OFF)
option(ENABLE_SYCL_REFERENCE "Enable SYCL kernels in reference backend" OFF)

# Vendor backends
option(ENABLE_ONEMKL_SYCL "Enable oneMKL (SYCL) vendor backend" OFF)
option(ENABLE_ARMPL "Enable ArmPL vendor backend" OFF)
option(ENABLE_ROCSPARSE "Enable rocSPARSE vendor backend" OFF)
option(ENABLE_CUSPARSE "Enable cuSPARSE vendor backend" OFF)
option(ENABLE_AOCLSPARSE "Enable AOCL-Sparse vendor backend" OFF)

# Get includes, which declares the `spblas` library
add_subdirectory(include)
@@ -20,9 +26,43 @@ endif()
# Download dependencies
include(FetchContent)

# Enable sanitizers
if (ENABLE_SANITIZERS)
set(SANITIZER_FLAGS "-fsanitize=address,undefined")
target_compile_options(spblas INTERFACE ${SANITIZER_FLAGS} -g -O1 -fno-omit-frame-pointer)
target_link_options(spblas INTERFACE ${SANITIZER_FLAGS})
endif()

# Initialize backend flags
set(SPBLAS_CPU_BACKEND OFF)
set(SPBLAS_GPU_BACKEND OFF)

if (ENABLE_SYCL_REFERENCE)
if (ENABLE_ONEMKL_SYCL OR ENABLE_ARMPL OR ENABLE_ROCSPARSE OR ENABLE_CUSPARSE OR ENABLE_AOCLSPARSE)
message(FATAL_ERROR "SYCL reference backend cannot be enabled together with vendor backends")
endif()

# Check for SYCL support
include(CheckCXXCompilerFlag)
check_cxx_compiler_flag("-fsycl" COMPILER_SUPPORTS_SYCL)

FetchContent_Declare(
sycl_thrust
GIT_REPOSITORY https://github.com/SparseBLAS/sycl-thrust.git
GIT_TAG main)
FetchContent_MakeAvailable(sycl_thrust)

if(COMPILER_SUPPORTS_SYCL)
target_compile_options(spblas INTERFACE -fsycl -fsycl-device-code-split=per_kernel)
# target_compile_options(spblas INTERFACE -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend=nvptx64-nvidia-cuda --offload-arch=sm_90)

else()
message(FATAL_ERROR "Compiler does not support SYCL (-fsycl flag not available)")
endif()

target_compile_definitions(spblas INTERFACE SPBLAS_ENABLE_SYCL_REFERENCE)
endif()

if (ENABLE_ONEMKL_SYCL)
set(SPBLAS_CPU_BACKEND ON)
set(SPBLAS_GPU_BACKEND ON)
@@ -47,6 +87,23 @@ if (ENABLE_ARMPL)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSPBLAS_ENABLE_ARMPL")
endif()

if (ENABLE_ROCSPARSE)
set(SPBLAS_GPU_BACKEND ON)
project(spblas LANGUAGES HIP)
find_package(hip REQUIRED)
find_package(rocsparse REQUIRED)
target_link_libraries(spblas INTERFACE roc::rocsparse hip::host)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSPBLAS_ENABLE_ROCSPARSE")
set(CMAKE_HIP_FLAGS "${CMAKE_CXX_FLAGS}")
endif()

if (ENABLE_CUSPARSE)
set(SPBLAS_GPU_BACKEND ON)
find_package(CUDAToolkit REQUIRED)
target_link_libraries(spblas INTERFACE CUDA::cudart CUDA::cusparse CUDA::cublas)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSPBLAS_ENABLE_CUSPARSE")
endif()

if (ENABLE_AOCLSPARSE)
set(SPBLAS_CPU_BACKEND ON)
if (NOT DEFINED ENV{AOCLSPARSE_DIR})
@@ -77,23 +134,6 @@
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSPBLAS_ENABLE_AOCLSPARSE")
endif()

if (ENABLE_ROCSPARSE)
set(SPBLAS_GPU_BACKEND ON)
project(spblas LANGUAGES HIP)
find_package(hip REQUIRED)
find_package(rocsparse REQUIRED)
target_link_libraries(spblas INTERFACE roc::rocsparse hip::host)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSPBLAS_ENABLE_ROCSPARSE")
set(CMAKE_HIP_FLAGS "${CMAKE_CXX_FLAGS}")
endif()

if (ENABLE_CUSPARSE)
set(SPBLAS_GPU_BACKEND ON)
find_package(CUDAToolkit REQUIRED)
target_link_libraries(spblas INTERFACE CUDA::cudart CUDA::cusparse CUDA::cublas)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSPBLAS_ENABLE_CUSPARSE")
endif()

# If no vendor backend is enabled, enable CPU backend for reference implementation
if (NOT ENABLE_ONEMKL_SYCL AND
NOT ENABLE_ARMPL AND
@@ -108,13 +148,6 @@ if (LOG_LEVEL)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLOG_LEVEL=${LOG_LEVEL}") # SPBLAS_DEBUG | SPBLAS_WARNING | SPBLAS_TRACE | SPBLAS_INFO
endif()

# Enable sanitizers
if (ENABLE_SANITIZERS)
set(SANITIZER_FLAGS "-fsanitize=address,undefined")
target_compile_options(spblas INTERFACE ${SANITIZER_FLAGS} -g -O1 -fno-omit-frame-pointer)
target_link_options(spblas INTERFACE ${SANITIZER_FLAGS})
endif()

# mdspan
FetchContent_Declare(
mdspan
4 changes: 4 additions & 0 deletions examples/CMakeLists.txt
@@ -23,3 +23,7 @@ if (SPBLAS_GPU_BACKEND)
add_subdirectory(rocsparse)
endif()
endif()

if (ENABLE_SYCL_REFERENCE)
add_subdirectory(sycl_reference)
endif()
7 changes: 7 additions & 0 deletions examples/sycl_reference/CMakeLists.txt
@@ -0,0 +1,7 @@

function(add_sycl_example example_name)
add_executable(${example_name} ${example_name}.cpp)
target_link_libraries(${example_name} spblas fmt sycl_thrust)
endfunction()

add_sycl_example(sycl_spmm)
167 changes: 167 additions & 0 deletions examples/sycl_reference/sycl_spmm.cpp
@@ -0,0 +1,167 @@
#include <algorithm>
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <limits>
#include <vector>

#include <spblas/spblas.hpp>

#include <spblas/backend/sycl/spmm_impl.hpp>

#include <thrust/copy.h>
#include <thrust/device_vector.h>

#include <cassert>

#include <cmath>
#include <fmt/core.h>
#include <fmt/ranges.h>

int main(int argc, char** argv) {
using value_t = float;
using index_t = int32_t;
using offset_t = int32_t;
namespace md = spblas::__mdspan;

offset_t nnz_row = 100;

index_t m = 100000;
index_t n = 1;
index_t k = 100000;

char method = 'k';

std::size_t wg_size = 32;

if (argc >= 2) {
m = std::atoll(argv[1]);
}

if (argc >= 3) {
k = std::atoll(argv[2]);
}

if (argc >= 4) {
n = std::atoll(argv[3]);
}

if (argc >= 5) {
nnz_row = std::atoll(argv[4]);
}

if (argc >= 6) {
method = argv[5][0];
}

if (argc >= 7) {
wg_size = std::atoll(argv[6]);
}

assert(method == 'k' || method == 'j');

fmt::print("Multiplying {} x {} matrix with {} nnz/row by {} columns.\n", m,
k, nnz_row, n);
fmt::print("Using method {} with WG size {}\n", method, wg_size);

offset_t nnz_in = m * nnz_row;

auto&& [values, rowptr, colind, shape, nnz] =
spblas::generate_csr<value_t, index_t, offset_t>(m, k, nnz_in);

thrust::device_vector<value_t> d_values(values);
thrust::device_vector<offset_t> d_rowptr(rowptr);
thrust::device_vector<index_t> d_colind(colind);

spblas::csr_view<value_t, index_t, offset_t> a(
d_values.data().get(), d_rowptr.data().get(), d_colind.data().get(),
shape, nnz);

std::vector<value_t> b_values(k * n, 1);
std::vector<value_t> c_values(m * n, 0);

thrust::device_vector<value_t> d_b(b_values);
thrust::device_vector<value_t> d_c(c_values);

md::mdspan b(d_b.data().get(), k, n);
md::mdspan c(d_c.data().get(), m, n);

sycl::queue q(sycl::gpu_selector_v);

if (method == 'k') {
spblas::spmm_wgsplitk(q, a, b, c, wg_size);
} else {
spblas::spmm_wgsplitj(q, a, b, c, wg_size);
}

thrust::copy(d_c.begin(), d_c.end(), c_values.begin());

std::vector<value_t> c_ref(m * n, 0);

spblas::csr_view<value_t, index_t, offset_t> a_view(
values.data(), rowptr.data(), colind.data(), shape, nnz);
md::mdspan b_view(b_values.data(), k, n);
md::mdspan c_view(c_ref.data(), m, n);

spblas::multiply(a_view, b_view, c_view);

// Compare results
const float epsilon = 64 * std::numeric_limits<float>::epsilon();
const float abs_th = std::numeric_limits<float>::min();
bool results_match = true;

for (std::size_t i = 0; i < c_ref.size(); ++i) {
float diff = std::abs(c_ref[i] - c_values[i]);
float norm = std::min(std::abs(c_ref[i]) + std::abs(c_values[i]),
std::numeric_limits<float>::max());
float abs_error = std::max(abs_th, epsilon * norm);

if (diff > abs_error) {
results_match = false;
std::cout << "Mismatch at index " << i << ": "
<< "SYCL result = " << c_values[i]
<< ", Reference = " << c_ref[i] << "\n";
break;
}
}

if (results_match) {
fmt::print("OK!\n");
} else {
fmt::print("Error!\n");
return 1;
}

double gb = 1e-9 * (nnz * sizeof(value_t) + nnz * sizeof(index_t) +
(m + 1) * sizeof(offset_t) + k * n * sizeof(value_t) +
m * n * sizeof(value_t));

Comment on lines +158 to +161

Contributor:
SpMM is one of the few sparse algorithms that has the potential of getting into the compute-bound region instead of just being memory bound, so calculating GFLOPS is also helpful. All the others should just be evaluated against the GB/s memory limits.

Contributor:
It can happen because of the potential reuse of the dense B matrix from cache if we are careful, while streaming the A matrix and limiting accesses to C (along with trying not to cache C at all).

Contributor:
In my opinion, for measuring the peak performance of a kernel, it is a good idea to have a warmup loop with several untimed iterations, then a timed run loop that in aggregate takes on the order of seconds, or at least milliseconds, to run, with the average time per run computed and recorded. This increases the chance of repeatable, stable measurements over time and makes runs much more comparable.

Collaborator (author):
I've made a few updates that do both things: compute GFLOPS in addition to the bandwidth achieved, and do up to a 2-second warmup before timing.
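
A minimal sketch of that measurement scheme is shown below; it is not the exact code added in this PR, and the run_spmm helper, the two-second warmup budget, and the 2 * nnz * n flop count are illustrative assumptions. It reuses q, a, b, c, gb, nnz, n, method, and wg_size from the surrounding example and assumes the spmm calls block until the kernel has completed.

// Sketch only: untimed warmup for roughly two seconds, then one timed run
// reporting both GB/s and GFLOP/s.
auto run_spmm = [&] {
  if (method == 'k') {
    spblas::spmm_wgsplitk(q, a, b, c, wg_size);
  } else {
    spblas::spmm_wgsplitj(q, a, b, c, wg_size);
  }
};

const double warmup_seconds = 2.0; // assumed warmup budget
auto warmup_begin = std::chrono::high_resolution_clock::now();
while (std::chrono::duration<double>(std::chrono::high_resolution_clock::now() -
                                     warmup_begin)
           .count() < warmup_seconds) {
  run_spmm();
}

// SpMM performs roughly 2 * nnz * n flops: one multiply and one add per
// stored entry of A, for each of the n columns of B.
const double gflop = 1e-9 * 2.0 * double(nnz) * double(n);

auto t0 = std::chrono::high_resolution_clock::now();
run_spmm();
auto t1 = std::chrono::high_resolution_clock::now();
double seconds = std::chrono::duration<double>(t1 - t0).count();
fmt::print("{} GB/s, {} GFLOP/s\n", gb / seconds, gflop / seconds);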

std::size_t n_iterations = 10;

std::vector<double> durations;
durations.reserve(n_iterations);

double max_bw = 456; // assumed peak memory bandwidth of the target GPU, in GB/s

for (std::size_t i = 0; i < n_iterations; i++) {
auto begin = std::chrono::high_resolution_clock::now();
if (method == 'k') {
spblas::spmm_wgsplitk(q, a, b, c, wg_size);
} else {
spblas::spmm_wgsplitj(q, a, b, c, wg_size);
}
auto end = std::chrono::high_resolution_clock::now();
double duration = std::chrono::duration<double>(end - begin).count();
double gb_s = gb / duration;

fmt::print("Completed in {} s (achieved {} GB/s)\n", duration, gb_s);

durations.push_back(duration);
}

fmt::print("Durations: {}\n", durations);

std::sort(durations.begin(), durations.end());

double median_duration = durations[durations.size() / 2];

double median_gb_s = gb / median_duration;

fmt::print("Median duration {} ({} GB/s) {}% of peak.\n", median_duration,
median_gb_s, 100 * (median_gb_s / max_bw));

return 0;
}
5 changes: 5 additions & 0 deletions include/spblas/algorithms/algorithms.hpp
@@ -8,6 +8,11 @@
#ifndef SPBLAS_VENDOR_BACKEND
#include <spblas/algorithms/multiply_impl.hpp>
#include <spblas/algorithms/triangular_solve_impl.hpp>

#ifdef SPBLAS_ENABLE_SYCL_REFERENCE
#include <spblas/backend/sycl/multiply_impl.hpp>
#endif

#endif

#include <spblas/algorithms/add.hpp>
3 changes: 3 additions & 0 deletions include/spblas/backend/sycl/multiply_impl.hpp
@@ -0,0 +1,3 @@
#pragma once

#include <spblas/backend/sycl/spmm_impl.hpp>