This repository was archived by the owner on Sep 22, 2025. It is now read-only.
Merged
Commits
72 commits
00f1e39
Add initial implementation of sparse matrix in mp
Aug 14, 2024
36d7f38
Fixed row shape calculation
Aug 14, 2024
dd57bb1
Extract matrix format from matrix implementation
Aug 19, 2024
b84ecc0
Add initial gemv implementation
Aug 21, 2024
1c1dad7
Move matrix related files from sp to general module
Aug 21, 2024
2456379
Separated matrix format from mp sparse matrix implementation and adde…
Aug 27, 2024
bd63c2c
Improve matrix loading performance
Aug 28, 2024
b75b7ed
Add sycl support to mp sparse matrices
Sep 3, 2024
756fab1
Added initialization from one node in mp sparse matrix
Sep 4, 2024
0b498ad
Add concept requirement for gemv operation
Sep 4, 2024
bbf2acf
Initial improvement to matrix reading
Sep 4, 2024
9eca244
Add small improvements to matrix loading
Sep 9, 2024
7b55a1b
Fix formatting
Sep 9, 2024
bb2e02e
Add sparse benchmark and broadcasted vector
Sep 17, 2024
18165da
Add benchmarking tools
Sep 17, 2024
94f818e
Add gemv benchmark to gbench
Sep 18, 2024
982a0e0
Add reference gemv implementation
Sep 24, 2024
47a8455
Fixed gemv reference
Sep 24, 2024
a97a97b
Fixed gemv benchmark implementation
Sep 25, 2024
231a09a
Fix band csr generation
Sep 30, 2024
6b8af49
Add support for slim matrix multiplication
Oct 1, 2024
628aa07
Fix benchmark and band csr generation
Oct 1, 2024
2047ecf
Merge branch 'benchmark' of github.com:Xewar313/distributed-ranges in…
Oct 1, 2024
4f12327
Add support to device based computing in distributed sparse matrix
Oct 2, 2024
71bd336
add broadcasted slim matrix device memory support
Oct 2, 2024
6f96929
Fix issue with inconsistent timing when using mp gemv
Oct 7, 2024
08a2247
Some fixes to sparse matrices
Oct 8, 2024
6a4bd30
improve work division in csr eq distribution
Oct 9, 2024
f93961b
Add better work distribution to csr_row_distribution and fix distrib…
Oct 16, 2024
e421523
improve performance on less dense matrices and allow broadcasting big…
Oct 18, 2024
3fa4a68
Reversed change to eq distribution
Oct 18, 2024
0a1a4dc
update some examples and benchmarks
Oct 25, 2024
2beec18
Improved communication in eq distribution
Nov 4, 2024
5edd0ba
Improve equ format on very sparse matrices
Nov 5, 2024
5800f99
Merge branch 'main' into benchmark
Nov 5, 2024
cae67ef
Fix test compilation
Nov 5, 2024
28519e0
Reformat changes in mp matrix #1
Nov 5, 2024
f2c2fbe
Fix and improve tests
Nov 8, 2024
3a00d61
Add tests for sparse gemm in mp
Nov 8, 2024
2ec4b21
Reformat changes in mp matrix #2
Nov 8, 2024
05f5c63
Fix compilation on borealis
Nov 12, 2024
06a6628
fix compilation
Nov 13, 2024
aa706f7
Fix issues with very small and very big matrices
Nov 13, 2024
6e0e9d2
Merge branch 'main' into benchmark
Nov 13, 2024
8f1a2b7
Fix compilation on older OneDpl
Nov 13, 2024
28e023e
Fix style
Nov 13, 2024
e4ee8c7
Merge onedpl fix
Nov 13, 2024
55185dc
Some fixes with versions
Nov 13, 2024
b7704ea
Add local to csr_eq_segment
Nov 20, 2024
4acbad6
Add proper local method
Nov 22, 2024
1f84ba7
Add problem to review
Nov 25, 2024
ba20ee3
Moved local view to distribution
Nov 25, 2024
bad5606
Add new example of not working code
Nov 25, 2024
8e7f1fe
Fix issue with lambda copy
Nov 27, 2024
3503271
Make local work with shared memory
Nov 27, 2024
44a6e78
Fix device memory when using local in row distribution
Nov 29, 2024
2bf503e
Fix local in eq distribution
Dec 2, 2024
dc89bc8
Fix formatting
Dec 2, 2024
7e7f2d2
Reverse change in dr::transform_view
Dec 2, 2024
dd1d6ed
Fix benchmark when default vector size is small
Dec 2, 2024
e42cfa2
Fix issue when distributed vector is too small
Dec 3, 2024
2318a46
Improve performance of eq distribution gather
Dec 3, 2024
4cfb110
Remove unnecessary comment
Dec 3, 2024
04191d7
Add test for reduce and fix type error in sparse matrix local
Dec 9, 2024
adad4f7
Add broadcast_vector tests
Dec 9, 2024
f17243b
Fix formatting
Dec 9, 2024
f1639b0
Corrected gemv matrix creation
Jan 13, 2025
3dfdac0
Fix formatting
Jan 13, 2025
818e848
Fixed PR comments
Jan 17, 2025
5a70436
Fix formatting
Jan 17, 2025
eb66f7a
Fixed format 2
Jan 17, 2025
f9bbc1d
Fix gather call
Jan 17, 2025
3 changes: 2 additions & 1 deletion benchmarks/gbench/mp/CMakeLists.txt
@@ -15,6 +15,7 @@ add_executable(
../common/stream.cpp
streammp.cpp
rooted.cpp
gemv.cpp
stencil_1d.cpp
stencil_2d.cpp
chunk.cpp
@@ -41,7 +42,7 @@ endif()
# mp-quick-bench is for development. By reducing the number of source files, it
# builds much faster. Change the source files to match what you need to test. It
# is OK to commit changes to the source file list.
add_executable(mp-quick-bench mp-bench.cpp ../common/distributed_vector.cpp)
add_executable(mp-quick-bench mp-bench.cpp gemv.cpp)

foreach(mp-bench-exec IN ITEMS mp-bench mp-quick-bench)
target_compile_definitions(${mp-bench-exec} PRIVATE BENCH_MP)
4 changes: 4 additions & 0 deletions benchmarks/gbench/mp/fft3d.cpp
@@ -5,7 +5,11 @@
#include "cxxopts.hpp"
#include "fmt/core.h"
#include "mpi.h"
#if (__INTEL_LLVM_COMPILER >= 20250000)
#include "oneapi/mkl/dft.hpp"
#else
#include "oneapi/mkl/dfti.hpp"
#endif
#include <complex>

#include "dr/mp.hpp"
192 changes: 192 additions & 0 deletions benchmarks/gbench/mp/gemv.cpp
@@ -0,0 +1,192 @@
// SPDX-FileCopyrightText: Intel Corporation
//
// SPDX-License-Identifier: BSD-3-Clause

#include "mpi.h"

#include "../common/dr_bench.hpp"
#include "dr/mp.hpp"
#include <filesystem>
#include <fmt/core.h>
#include <fstream>
#include <random>
#include <sstream>

namespace mp = dr::mp;

namespace {
std::size_t getWidth() {
return 8; // default_vector_size / 100000;
}
} // namespace
static auto getMatrix() {
// The size below is useful when testing weak scaling with the dr-bench
// default vector size: it creates a matrix whose non-zero element count
// grows linearly as default_vector_size increases.
// std::size_t n = std::max(1., std::sqrt(default_vector_size / 100000)) * 50000;

std::size_t density_scalar = 50;

std::size_t n =
std::max(1., std::sqrt(default_vector_size * density_scalar / 2));

std::size_t up = n / density_scalar;
std::size_t down = n / density_scalar;
fmt::print("Generate matrix");
auto tmp = dr::generate_band_csr<double, long>(n, up, down);
fmt::print("generated!");
return tmp;
}

static void GemvEq_DR(benchmark::State &state) {
auto local_data = getMatrix();

mp::distributed_sparse_matrix<
double, long, dr::mp::MpiBackend,
dr::mp::csr_eq_distribution<double, long, dr::mp::MpiBackend>>
m(local_data, 0);
auto n = m.shape()[1];
auto width = getWidth();
std::vector<double> base_a(n * width);
for (int j = 0; j < width; j++) {
for (int i = 0; i < n; i++) {
base_a[i + j * n] = i * j + 1;
}
}
dr::mp::broadcasted_slim_matrix<double> allocated_a;
allocated_a.broadcast_data(n, width, 0, base_a, dr::mp::default_comm());

std::vector<double> res(m.shape().first * width);
gemv(0, res, m, allocated_a);
for (auto _ : state) {
gemv(0, res, m, allocated_a);
}
}

DR_BENCHMARK(GemvEq_DR);

static void GemvRow_DR(benchmark::State &state) {
auto local_data = getMatrix();

mp::distributed_sparse_matrix<
double, long, dr::mp::MpiBackend,
dr::mp::csr_row_distribution<double, long, dr::mp::MpiBackend>>
m(local_data, 0);
auto n = m.shape()[1];
auto width = getWidth();
std::vector<double> base_a(n * width);
for (int j = 0; j < width; j++) {
for (int i = 0; i < n; i++) {
base_a[i + j * n] = i * j + 1;
}
}
dr::mp::broadcasted_slim_matrix<double> allocated_a;
allocated_a.broadcast_data(n, width, 0, base_a, dr::mp::default_comm());

std::vector<double> res(m.shape().first * width);
gemv(0, res, m, allocated_a);
for (auto _ : state) {
gemv(0, res, m, allocated_a);
}
}

DR_BENCHMARK(GemvRow_DR);

static void Gemv_Reference(benchmark::State &state) {
auto local_data = getMatrix();
auto nnz_count = local_data.size();
auto band_shape = local_data.shape();
auto q = get_queue();
auto policy = oneapi::dpl::execution::make_device_policy(q);
auto val_ptr = sycl::malloc_device<double>(nnz_count, q);
auto col_ptr = sycl::malloc_device<long>(nnz_count, q);
auto row_ptr = sycl::malloc_device<long>((band_shape[0] + 1), q);
std::vector<double> b;
auto width = getWidth();
for (auto i = 0; i < band_shape[1] * width; i++) {
b.push_back(i);
}
double *elems = new double[band_shape[0] * width];
auto input = sycl::malloc_device<double>(band_shape[1] * width, q);
auto output = sycl::malloc_device<double>(band_shape[0] * width, q);
q.memcpy(val_ptr, local_data.values_data(), nnz_count * sizeof(double))
.wait();
q.memcpy(col_ptr, local_data.colind_data(), nnz_count * sizeof(long)).wait();
q.memcpy(row_ptr, local_data.rowptr_data(),
(band_shape[0] + 1) * sizeof(long))
.wait();
q.fill(output, 0.0, band_shape[0] * width).wait();
std::copy(policy, b.begin(), b.end(), input);

auto wg = 32;
while (width * band_shape[0] * wg > INT_MAX) {
wg /= 2;
}
assert(wg > 0);

for (auto _ : state) {
if (dr::mp::use_sycl()) {
dr::mp::sycl_queue()
.submit([&](auto &&h) {
h.parallel_for(
sycl::nd_range<1>(width * band_shape[0] * wg, wg),
[=](auto item) {
auto input_j = item.get_group(0) / band_shape[0];
auto idx = item.get_group(0) % band_shape[0];
auto local_id = item.get_local_id();
auto group_size = item.get_local_range(0);
double sum = 0;
auto start = row_ptr[idx];
auto end = row_ptr[idx + 1];
for (auto i = start + local_id; i < end; i += group_size) {
auto colNum = col_ptr[i];
auto vectorVal = input[colNum + input_j * band_shape[1]];
auto matrixVal = val_ptr[i];
sum += matrixVal * vectorVal;
}
sycl::atomic_ref<double, sycl::memory_order::relaxed,
sycl::memory_scope::device>
c_ref(output[idx + band_shape[0] * input_j]);
c_ref += sum;
});
})
.wait();
q.memcpy(elems, output, band_shape[0] * sizeof(double) * width).wait();
} else {
std::fill(elems, elems + band_shape[0] * width, 0);
auto local_rows = local_data.rowptr_data();
auto row_i = 0;
auto current_row_position = local_rows[1];

for (int i = 0; i < nnz_count; i++) {
while (row_i + 1 < band_shape[0] && i >= current_row_position) {
row_i++;
current_row_position = local_rows[row_i + 1];
}
for (auto j = 0; j < width; j++) {
auto item_id = row_i + j * band_shape[0];
auto val_index = local_data.colind_data()[i] + j * band_shape[0];
auto value = b[val_index];
auto matrix_value = local_data.values_data()[i];
elems[item_id] += matrix_value * value;
}
}
}
}
delete[] elems;
sycl::free(val_ptr, q);
sycl::free(col_ptr, q);
sycl::free(row_ptr, q);
sycl::free(input, q);
sycl::free(output, q);
}

static void GemvEq_Reference(benchmark::State &state) { Gemv_Reference(state); }

static void GemvRow_Reference(benchmark::State &state) {
Gemv_Reference(state);
}

DR_BENCHMARK(GemvEq_Reference);

DR_BENCHMARK(GemvRow_Reference);
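
For readers less familiar with the CSR layout that the reference kernel above parallelizes, the following is a minimal standalone sketch of the serial gemv it corresponds to for a single right-hand-side column; the function and parameter names are illustrative only and are not part of the distributed-ranges API.

// Minimal serial CSR gemv sketch: y[r] accumulates A(r, c) * x[c] over the
// nonzeros stored in row r. All names here are illustrative, not library API.
#include <cstddef>
#include <vector>

void csr_gemv(const std::vector<double> &values, // one entry per nonzero
              const std::vector<long> &colind,   // column index per nonzero
              const std::vector<long> &rowptr,   // size rows + 1
              const std::vector<double> &x,      // dense input, size columns
              std::vector<double> &y) {          // dense output, size rows
  std::size_t rows = rowptr.size() - 1;
  for (std::size_t r = 0; r < rows; r++) {
    double sum = 0;
    for (long i = rowptr[r]; i < rowptr[r + 1]; i++) {
      sum += values[i] * x[colind[i]];
    }
    y[r] = sum;
  }
}

The SYCL kernel in Gemv_Reference splits each row's inner loop across a work-group and accumulates the partial sums through an atomic_ref, which is why the output buffer is zero-filled before the benchmark loop.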
4 changes: 4 additions & 0 deletions benchmarks/gbench/sp/fft3d.cpp
@@ -3,7 +3,11 @@
// SPDX-License-Identifier: BSD-3-Clause

#include "cxxopts.hpp"
#if (__INTEL_LLVM_COMPILER >= 20250000)
#include "oneapi/mkl/dft.hpp"
#else
#include "oneapi/mkl/dfti.hpp"
#endif
#include <complex>
#include <dr/sp.hpp>
#include <fmt/core.h>
9 changes: 8 additions & 1 deletion examples/mp/CMakeLists.txt
@@ -16,16 +16,23 @@ add_executable(vector-add vector-add.cpp)
target_link_libraries(vector-add DR::mpi)
add_mp_ctest(TEST_NAME vector-add NAME vector-add NPROC 2)

function(add_mp_example example_name)
function(add_mp_example_no_test example_name)
add_executable(${example_name} ${example_name}.cpp)
target_link_libraries(${example_name} cxxopts DR::mpi)
endfunction()

function(add_mp_example example_name)
add_mp_example_no_test(${example_name})
add_mp_ctest(TEST_NAME ${example_name} NAME ${example_name} NPROC 2)
endfunction()

add_mp_example(stencil-1d)
add_mp_example(stencil-1d-array)
add_mp_example(stencil-1d-pointer)
add_mp_example(hello_world)
add_mp_example_no_test(sparse_matrix)
add_mp_example_no_test(sparse_benchmark)
add_mp_example_no_test(sparse_matrix_matrix_mul)

if(OpenMP_FOUND)
add_executable(vector-add-ref vector-add-ref.cpp)
130 changes: 130 additions & 0 deletions examples/mp/sparse_benchmark.cpp
@@ -0,0 +1,130 @@
// SPDX-FileCopyrightText: Intel Corporation
//
// SPDX-License-Identifier: BSD-3-Clause

#include <dr/mp.hpp>
#include <filesystem>
#include <fmt/core.h>
#include <fstream>
#include <random>
#include <sstream>

namespace mp = dr::mp;

MPI_Comm comm;
int comm_rank;
int comm_size;

int main(int argc, char **argv) {

MPI_Init(&argc, &argv);
comm = MPI_COMM_WORLD;
MPI_Comm_rank(comm, &comm_rank);
MPI_Comm_size(comm, &comm_size);

if (argc != 3 && argc != 5) {
fmt::print("usage: ./sparse_benchmark [test outcome dir] [matrix market "
"file], or ./sparse_benchmark [test outcome dir] [number of "
"rows] [number of columns] [density]\n");
return 1;
}

#ifdef SYCL_LANGUAGE_VERSION
sycl::queue q = dr::mp::select_queue();
mp::init(q);
#else
mp::init();
#endif
dr::views::csr_matrix_view<double, long> local_data;
std::stringstream filenamestream;
auto root = 0;
auto computeSize = dr::mp::default_comm().size();
if (root == dr::mp::default_comm().rank()) {
if (argc == 5) {
fmt::print("started loading\n");
auto n = std::stoul(argv[2]);
auto up = std::stoul(argv[3]);
auto down = std::stoul(argv[4]);
// local_data = dr::generate_random_csr<double, long>({n, m}, density,
// 42);
local_data = dr::generate_band_csr<double, long>(n, up, down);
filenamestream << "mp_band_" << computeSize << "_" << n << "_"
<< up + down << "_" << local_data.size();
fmt::print("finished loading\n");
} else {
fmt::print("started loading\n");
std::string fname(argv[2]);
std::filesystem::path p(argv[2]);
local_data = dr::read_csr<double, long>(fname);
filenamestream << "mp_" << p.stem().string() << "_" << computeSize << "_"
<< local_data.size();
fmt::print("finished loading\n");
}
}
std::string resname;
mp::distributed_sparse_matrix<
double, long, dr::mp::MpiBackend,
dr::mp::csr_eq_distribution<double, long, dr::mp::MpiBackend>>
m_eq(local_data, root);
mp::distributed_sparse_matrix<
double, long, dr::mp::MpiBackend,
dr::mp::csr_row_distribution<double, long, dr::mp::MpiBackend>>
m_row(local_data, root);
fmt::print("finished distribution\n");
std::vector<double> eq_duration;
std::vector<double> row_duration;

auto N = 10;
std::vector<double> b;
b.reserve(m_row.shape().second);
std::vector<double> res(m_row.shape().first);
for (auto i = 0; i < m_row.shape().second; i++) {
b.push_back(i);
}

dr::mp::broadcasted_vector<double> allocated_b;
allocated_b.broadcast_data(m_row.shape().second, 0, b,
dr::mp::default_comm());

fmt::print("started initial gemv distribution\n");
gemv(0, res, m_eq, allocated_b); // it is here to prepare sycl for work

fmt::print("finished initial gemv distribution\n");
for (auto i = 0; i < N; i++) {
auto begin = std::chrono::high_resolution_clock::now();
gemv(0, res, m_eq, allocated_b);
auto end = std::chrono::high_resolution_clock::now();
double duration = std::chrono::duration<double>(end - begin).count() * 1000;
eq_duration.push_back(duration);
}

gemv(0, res, m_row, allocated_b); // it is here to prepare sycl for work
for (auto i = 0; i < N; i++) {
auto begin = std::chrono::high_resolution_clock::now();
gemv(0, res, m_row, allocated_b);
auto end = std::chrono::high_resolution_clock::now();
double duration = std::chrono::duration<double>(end - begin).count() * 1000;
row_duration.push_back(duration);
}

if (root == dr::mp::default_comm().rank()) {
std::string tmp;
filenamestream >> tmp;
std::filesystem::path p(argv[1]);
p += tmp;
p += ".csv";
std::ofstream write_stream(p.string());
write_stream << eq_duration.front();
for (auto i = 1; i < N; i++) {
write_stream << "," << eq_duration[i];
}
write_stream << "\n";
write_stream << row_duration.front();
for (auto i = 1; i < N; i++) {
write_stream << "," << row_duration[i];
}
write_stream << "\n";
}
allocated_b.destroy_data();
mp::finalize();
}
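
Both benchmark programs build their input with dr::generate_band_csr<double, long>(n, up, down). As background only, the sketch below shows how a band matrix with up super-diagonals and down sub-diagonals can be laid out in CSR form; it illustrates the format under that assumption rather than the library's actual implementation, and the struct and function names are made up.

// Illustrative band-matrix-to-CSR construction (not dr::generate_band_csr).
#include <algorithm>
#include <cstddef>
#include <vector>

struct csr_data {
  std::vector<double> values;
  std::vector<long> colind;
  std::vector<long> rowptr; // size n + 1
};

csr_data make_band_csr(std::size_t n, std::size_t up, std::size_t down) {
  csr_data m;
  m.rowptr.push_back(0);
  for (std::size_t r = 0; r < n; r++) {
    std::size_t first = r >= down ? r - down : 0;
    std::size_t last = std::min(n - 1, r + up);
    for (std::size_t c = first; c <= last; c++) {
      m.values.push_back(1.0); // placeholder nonzero value
      m.colind.push_back(static_cast<long>(c));
    }
    m.rowptr.push_back(static_cast<long>(m.values.size()));
  }
  return m;
}

For n much larger than up + down, the nonzero count is roughly n * (up + down + 1), which is how getMatrix() in gemv.cpp keeps the nonzero count growing roughly linearly with default_vector_size.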