This repository was archived by the owner on Sep 22, 2025. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 16
Add sparse matrix to mp #860
Merged
Merged
Changes from all commits
Commits
Show all changes
72 commits
Select commit
Hold shift + click to select a range
00f1e39
Add initial implementation of sparse matrix in mp
36d7f38
Fixed row shape calculation
dd57bb1
Extract matrix format from matrix implementation
b84ecc0
Add initial gemv implementation
1c1dad7
Move matrix related files from sp to general module
2456379
Separated matrix format from mp sparse matrix implementation and adde…
bd63c2c
Improve matrix loading performance
b75b7ed
Add sycl support to mp sparse matrixes
756fab1
Added initialization from one node in mp sparse matrix
0b498ad
Add concept requirement for gemv operation
bbf2acf
Initial improvement to matrix reading
9eca244
Add small improvements to matrix loading
7b55a1b
Fix formatting
bb2e02e
Add sparse benchmark and broadcasted vector
18165da
Add benchmarking tools
94f818e
Add gemv benchmark to gbench
982a0e0
Add reference gemv implementation
47a8455
Fixed gemv reference
a97a97b
Fixed gemv benchmark implementation
231a09a
Fix band csr generation
6b8af49
Add support for slim matrix multiplication
628aa07
Fix benchmark and band csr generation
2047ecf
Merge branch 'benchmark' of github.com:Xewar313/distributed-ranges in…
4f12327
Add support to device based computing in distributed sparse matrix
71bd336
add broadcasted slim matrix device memory support
6f96929
Fix issue with inconsistent timing when using mp gemv
08a2247
Some fixes to sparse matrixes
6a4bd30
improve work division in csr eq distribution
f93961b
Add better work distribution to csr_row_distiribution and fix distrib…
e421523
improve performance on less dense matrices and allow broadcasting big…
3fa4a68
Reversed change to eq distribution
0a1a4dc
update some examples and benchmarks
2beec18
Improved communication in eq distribution
5edd0ba
Improve equ format on very sparse matrices
5800f99
Merge branch 'main' into benchmark
cae67ef
Fix test compilation
28519e0
Reformat changes in mp matrix #1
f2c2fbe
Fix and improve tests
3a00d61
Add tests for sparse gemm in mp
2ec4b21
Reformat changes in mp matrix #2
05f5c63
Fix compilation on borealis
06a6628
fix compilation
aa706f7
Fix issues with very small and very big matrices
6e0e9d2
Merge branch 'main' into benchmark
8f1a2b7
Fix compilation on older OneDpl
28e023e
Fix style
e4ee8c7
Merge onedpl fix
55185dc
Some fixes with verions
b7704ea
Add local to csr_eq_segment
4acbad6
Add proper local method
1f84ba7
Add problem to review
ba20ee3
Moved local view to distribution
bad5606
Add new example of not working code
8e7f1fe
Fix issue with lambda copy
3503271
Make local work with shared memory
44a6e78
Fix device memory when using local in row distribution
2bf503e
Fix local in eq distribution
dc89bc8
Fix formatting
7e7f2d2
Reverse change in dr::transform_view
dd1d6ed
Fix benchmark when default vector size is small
e42cfa2
Fix issue when distributed vector is too small
2318a46
Improve performance of eq distribution gather
4cfb110
Remove unneccessary comment
04191d7
Add test for reduce and fix type error in sparse matrix local
adad4f7
Add broadcast_vector tests
f17243b
Fix formatting
f1639b0
Corrected gemv matrix creation
3dfdac0
Fix formatting
818e848
Fixed PR comments
5a70436
Fix formatting
eb66f7a
Fixed format 2
f9bbc1d
Fix gather call
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,192 @@ | ||
// SPDX-FileCopyrightText: Intel Corporation | ||
// | ||
// SPDX-License-Identifier: BSD-3-Clause | ||
|
||
#include "mpi.h" | ||
|
||
#include "../common/dr_bench.hpp" | ||
#include "dr/mp.hpp" | ||
#include <filesystem> | ||
#include <fmt/core.h> | ||
#include <fstream> | ||
#include <random> | ||
#include <sstream> | ||
|
||
namespace mp = dr::mp; | ||
|
||
namespace { | ||
std::size_t getWidth() { | ||
return 8; // default_vector_size / 100000; | ||
} | ||
} // namespace | ||
static auto getMatrix() { | ||
// size below is useful when testing weak scaling with default vector size | ||
// using dr-bench it creates matrix which non-zero element count increases | ||
// linearly when we increase default_vector_size std::size_t n = std::max(1., | ||
// std::sqrt(default_vector_size / 100000)) * 50000; | ||
|
||
std::size_t density_scalar = 50; | ||
|
||
std::size_t n = | ||
std::max(1., std::sqrt(default_vector_size * density_scalar / 2)); | ||
|
||
std::size_t up = n / density_scalar; | ||
std::size_t down = n / density_scalar; | ||
fmt::print("Generate matrix"); | ||
auto tmp = dr::generate_band_csr<double, long>(n, up, down); | ||
fmt::print("generated!"); | ||
return tmp; | ||
} | ||
|
||
static void GemvEq_DR(benchmark::State &state) { | ||
auto local_data = getMatrix(); | ||
|
||
mp::distributed_sparse_matrix< | ||
double, long, dr::mp::MpiBackend, | ||
dr::mp::csr_eq_distribution<double, long, dr::mp::MpiBackend>> | ||
m(local_data, 0); | ||
auto n = m.shape()[1]; | ||
auto width = getWidth(); | ||
std::vector<double> base_a(n * width); | ||
for (int j = 0; j < width; j++) { | ||
for (int i = 0; i < n; i++) { | ||
base_a[i + j * n] = i * j + 1; | ||
} | ||
} | ||
dr::mp::broadcasted_slim_matrix<double> allocated_a; | ||
allocated_a.broadcast_data(n, width, 0, base_a, dr::mp::default_comm()); | ||
|
||
std::vector<double> res(m.shape().first * width); | ||
gemv(0, res, m, allocated_a); | ||
for (auto _ : state) { | ||
gemv(0, res, m, allocated_a); | ||
} | ||
} | ||
|
||
DR_BENCHMARK(GemvEq_DR); | ||
|
||
static void GemvRow_DR(benchmark::State &state) { | ||
auto local_data = getMatrix(); | ||
|
||
mp::distributed_sparse_matrix< | ||
double, long, dr::mp::MpiBackend, | ||
dr::mp::csr_row_distribution<double, long, dr::mp::MpiBackend>> | ||
m(local_data, 0); | ||
auto n = m.shape()[1]; | ||
auto width = getWidth(); | ||
std::vector<double> base_a(n * width); | ||
for (int j = 0; j < width; j++) { | ||
for (int i = 0; i < n; i++) { | ||
base_a[i + j * n] = i * j + 1; | ||
} | ||
} | ||
dr::mp::broadcasted_slim_matrix<double> allocated_a; | ||
allocated_a.broadcast_data(n, width, 0, base_a, dr::mp::default_comm()); | ||
|
||
std::vector<double> res(m.shape().first * width); | ||
gemv(0, res, m, allocated_a); | ||
for (auto _ : state) { | ||
gemv(0, res, m, allocated_a); | ||
} | ||
} | ||
|
||
DR_BENCHMARK(GemvRow_DR); | ||
|
||
static void Gemv_Reference(benchmark::State &state) { | ||
auto local_data = getMatrix(); | ||
auto nnz_count = local_data.size(); | ||
auto band_shape = local_data.shape(); | ||
auto q = get_queue(); | ||
auto policy = oneapi::dpl::execution::make_device_policy(q); | ||
auto val_ptr = sycl::malloc_device<double>(nnz_count, q); | ||
auto col_ptr = sycl::malloc_device<long>(nnz_count, q); | ||
auto row_ptr = sycl::malloc_device<long>((band_shape[0] + 1), q); | ||
std::vector<double> b; | ||
auto width = getWidth(); | ||
for (auto i = 0; i < band_shape[1] * width; i++) { | ||
b.push_back(i); | ||
} | ||
double *elems = new double[band_shape[0] * width]; | ||
auto input = sycl::malloc_device<double>(band_shape[1] * width, q); | ||
auto output = sycl::malloc_device<double>(band_shape[0] * width, q); | ||
q.memcpy(val_ptr, local_data.values_data(), nnz_count * sizeof(double)) | ||
.wait(); | ||
q.memcpy(col_ptr, local_data.colind_data(), nnz_count * sizeof(long)).wait(); | ||
q.memcpy(row_ptr, local_data.rowptr_data(), | ||
(band_shape[0] + 1) * sizeof(long)) | ||
.wait(); | ||
q.fill(output, 0, band_shape[0] * width); | ||
std::copy(policy, b.begin(), b.end(), input); | ||
|
||
auto wg = 32; | ||
while (width * band_shape[0] * wg > INT_MAX) { | ||
wg /= 2; | ||
} | ||
assert(wg > 0); | ||
|
||
for (auto _ : state) { | ||
if (dr::mp::use_sycl()) { | ||
dr::mp::sycl_queue() | ||
.submit([&](auto &&h) { | ||
h.parallel_for( | ||
sycl::nd_range<1>(width * band_shape[0] * wg, wg), | ||
[=](auto item) { | ||
auto input_j = item.get_group(0) / band_shape[0]; | ||
auto idx = item.get_group(0) % band_shape[0]; | ||
auto local_id = item.get_local_id(); | ||
auto group_size = item.get_local_range(0); | ||
double sum = 0; | ||
auto start = row_ptr[idx]; | ||
auto end = row_ptr[idx + 1]; | ||
for (auto i = start + local_id; i < end; i += group_size) { | ||
auto colNum = col_ptr[i]; | ||
auto vectorVal = input[colNum + input_j * band_shape[1]]; | ||
auto matrixVal = val_ptr[i]; | ||
sum += matrixVal * vectorVal; | ||
} | ||
sycl::atomic_ref<double, sycl::memory_order::relaxed, | ||
sycl::memory_scope::device> | ||
c_ref(output[idx + band_shape[0] * input_j]); | ||
c_ref += sum; | ||
}); | ||
}) | ||
.wait(); | ||
q.memcpy(elems, output, band_shape[0] * sizeof(double) * width).wait(); | ||
} else { | ||
std::fill(elems, elems + band_shape[0] * width, 0); | ||
auto local_rows = local_data.rowptr_data(); | ||
auto row_i = 0; | ||
auto current_row_position = local_rows[1]; | ||
|
||
for (int i = 0; i < nnz_count; i++) { | ||
while (row_i + 1 < band_shape[0] && i >= current_row_position) { | ||
row_i++; | ||
current_row_position = local_rows[row_i + 1]; | ||
} | ||
for (auto j = 0; j < width; j++) { | ||
auto item_id = row_i + j * band_shape[0]; | ||
auto val_index = local_data.colind_data()[i] + j * band_shape[0]; | ||
auto value = b[val_index]; | ||
auto matrix_value = local_data.values_data()[i]; | ||
elems[item_id] += matrix_value * value; | ||
} | ||
} | ||
} | ||
} | ||
delete[] elems; | ||
sycl::free(val_ptr, q); | ||
sycl::free(col_ptr, q); | ||
sycl::free(row_ptr, q); | ||
sycl::free(input, q); | ||
sycl::free(output, q); | ||
} | ||
|
||
static void GemvEq_Reference(benchmark::State &state) { Gemv_Reference(state); } | ||
|
||
static void GemvRow_Reference(benchmark::State &state) { | ||
Gemv_Reference(state); | ||
} | ||
|
||
DR_BENCHMARK(GemvEq_Reference); | ||
|
||
DR_BENCHMARK(GemvRow_Reference); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,130 @@ | ||
// SPDX-FileCopyrightText: Intel Corporation | ||
// | ||
// SPDX-License-Identifier: BSD-3-Clause | ||
|
||
#include <dr/mp.hpp> | ||
#include <filesystem> | ||
#include <fmt/core.h> | ||
#include <fstream> | ||
#include <random> | ||
#include <sstream> | ||
|
||
namespace mp = dr::mp; | ||
|
||
MPI_Comm comm; | ||
int comm_rank; | ||
int comm_size; | ||
|
||
int main(int argc, char **argv) { | ||
|
||
MPI_Init(&argc, &argv); | ||
comm = MPI_COMM_WORLD; | ||
MPI_Comm_rank(comm, &comm_rank); | ||
MPI_Comm_size(comm, &comm_size); | ||
|
||
if (argc != 3 && argc != 5) { | ||
fmt::print("usage: ./sparse_benchmark [test outcome dir] [matrix market " | ||
"file], or ./sparse_benchmark [test outcome dir] [number of " | ||
"rows] [number of columns] [density]\n"); | ||
return 1; | ||
} | ||
|
||
#ifdef SYCL_LANGUAGE_VERSION | ||
sycl::queue q = dr::mp::select_queue(); | ||
mp::init(q); | ||
#else | ||
mp::init(); | ||
#endif | ||
dr::views::csr_matrix_view<double, long> local_data; | ||
std::stringstream filenamestream; | ||
auto root = 0; | ||
auto computeSize = dr::mp::default_comm().size(); | ||
if (root == dr::mp::default_comm().rank()) { | ||
if (argc == 5) { | ||
fmt::print("started loading\n"); | ||
auto n = std::stoul(argv[2]); | ||
auto up = std::stoul(argv[3]); | ||
auto down = std::stoul(argv[4]); | ||
// local_data = dr::generate_random_csr<double, long>({n, m}, density, | ||
// 42); | ||
local_data = dr::generate_band_csr<double, long>(n, up, down); | ||
filenamestream << "mp_band_" << computeSize << "_" << n << "_" | ||
<< up + down << "_" << local_data.size(); | ||
fmt::print("finished loading\n"); | ||
} else { | ||
fmt::print("started loading\n"); | ||
std::string fname(argv[2]); | ||
std::filesystem::path p(argv[2]); | ||
local_data = dr::read_csr<double, long>(fname); | ||
filenamestream << "mp_" << p.stem().string() << "_" << computeSize << "_" | ||
<< local_data.size(); | ||
fmt::print("finished loading\n"); | ||
} | ||
} | ||
std::string resname; | ||
mp::distributed_sparse_matrix< | ||
double, long, dr::mp::MpiBackend, | ||
dr::mp::csr_eq_distribution<double, long, dr::mp::MpiBackend>> | ||
m_eq(local_data, root); | ||
mp::distributed_sparse_matrix< | ||
double, long, dr::mp::MpiBackend, | ||
dr::mp::csr_row_distribution<double, long, dr::mp::MpiBackend>> | ||
m_row(local_data, root); | ||
fmt::print("finished distribution\n"); | ||
std::vector<double> eq_duration; | ||
std::vector<double> row_duration; | ||
|
||
auto N = 10; | ||
std::vector<double> b; | ||
b.reserve(m_row.shape().second); | ||
std::vector<double> res(m_row.shape().first); | ||
for (auto i = 0; i < m_row.shape().second; i++) { | ||
b.push_back(i); | ||
} | ||
|
||
dr::mp::broadcasted_vector<double> allocated_b; | ||
allocated_b.broadcast_data(m_row.shape().second, 0, b, | ||
dr::mp::default_comm()); | ||
|
||
fmt::print("started initial gemv distribution\n"); | ||
gemv(0, res, m_eq, allocated_b); // it is here to prepare sycl for work | ||
|
||
fmt::print("finished initial gemv distribution\n"); | ||
for (auto i = 0; i < N; i++) { | ||
auto begin = std::chrono::high_resolution_clock::now(); | ||
gemv(0, res, m_eq, allocated_b); | ||
auto end = std::chrono::high_resolution_clock::now(); | ||
double duration = std::chrono::duration<double>(end - begin).count() * 1000; | ||
eq_duration.push_back(duration); | ||
} | ||
|
||
gemv(0, res, m_row, allocated_b); // it is here to prepare sycl for work | ||
for (auto i = 0; i < N; i++) { | ||
auto begin = std::chrono::high_resolution_clock::now(); | ||
gemv(0, res, m_row, allocated_b); | ||
auto end = std::chrono::high_resolution_clock::now(); | ||
double duration = std::chrono::duration<double>(end - begin).count() * 1000; | ||
row_duration.push_back(duration); | ||
} | ||
|
||
if (root == dr::mp::default_comm().rank()) { | ||
std::string tmp; | ||
filenamestream >> tmp; | ||
std::filesystem::path p(argv[1]); | ||
p += tmp; | ||
p += ".csv"; | ||
std::ofstream write_stream(p.string()); | ||
write_stream << eq_duration.front(); | ||
for (auto i = 1; i < N; i++) { | ||
write_stream << "," << eq_duration[i]; | ||
} | ||
write_stream << "\n"; | ||
write_stream << row_duration.front(); | ||
for (auto i = 1; i < N; i++) { | ||
write_stream << "," << row_duration[i]; | ||
} | ||
write_stream << "\n"; | ||
} | ||
allocated_b.destroy_data(); | ||
mp::finalize(); | ||
} |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.