
Commit 8edaccf

Add sparse matrix to mp (#860)
1 parent 7cc447d commit 8edaccf


45 files changed (+3520, -430 lines)
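
For orientation, the core API this commit introduces, condensed into a minimal sketch from the examples/mp/sparse_benchmark.cpp file added below (illustrative only; the band-matrix size is a placeholder, and error handling, SYCL queue selection, and timing are omitted):

// Minimal usage sketch of the new distributed sparse matrix support,
// condensed from examples/mp/sparse_benchmark.cpp (added in this commit).
// The band-matrix size (1000, 2, 2) is an arbitrary illustrative choice.
#include "mpi.h"

#include <dr/mp.hpp>
#include <vector>

namespace mp = dr::mp;

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  mp::init();

  // Rank `root` owns the CSR data; the matrix constructor distributes it.
  dr::views::csr_matrix_view<double, long> local_data;
  auto root = 0;
  if (root == mp::default_comm().rank()) {
    local_data = dr::generate_band_csr<double, long>(1000, 2, 2);
  }
  mp::distributed_sparse_matrix<
      double, long, mp::MpiBackend,
      mp::csr_eq_distribution<double, long, mp::MpiBackend>>
      m(local_data, root);

  // Broadcast a dense right-hand side and compute res = m * b.
  std::vector<double> b(m.shape().second, 1.0);
  mp::broadcasted_vector<double> allocated_b;
  allocated_b.broadcast_data(m.shape().second, root, b, mp::default_comm());
  std::vector<double> res(m.shape().first);
  gemv(root, res, m, allocated_b);

  allocated_b.destroy_data();
  mp::finalize();
}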

benchmarks/gbench/mp/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
@@ -15,6 +15,7 @@ add_executable(
   ../common/stream.cpp
   streammp.cpp
   rooted.cpp
+  gemv.cpp
   stencil_1d.cpp
   stencil_2d.cpp
   chunk.cpp
@@ -41,7 +42,7 @@ endif()
 # mp-quick-bench is for development. By reducing the number of source files, it
 # builds much faster. Change the source files to match what you need to test. It
 # is OK to commit changes to the source file list.
-add_executable(mp-quick-bench mp-bench.cpp ../common/distributed_vector.cpp)
+add_executable(mp-quick-bench mp-bench.cpp gemv.cpp)

 foreach(mp-bench-exec IN ITEMS mp-bench mp-quick-bench)
   target_compile_definitions(${mp-bench-exec} PRIVATE BENCH_MP)

benchmarks/gbench/mp/gemv.cpp

Lines changed: 192 additions & 0 deletions
@@ -0,0 +1,192 @@
// SPDX-FileCopyrightText: Intel Corporation
//
// SPDX-License-Identifier: BSD-3-Clause

#include "mpi.h"

#include "../common/dr_bench.hpp"
#include "dr/mp.hpp"
#include <filesystem>
#include <fmt/core.h>
#include <fstream>
#include <random>
#include <sstream>

namespace mp = dr::mp;

namespace {
std::size_t getWidth() {
  return 8; // default_vector_size / 100000;
}
} // namespace
static auto getMatrix() {
  // size below is useful when testing weak scaling with default vector size
  // using dr-bench it creates matrix which non-zero element count increases
  // linearly when we increase default_vector_size std::size_t n = std::max(1.,
  // std::sqrt(default_vector_size / 100000)) * 50000;

  std::size_t density_scalar = 50;

  std::size_t n =
      std::max(1., std::sqrt(default_vector_size * density_scalar / 2));

  std::size_t up = n / density_scalar;
  std::size_t down = n / density_scalar;
  fmt::print("Generate matrix");
  auto tmp = dr::generate_band_csr<double, long>(n, up, down);
  fmt::print("generated!");
  return tmp;
}

static void GemvEq_DR(benchmark::State &state) {
  auto local_data = getMatrix();

  mp::distributed_sparse_matrix<
      double, long, dr::mp::MpiBackend,
      dr::mp::csr_eq_distribution<double, long, dr::mp::MpiBackend>>
      m(local_data, 0);
  auto n = m.shape()[1];
  auto width = getWidth();
  std::vector<double> base_a(n * width);
  for (int j = 0; j < width; j++) {
    for (int i = 0; i < n; i++) {
      base_a[i + j * n] = i * j + 1;
    }
  }
  dr::mp::broadcasted_slim_matrix<double> allocated_a;
  allocated_a.broadcast_data(n, width, 0, base_a, dr::mp::default_comm());

  std::vector<double> res(m.shape().first * width);
  gemv(0, res, m, allocated_a);
  for (auto _ : state) {
    gemv(0, res, m, allocated_a);
  }
}

DR_BENCHMARK(GemvEq_DR);

static void GemvRow_DR(benchmark::State &state) {
  auto local_data = getMatrix();

  mp::distributed_sparse_matrix<
      double, long, dr::mp::MpiBackend,
      dr::mp::csr_row_distribution<double, long, dr::mp::MpiBackend>>
      m(local_data, 0);
  auto n = m.shape()[1];
  auto width = getWidth();
  std::vector<double> base_a(n * width);
  for (int j = 0; j < width; j++) {
    for (int i = 0; i < n; i++) {
      base_a[i + j * n] = i * j + 1;
    }
  }
  dr::mp::broadcasted_slim_matrix<double> allocated_a;
  allocated_a.broadcast_data(n, width, 0, base_a, dr::mp::default_comm());

  std::vector<double> res(m.shape().first * width);
  gemv(0, res, m, allocated_a);
  for (auto _ : state) {
    gemv(0, res, m, allocated_a);
  }
}

DR_BENCHMARK(GemvRow_DR);

static void Gemv_Reference(benchmark::State &state) {
  auto local_data = getMatrix();
  auto nnz_count = local_data.size();
  auto band_shape = local_data.shape();
  auto q = get_queue();
  auto policy = oneapi::dpl::execution::make_device_policy(q);
  auto val_ptr = sycl::malloc_device<double>(nnz_count, q);
  auto col_ptr = sycl::malloc_device<long>(nnz_count, q);
  auto row_ptr = sycl::malloc_device<long>((band_shape[0] + 1), q);
  std::vector<double> b;
  auto width = getWidth();
  for (auto i = 0; i < band_shape[1] * width; i++) {
    b.push_back(i);
  }
  double *elems = new double[band_shape[0] * width];
  auto input = sycl::malloc_device<double>(band_shape[1] * width, q);
  auto output = sycl::malloc_device<double>(band_shape[0] * width, q);
  q.memcpy(val_ptr, local_data.values_data(), nnz_count * sizeof(double))
      .wait();
  q.memcpy(col_ptr, local_data.colind_data(), nnz_count * sizeof(long)).wait();
  q.memcpy(row_ptr, local_data.rowptr_data(),
           (band_shape[0] + 1) * sizeof(long))
      .wait();
  q.fill(output, 0, band_shape[0] * width);
  std::copy(policy, b.begin(), b.end(), input);

  auto wg = 32;
  while (width * band_shape[0] * wg > INT_MAX) {
    wg /= 2;
  }
  assert(wg > 0);

  for (auto _ : state) {
    if (dr::mp::use_sycl()) {
      dr::mp::sycl_queue()
          .submit([&](auto &&h) {
            h.parallel_for(
                sycl::nd_range<1>(width * band_shape[0] * wg, wg),
                [=](auto item) {
                  auto input_j = item.get_group(0) / band_shape[0];
                  auto idx = item.get_group(0) % band_shape[0];
                  auto local_id = item.get_local_id();
                  auto group_size = item.get_local_range(0);
                  double sum = 0;
                  auto start = row_ptr[idx];
                  auto end = row_ptr[idx + 1];
                  for (auto i = start + local_id; i < end; i += group_size) {
                    auto colNum = col_ptr[i];
                    auto vectorVal = input[colNum + input_j * band_shape[1]];
                    auto matrixVal = val_ptr[i];
                    sum += matrixVal * vectorVal;
                  }
                  sycl::atomic_ref<double, sycl::memory_order::relaxed,
                                   sycl::memory_scope::device>
                      c_ref(output[idx + band_shape[0] * input_j]);
                  c_ref += sum;
                });
          })
          .wait();
      q.memcpy(elems, output, band_shape[0] * sizeof(double) * width).wait();
    } else {
      std::fill(elems, elems + band_shape[0] * width, 0);
      auto local_rows = local_data.rowptr_data();
      auto row_i = 0;
      auto current_row_position = local_rows[1];

      for (int i = 0; i < nnz_count; i++) {
        while (row_i + 1 < band_shape[0] && i >= current_row_position) {
          row_i++;
          current_row_position = local_rows[row_i + 1];
        }
        for (auto j = 0; j < width; j++) {
          auto item_id = row_i + j * band_shape[0];
          auto val_index = local_data.colind_data()[i] + j * band_shape[0];
          auto value = b[val_index];
          auto matrix_value = local_data.values_data()[i];
          elems[item_id] += matrix_value * value;
        }
      }
    }
  }
  delete[] elems;
  sycl::free(val_ptr, q);
  sycl::free(col_ptr, q);
  sycl::free(row_ptr, q);
  sycl::free(input, q);
  sycl::free(output, q);
}

static void GemvEq_Reference(benchmark::State &state) { Gemv_Reference(state); }

static void GemvRow_Reference(benchmark::State &state) {
  Gemv_Reference(state);
}

DR_BENCHMARK(GemvEq_Reference);

DR_BENCHMARK(GemvRow_Reference);
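
The sizing comment in getMatrix() above is easier to follow with concrete numbers. The small stand-alone check below is not part of the commit; it assumes default_vector_size = 1'000'000, whereas in the benchmark that value is supplied by the common dr-bench harness.

// Stand-alone check of getMatrix()'s sizing logic. With density_scalar = 50,
// the band matrix has about n * (up + down + 1) = n * (2 * n / 50 + 1)
// ~= n^2 / 25 non-zeros, and n is chosen so that n^2 = default_vector_size
// * 50 / 2, so the non-zero count grows linearly with default_vector_size.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>

int main() {
  std::size_t default_vector_size = 1'000'000; // assumed for illustration
  std::size_t density_scalar = 50;
  std::size_t n = std::max(
      1., std::sqrt(static_cast<double>(default_vector_size * density_scalar / 2)));
  std::size_t up = n / density_scalar;
  std::size_t down = n / density_scalar;
  std::size_t approx_nnz = n * (up + down + 1);
  // Prints: n=5000 up=100 down=100 approx_nnz=1005000 (close to 1'000'000).
  std::printf("n=%zu up=%zu down=%zu approx_nnz=%zu\n", n, up, down, approx_nnz);
}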

examples/mp/CMakeLists.txt

Lines changed: 8 additions & 1 deletion
@@ -16,16 +16,23 @@ add_executable(vector-add vector-add.cpp)
 target_link_libraries(vector-add DR::mpi)
 add_mp_ctest(TEST_NAME vector-add NAME vector-add NPROC 2)

-function(add_mp_example example_name)
+function(add_mp_example_no_test example_name)
   add_executable(${example_name} ${example_name}.cpp)
   target_link_libraries(${example_name} cxxopts DR::mpi)
+endfunction()
+
+function(add_mp_example example_name)
+  add_mp_example_no_test(${example_name})
   add_mp_ctest(TEST_NAME ${example_name} NAME ${example_name} NPROC 2)
 endfunction()

 add_mp_example(stencil-1d)
 add_mp_example(stencil-1d-array)
 add_mp_example(stencil-1d-pointer)
 add_mp_example(hello_world)
+add_mp_example_no_test(sparse_matrix)
+add_mp_example_no_test(sparse_benchmark)
+add_mp_example_no_test(sparse_matrix_matrix_mul)

 if(OpenMP_FOUND)
   add_executable(vector-add-ref vector-add-ref.cpp)

examples/mp/sparse_benchmark.cpp

Lines changed: 130 additions & 0 deletions
@@ -0,0 +1,130 @@
// SPDX-FileCopyrightText: Intel Corporation
//
// SPDX-License-Identifier: BSD-3-Clause

#include <dr/mp.hpp>
#include <filesystem>
#include <fmt/core.h>
#include <fstream>
#include <random>
#include <sstream>

namespace mp = dr::mp;

MPI_Comm comm;
int comm_rank;
int comm_size;

int main(int argc, char **argv) {

  MPI_Init(&argc, &argv);
  comm = MPI_COMM_WORLD;
  MPI_Comm_rank(comm, &comm_rank);
  MPI_Comm_size(comm, &comm_size);

  if (argc != 3 && argc != 5) {
    fmt::print("usage: ./sparse_benchmark [test outcome dir] [matrix market "
               "file], or ./sparse_benchmark [test outcome dir] [number of "
               "rows] [number of columns] [density]\n");
    return 1;
  }

#ifdef SYCL_LANGUAGE_VERSION
  sycl::queue q = dr::mp::select_queue();
  mp::init(q);
#else
  mp::init();
#endif
  dr::views::csr_matrix_view<double, long> local_data;
  std::stringstream filenamestream;
  auto root = 0;
  auto computeSize = dr::mp::default_comm().size();
  if (root == dr::mp::default_comm().rank()) {
    if (argc == 5) {
      fmt::print("started loading\n");
      auto n = std::stoul(argv[2]);
      auto up = std::stoul(argv[3]);
      auto down = std::stoul(argv[4]);
      // local_data = dr::generate_random_csr<double, long>({n, m}, density,
      // 42);
      local_data = dr::generate_band_csr<double, long>(n, up, down);
      filenamestream << "mp_band_" << computeSize << "_" << n << "_"
                     << up + down << "_" << local_data.size();
      fmt::print("finished loading\n");
    } else {
      fmt::print("started loading\n");
      std::string fname(argv[2]);
      std::filesystem::path p(argv[2]);
      local_data = dr::read_csr<double, long>(fname);
      filenamestream << "mp_" << p.stem().string() << "_" << computeSize << "_"
                     << local_data.size();
      fmt::print("finished loading\n");
    }
  }
  std::string resname;
  mp::distributed_sparse_matrix<
      double, long, dr::mp::MpiBackend,
      dr::mp::csr_eq_distribution<double, long, dr::mp::MpiBackend>>
      m_eq(local_data, root);
  mp::distributed_sparse_matrix<
      double, long, dr::mp::MpiBackend,
      dr::mp::csr_row_distribution<double, long, dr::mp::MpiBackend>>
      m_row(local_data, root);
  fmt::print("finished distribution\n");
  std::vector<double> eq_duration;
  std::vector<double> row_duration;

  auto N = 10;
  std::vector<double> b;
  b.reserve(m_row.shape().second);
  std::vector<double> res(m_row.shape().first);
  for (auto i = 0; i < m_row.shape().second; i++) {
    b.push_back(i);
  }

  dr::mp::broadcasted_vector<double> allocated_b;
  allocated_b.broadcast_data(m_row.shape().second, 0, b,
                             dr::mp::default_comm());

  fmt::print("started initial gemv distribution\n");
  gemv(0, res, m_eq, allocated_b); // it is here to prepare sycl for work

  fmt::print("finished initial gemv distribution\n");
  for (auto i = 0; i < N; i++) {
    auto begin = std::chrono::high_resolution_clock::now();
    gemv(0, res, m_eq, allocated_b);
    auto end = std::chrono::high_resolution_clock::now();
    double duration = std::chrono::duration<double>(end - begin).count() * 1000;
    eq_duration.push_back(duration);
  }

  gemv(0, res, m_row, allocated_b); // it is here to prepare sycl for work
  for (auto i = 0; i < N; i++) {
    auto begin = std::chrono::high_resolution_clock::now();
    gemv(0, res, m_row, allocated_b);
    auto end = std::chrono::high_resolution_clock::now();
    double duration = std::chrono::duration<double>(end - begin).count() * 1000;
    row_duration.push_back(duration);
  }

  if (root == dr::mp::default_comm().rank()) {
    std::string tmp;
    filenamestream >> tmp;
    std::filesystem::path p(argv[1]);
    p += tmp;
    p += ".csv";
    std::ofstream write_stream(p.string());
    write_stream << eq_duration.front();
    for (auto i = 1; i < N; i++) {
      write_stream << "," << eq_duration[i];
    }
    write_stream << "\n";
    write_stream << row_duration.front();
    for (auto i = 1; i < N; i++) {
      write_stream << "," << row_duration[i];
    }
    write_stream << "\n";
  }
  allocated_b.destroy_data();
  mp::finalize();
}
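
The example above times the eq and row distributions with two identical loops. As a side note, that pattern could be captured by a small helper along these lines (a sketch, not part of the commit):

// Hypothetical helper factoring out the timing pattern used above: run `op`
// n times and return the per-run wall-clock durations in milliseconds.
#include <chrono>
#include <vector>

template <typename Op> std::vector<double> time_runs(int n, Op op) {
  std::vector<double> durations;
  durations.reserve(n);
  for (int i = 0; i < n; i++) {
    auto begin = std::chrono::high_resolution_clock::now();
    op();
    auto end = std::chrono::high_resolution_clock::now();
    durations.push_back(std::chrono::duration<double>(end - begin).count() *
                        1000);
  }
  return durations;
}

// Usage matching the loops in the example:
//   auto eq_duration = time_runs(N, [&] { gemv(0, res, m_eq, allocated_b); });
//   auto row_duration = time_runs(N, [&] { gemv(0, res, m_row, allocated_b); });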
