diff --git a/benchmarks/gbench/mp/CMakeLists.txt b/benchmarks/gbench/mp/CMakeLists.txt index c3ae2f682c..adbf32e34e 100644 --- a/benchmarks/gbench/mp/CMakeLists.txt +++ b/benchmarks/gbench/mp/CMakeLists.txt @@ -15,6 +15,7 @@ add_executable( ../common/stream.cpp streammp.cpp rooted.cpp + gemv.cpp stencil_1d.cpp stencil_2d.cpp chunk.cpp @@ -41,7 +42,7 @@ endif() # mp-quick-bench is for development. By reducing the number of source files, it # builds much faster. Change the source files to match what you need to test. It # is OK to commit changes to the source file list. -add_executable(mp-quick-bench mp-bench.cpp ../common/distributed_vector.cpp) +add_executable(mp-quick-bench mp-bench.cpp gemv.cpp) foreach(mp-bench-exec IN ITEMS mp-bench mp-quick-bench) target_compile_definitions(${mp-bench-exec} PRIVATE BENCH_MP) diff --git a/benchmarks/gbench/mp/fft3d.cpp b/benchmarks/gbench/mp/fft3d.cpp index 5e30a57b44..23dcbbdae3 100644 --- a/benchmarks/gbench/mp/fft3d.cpp +++ b/benchmarks/gbench/mp/fft3d.cpp @@ -5,7 +5,11 @@ #include "cxxopts.hpp" #include "fmt/core.h" #include "mpi.h" +#if (__INTEL_LLVM_COMPILER >= 20250000) #include "oneapi/mkl/dft.hpp" +#else +#include "oneapi/mkl/dfti.hpp" +#endif #include #include "dr/mp.hpp" diff --git a/benchmarks/gbench/mp/gemv.cpp b/benchmarks/gbench/mp/gemv.cpp new file mode 100644 index 0000000000..44216cbcc5 --- /dev/null +++ b/benchmarks/gbench/mp/gemv.cpp @@ -0,0 +1,192 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "mpi.h" + +#include "../common/dr_bench.hpp" +#include "dr/mp.hpp" +#include +#include +#include +#include +#include + +namespace mp = dr::mp; + +namespace { +std::size_t getWidth() { + return 8; // default_vector_size / 100000; +} +} // namespace +static auto getMatrix() { + // size below is useful when testing weak scaling with default vector size + // using dr-bench it creates matrix which non-zero element count increases + // linearly when we increase default_vector_size std::size_t n = std::max(1., + // std::sqrt(default_vector_size / 100000)) * 50000; + + std::size_t density_scalar = 50; + + std::size_t n = + std::max(1., std::sqrt(default_vector_size * density_scalar / 2)); + + std::size_t up = n / density_scalar; + std::size_t down = n / density_scalar; + fmt::print("Generate matrix"); + auto tmp = dr::generate_band_csr(n, up, down); + fmt::print("generated!"); + return tmp; +} + +static void GemvEq_DR(benchmark::State &state) { + auto local_data = getMatrix(); + + mp::distributed_sparse_matrix< + double, long, dr::mp::MpiBackend, + dr::mp::csr_eq_distribution> + m(local_data, 0); + auto n = m.shape()[1]; + auto width = getWidth(); + std::vector base_a(n * width); + for (int j = 0; j < width; j++) { + for (int i = 0; i < n; i++) { + base_a[i + j * n] = i * j + 1; + } + } + dr::mp::broadcasted_slim_matrix allocated_a; + allocated_a.broadcast_data(n, width, 0, base_a, dr::mp::default_comm()); + + std::vector res(m.shape().first * width); + gemv(0, res, m, allocated_a); + for (auto _ : state) { + gemv(0, res, m, allocated_a); + } +} + +DR_BENCHMARK(GemvEq_DR); + +static void GemvRow_DR(benchmark::State &state) { + auto local_data = getMatrix(); + + mp::distributed_sparse_matrix< + double, long, dr::mp::MpiBackend, + dr::mp::csr_row_distribution> + m(local_data, 0); + auto n = m.shape()[1]; + auto width = getWidth(); + std::vector base_a(n * width); + for (int j = 0; j < width; j++) { + for (int i = 0; i < n; i++) { + base_a[i + j * n] = i * j + 1; + } + } + dr::mp::broadcasted_slim_matrix allocated_a; + allocated_a.broadcast_data(n, width, 0, base_a, dr::mp::default_comm()); + + std::vector res(m.shape().first * width); + gemv(0, res, m, allocated_a); + for (auto _ : state) { + gemv(0, res, m, allocated_a); + } +} + +DR_BENCHMARK(GemvRow_DR); + +static void Gemv_Reference(benchmark::State &state) { + auto local_data = getMatrix(); + auto nnz_count = local_data.size(); + auto band_shape = local_data.shape(); + auto q = get_queue(); + auto policy = oneapi::dpl::execution::make_device_policy(q); + auto val_ptr = sycl::malloc_device(nnz_count, q); + auto col_ptr = sycl::malloc_device(nnz_count, q); + auto row_ptr = sycl::malloc_device((band_shape[0] + 1), q); + std::vector b; + auto width = getWidth(); + for (auto i = 0; i < band_shape[1] * width; i++) { + b.push_back(i); + } + double *elems = new double[band_shape[0] * width]; + auto input = sycl::malloc_device(band_shape[1] * width, q); + auto output = sycl::malloc_device(band_shape[0] * width, q); + q.memcpy(val_ptr, local_data.values_data(), nnz_count * sizeof(double)) + .wait(); + q.memcpy(col_ptr, local_data.colind_data(), nnz_count * sizeof(long)).wait(); + q.memcpy(row_ptr, local_data.rowptr_data(), + (band_shape[0] + 1) * sizeof(long)) + .wait(); + q.fill(output, 0, band_shape[0] * width); + std::copy(policy, b.begin(), b.end(), input); + + auto wg = 32; + while (width * band_shape[0] * wg > INT_MAX) { + wg /= 2; + } + assert(wg > 0); + + for (auto _ : state) { + if (dr::mp::use_sycl()) { + dr::mp::sycl_queue() + .submit([&](auto &&h) { + h.parallel_for( + sycl::nd_range<1>(width * band_shape[0] * wg, wg), + [=](auto item) { + auto input_j = item.get_group(0) / band_shape[0]; + auto idx = item.get_group(0) % band_shape[0]; + auto local_id = item.get_local_id(); + auto group_size = item.get_local_range(0); + double sum = 0; + auto start = row_ptr[idx]; + auto end = row_ptr[idx + 1]; + for (auto i = start + local_id; i < end; i += group_size) { + auto colNum = col_ptr[i]; + auto vectorVal = input[colNum + input_j * band_shape[1]]; + auto matrixVal = val_ptr[i]; + sum += matrixVal * vectorVal; + } + sycl::atomic_ref + c_ref(output[idx + band_shape[0] * input_j]); + c_ref += sum; + }); + }) + .wait(); + q.memcpy(elems, output, band_shape[0] * sizeof(double) * width).wait(); + } else { + std::fill(elems, elems + band_shape[0] * width, 0); + auto local_rows = local_data.rowptr_data(); + auto row_i = 0; + auto current_row_position = local_rows[1]; + + for (int i = 0; i < nnz_count; i++) { + while (row_i + 1 < band_shape[0] && i >= current_row_position) { + row_i++; + current_row_position = local_rows[row_i + 1]; + } + for (auto j = 0; j < width; j++) { + auto item_id = row_i + j * band_shape[0]; + auto val_index = local_data.colind_data()[i] + j * band_shape[0]; + auto value = b[val_index]; + auto matrix_value = local_data.values_data()[i]; + elems[item_id] += matrix_value * value; + } + } + } + } + delete[] elems; + sycl::free(val_ptr, q); + sycl::free(col_ptr, q); + sycl::free(row_ptr, q); + sycl::free(input, q); + sycl::free(output, q); +} + +static void GemvEq_Reference(benchmark::State &state) { Gemv_Reference(state); } + +static void GemvRow_Reference(benchmark::State &state) { + Gemv_Reference(state); +} + +DR_BENCHMARK(GemvEq_Reference); + +DR_BENCHMARK(GemvRow_Reference); diff --git a/benchmarks/gbench/sp/fft3d.cpp b/benchmarks/gbench/sp/fft3d.cpp index 19d4f3aee4..cdf060c88c 100644 --- a/benchmarks/gbench/sp/fft3d.cpp +++ b/benchmarks/gbench/sp/fft3d.cpp @@ -3,7 +3,11 @@ // SPDX-License-Identifier: BSD-3-Clause #include "cxxopts.hpp" +#if (__INTEL_LLVM_COMPILER >= 20250000) #include "oneapi/mkl/dft.hpp" +#else +#include "oneapi/mkl/dfti.hpp" +#endif #include #include #include diff --git a/examples/mp/CMakeLists.txt b/examples/mp/CMakeLists.txt index 41cac86ecc..a8180838b3 100644 --- a/examples/mp/CMakeLists.txt +++ b/examples/mp/CMakeLists.txt @@ -16,9 +16,13 @@ add_executable(vector-add vector-add.cpp) target_link_libraries(vector-add DR::mpi) add_mp_ctest(TEST_NAME vector-add NAME vector-add NPROC 2) -function(add_mp_example example_name) +function(add_mp_example_no_test example_name) add_executable(${example_name} ${example_name}.cpp) target_link_libraries(${example_name} cxxopts DR::mpi) +endfunction() + +function(add_mp_example example_name) + add_mp_example_no_test(${example_name}) add_mp_ctest(TEST_NAME ${example_name} NAME ${example_name} NPROC 2) endfunction() @@ -26,6 +30,9 @@ add_mp_example(stencil-1d) add_mp_example(stencil-1d-array) add_mp_example(stencil-1d-pointer) add_mp_example(hello_world) +add_mp_example_no_test(sparse_matrix) +add_mp_example_no_test(sparse_benchmark) +add_mp_example_no_test(sparse_matrix_matrix_mul) if(OpenMP_FOUND) add_executable(vector-add-ref vector-add-ref.cpp) diff --git a/examples/mp/sparse_benchmark.cpp b/examples/mp/sparse_benchmark.cpp new file mode 100644 index 0000000000..b53c0ee477 --- /dev/null +++ b/examples/mp/sparse_benchmark.cpp @@ -0,0 +1,130 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#include +#include +#include +#include +#include +#include + +namespace mp = dr::mp; + +MPI_Comm comm; +int comm_rank; +int comm_size; + +int main(int argc, char **argv) { + + MPI_Init(&argc, &argv); + comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm, &comm_rank); + MPI_Comm_size(comm, &comm_size); + + if (argc != 3 && argc != 5) { + fmt::print("usage: ./sparse_benchmark [test outcome dir] [matrix market " + "file], or ./sparse_benchmark [test outcome dir] [number of " + "rows] [number of columns] [density]\n"); + return 1; + } + +#ifdef SYCL_LANGUAGE_VERSION + sycl::queue q = dr::mp::select_queue(); + mp::init(q); +#else + mp::init(); +#endif + dr::views::csr_matrix_view local_data; + std::stringstream filenamestream; + auto root = 0; + auto computeSize = dr::mp::default_comm().size(); + if (root == dr::mp::default_comm().rank()) { + if (argc == 5) { + fmt::print("started loading\n"); + auto n = std::stoul(argv[2]); + auto up = std::stoul(argv[3]); + auto down = std::stoul(argv[4]); + // local_data = dr::generate_random_csr({n, m}, density, + // 42); + local_data = dr::generate_band_csr(n, up, down); + filenamestream << "mp_band_" << computeSize << "_" << n << "_" + << up + down << "_" << local_data.size(); + fmt::print("finished loading\n"); + } else { + fmt::print("started loading\n"); + std::string fname(argv[2]); + std::filesystem::path p(argv[2]); + local_data = dr::read_csr(fname); + filenamestream << "mp_" << p.stem().string() << "_" << computeSize << "_" + << local_data.size(); + fmt::print("finished loading\n"); + } + } + std::string resname; + mp::distributed_sparse_matrix< + double, long, dr::mp::MpiBackend, + dr::mp::csr_eq_distribution> + m_eq(local_data, root); + mp::distributed_sparse_matrix< + double, long, dr::mp::MpiBackend, + dr::mp::csr_row_distribution> + m_row(local_data, root); + fmt::print("finished distribution\n"); + std::vector eq_duration; + std::vector row_duration; + + auto N = 10; + std::vector b; + b.reserve(m_row.shape().second); + std::vector res(m_row.shape().first); + for (auto i = 0; i < m_row.shape().second; i++) { + b.push_back(i); + } + + dr::mp::broadcasted_vector allocated_b; + allocated_b.broadcast_data(m_row.shape().second, 0, b, + dr::mp::default_comm()); + + fmt::print("started initial gemv distribution\n"); + gemv(0, res, m_eq, allocated_b); // it is here to prepare sycl for work + + fmt::print("finished initial gemv distribution\n"); + for (auto i = 0; i < N; i++) { + auto begin = std::chrono::high_resolution_clock::now(); + gemv(0, res, m_eq, allocated_b); + auto end = std::chrono::high_resolution_clock::now(); + double duration = std::chrono::duration(end - begin).count() * 1000; + eq_duration.push_back(duration); + } + + gemv(0, res, m_row, allocated_b); // it is here to prepare sycl for work + for (auto i = 0; i < N; i++) { + auto begin = std::chrono::high_resolution_clock::now(); + gemv(0, res, m_row, allocated_b); + auto end = std::chrono::high_resolution_clock::now(); + double duration = std::chrono::duration(end - begin).count() * 1000; + row_duration.push_back(duration); + } + + if (root == dr::mp::default_comm().rank()) { + std::string tmp; + filenamestream >> tmp; + std::filesystem::path p(argv[1]); + p += tmp; + p += ".csv"; + std::ofstream write_stream(p.string()); + write_stream << eq_duration.front(); + for (auto i = 1; i < N; i++) { + write_stream << "," << eq_duration[i]; + } + write_stream << "\n"; + write_stream << row_duration.front(); + for (auto i = 1; i < N; i++) { + write_stream << "," << row_duration[i]; + } + write_stream << "\n"; + } + allocated_b.destroy_data(); + mp::finalize(); +} diff --git a/examples/mp/sparse_matrix.cpp b/examples/mp/sparse_matrix.cpp new file mode 100644 index 0000000000..ae1511280c --- /dev/null +++ b/examples/mp/sparse_matrix.cpp @@ -0,0 +1,112 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#include +#include + +namespace mp = dr::mp; + +int main(int argc, char **argv) { + + if (argc != 2) { + fmt::print("usage: ./sparse_matrix [matrix market file]\n"); + return 1; + } + + std::string fname(argv[1]); +#ifdef SYCL_LANGUAGE_VERSION + mp::init(sycl::default_selector_v); +#else + mp::init(); +#endif + + dr::views::csr_matrix_view local_data; + auto root = 0; + if (root == dr::mp::default_comm().rank()) { + local_data = dr::read_csr(fname); + } + { + mp::distributed_sparse_matrix< + double, long, dr::mp::MpiBackend, + dr::mp::csr_eq_distribution> + m(local_data, root); + mp::distributed_sparse_matrix< + double, long, dr::mp::MpiBackend, + dr::mp::csr_row_distribution> + m_row(local_data, root); + std::vector res(m.shape().first); + std::vector res_row(m.shape().first); + std::vector a(m.shape().second); + for (int i = 0; i < a.size(); i++) { + a[i] = i; + } + + dr::mp::broadcasted_vector allocated_a; + allocated_a.broadcast_data(m_row.shape().second, 0, a, + dr::mp::default_comm()); + m.fence(); + double total_time = 0; + auto N = 1; + gemv(0, res, m, allocated_a); // it is here to prepare sycl for work + for (int i = 0; i < N; i++) { + auto begin = std::chrono::high_resolution_clock::now(); + gemv(0, res, m, allocated_a); + auto end = std::chrono::high_resolution_clock::now(); + double duration = std::chrono::duration(end - begin).count(); + total_time += duration; + if (i % 10 == 0 && dr::mp::default_comm().rank() == 0) { + fmt::print("eq canary {}\n", duration * 1000); + } + } + if (root == dr::mp::default_comm().rank()) { + fmt::print("eq gemv time total {}\n", total_time * 1000 / N); + } + m.fence(); + total_time = 0; + gemv(0, res_row, m_row, allocated_a); + for (int i = 0; i < N; i++) { + auto begin = std::chrono::high_resolution_clock::now(); + gemv(0, res_row, m_row, allocated_a); + auto end = std::chrono::high_resolution_clock::now(); + double duration = std::chrono::duration(end - begin).count(); + total_time += duration; + if (i % 10 == 0 && dr::mp::default_comm().rank() == 0) { + fmt::print("row canary {}\n", duration * 1000); + } + } + + if (root == dr::mp::default_comm().rank()) { + fmt::print("row gemv time total {}\n", total_time * 1000 / N); + } + m_row.fence(); + + std::vector ref(m.shape().first); + if (dr::mp::default_comm().rank() == 0) { + for (auto a : local_data) { + auto [index, val] = a; + auto [m, n] = index; + ref[m] += n * val; + } + for (int i = 0; i < m.shape().first; i++) { + if (res[i] != ref[i]) { + fmt::print("mismatching outcome {} {} {}\n", i, res[i], ref[i]); + } + } + for (int i = 0; i < m.shape().first; i++) { + if (res_row[i] != ref[i]) { + fmt::print("mismatching outcome row {} {} {}\n", i, res_row[i], + ref[i]); + } + } + } + allocated_a.destroy_data(); + } + + if (root == dr::mp::default_comm().rank()) { + dr::__detail::destroy_csr_matrix_view(local_data, std::allocator{}); + } + mp::finalize(); + + return 0; +} diff --git a/examples/mp/sparse_matrix_matrix_mul.cpp b/examples/mp/sparse_matrix_matrix_mul.cpp new file mode 100644 index 0000000000..4fd7c139c8 --- /dev/null +++ b/examples/mp/sparse_matrix_matrix_mul.cpp @@ -0,0 +1,118 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#include +#include + +namespace mp = dr::mp; + +int main(int argc, char **argv) { + + if (argc != 2) { + fmt::print("usage: ./sparse_matrix [matrix market file]\n"); + return 1; + } + + std::string fname(argv[1]); +#ifdef SYCL_LANGUAGE_VERSION + mp::init(sycl::default_selector_v); +#else + mp::init(); +#endif + + dr::views::csr_matrix_view local_data; + auto root = 0; + if (root == dr::mp::default_comm().rank()) { + local_data = dr::read_csr(fname); + } + { + mp::distributed_sparse_matrix< + double, long, dr::mp::MpiBackend, + dr::mp::csr_eq_distribution> + m(local_data, root); + mp::distributed_sparse_matrix< + double, long, dr::mp::MpiBackend, + dr::mp::csr_row_distribution> + m_row(local_data, root); + fmt::print("{}\n", m.size()); + + auto width = 3; + std::vector res(m.shape().first * width); + std::vector res_row(m.shape().first * width); + std::vector base_a(m.shape().second * width); + for (int j = 0; j < width; j++) { + for (int i = 0; i < m.shape().second; i++) { + base_a[i + j * m.shape().second] = i * j + 1; + } + } + + dr::mp::broadcasted_slim_matrix allocated_a; + allocated_a.broadcast_data(m_row.shape().second, width, 0, base_a, + dr::mp::default_comm()); + m.fence(); + double total_time = 0; + auto N = 1; + gemv(0, res, m, allocated_a); // it is here to prepare sycl for work + for (int i = 0; i < 100; i++) { + auto begin = std::chrono::high_resolution_clock::now(); + gemv(0, res, m, allocated_a); + auto end = std::chrono::high_resolution_clock::now(); + double duration = std::chrono::duration(end - begin).count(); + total_time += duration; + if (root == dr::mp::default_comm().rank()) { + fmt::print("eq canary {}\n\n", duration * 1000); + } + } + m.fence(); + total_time = 0; + gemv(0, res_row, m_row, allocated_a); + for (int i = 0; i < N; i++) { + auto begin = std::chrono::high_resolution_clock::now(); + gemv(0, res_row, m_row, allocated_a); + auto end = std::chrono::high_resolution_clock::now(); + double duration = std::chrono::duration(end - begin).count(); + total_time += duration; + if (i % 10 == 0 && dr::mp::default_comm().rank() == 0) { + fmt::print("row canary {}\n", duration * 1000); + } + } + + if (root == dr::mp::default_comm().rank()) { + fmt::print("row gemv time total {}\n", total_time * 1000 / N); + } + m_row.fence(); + + std::vector ref(m.shape().first * width); + auto res_col_len = m.shape().first; + auto in_len = m.shape().second; + if (dr::mp::default_comm().rank() == 0) { + for (auto a : local_data) { + auto [index, val] = a; + auto [m, n] = index; + for (int i = 0; i < width; i++) { + ref[m + i * res_col_len] += base_a[n + i * in_len] * val; + } + } + for (int i = 0; i < m.shape().first * width; i++) { + if (res[i] != ref[i]) { + fmt::print("mismatching outcome {} {} {}\n", i, res[i], ref[i]); + } + } + for (int i = 0; i < m.shape().first * width; i++) { + if (res_row[i] != ref[i]) { + fmt::print("mismatching outcome row {} {} {}\n", i, res_row[i], + ref[i]); + } + } + } + allocated_a.destroy_data(); + } + + if (root == dr::mp::default_comm().rank()) { + dr::__detail::destroy_csr_matrix_view(local_data, std::allocator{}); + } + mp::finalize(); + + return 0; +} diff --git a/include/dr/detail/communicator.hpp b/include/dr/detail/communicator.hpp index 331253ab63..d6fa99ffb4 100644 --- a/include/dr/detail/communicator.hpp +++ b/include/dr/detail/communicator.hpp @@ -66,10 +66,15 @@ class communicator { MPI_Gather_c(src, count, MPI_BYTE, dst, count, MPI_BYTE, root, mpi_comm_); } + template + void gather(const T *src, T *dst, std::size_t count, std::size_t root) const { + gather((void *)src, (void *)dst, count * sizeof(T), root); + } + template void gather(const T &src, std::span dst, std::size_t root) const { assert(rng::size(dst) >= size_); - gather(&src, rng::data(dst), sizeof(T), root); + gather(&src, rng::data(dst), 1, root); } template @@ -105,10 +110,10 @@ class communicator { i_all_gather(&src, rng::data(dst), 1, req); } - void gatherv(const void *src, int *counts, int *offsets, void *dst, + void gatherv(const void *src, MPI_Count *counts, MPI_Aint *offsets, void *dst, std::size_t root) const { - MPI_Gatherv(src, counts[rank()], MPI_BYTE, dst, counts, offsets, MPI_BYTE, - root, mpi_comm_); + MPI_Gatherv_c(src, counts[rank()], MPI_BYTE, dst, counts, offsets, MPI_BYTE, + root, mpi_comm_); } // pointer with explicit tag @@ -167,6 +172,13 @@ class communicator { irecv(data, src_rank, 0, request); } + void wait(MPI_Request request) const { + MPI_Wait(&request, MPI_STATUS_IGNORE); + } + void waitall(std::size_t count, MPI_Request *requests) const { + MPI_Waitall(count, requests, MPI_STATUS_IGNORE); + } + template void alltoall(const R &sendr, R &recvr, std::size_t count) { alltoall(rng::data(sendr), rng::data(recvr), count); diff --git a/include/dr/sp/util/coo_matrix.hpp b/include/dr/detail/coo_matrix.hpp similarity index 91% rename from include/dr/sp/util/coo_matrix.hpp rename to include/dr/detail/coo_matrix.hpp index 0e8b871504..1cf8e4bc83 100644 --- a/include/dr/sp/util/coo_matrix.hpp +++ b/include/dr/detail/coo_matrix.hpp @@ -4,18 +4,18 @@ #pragma once -#include +#include #include #include -namespace dr::sp { +namespace dr { namespace __detail { template > class coo_matrix { public: - using value_type = dr::sp::matrix_entry; + using value_type = dr::matrix_entry; using scalar_type = T; using index_type = I; using size_type = std::size_t; @@ -33,8 +33,8 @@ class coo_matrix { using iterator = typename backend_type::iterator; using const_iterator = typename backend_type::const_iterator; - using reference = dr::sp::matrix_ref; - using const_reference = dr::sp::matrix_ref, I>; + using reference = dr::matrix_ref; + using const_reference = dr::matrix_ref, I>; using scalar_reference = T &; @@ -110,14 +110,14 @@ class coo_matrix { } iterator find(key_type key) noexcept { - return std::find_if(begin(), end(), [&](auto &&v) { + return std::ranges::find_if(begin(), end(), [&](auto &&v) { auto &&[i, v_] = v; return i == key; }); } const_iterator find(key_type key) const noexcept { - return std::find_if(begin(), end(), [&](auto &&v) { + return std::ranges::find_if(begin(), end(), [&](auto &&v) { auto &&[i, v_] = v; return i == key; }); @@ -167,4 +167,4 @@ class coo_matrix { } // namespace __detail -} // namespace dr::sp +} // namespace dr diff --git a/include/dr/detail/csr_matrix_base.hpp b/include/dr/detail/csr_matrix_base.hpp new file mode 100644 index 0000000000..0207a95428 --- /dev/null +++ b/include/dr/detail/csr_matrix_base.hpp @@ -0,0 +1,95 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +#include +#include +#include +#include + +namespace dr { + +namespace __detail { + +template > +class csr_matrix_base { +public: + using value_type = std::pair; + using scalar_type = T; + using index_type = I; + using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + + using allocator_type = Allocator; + + using key_type = dr::index; + using map_type = T; + + using backend_allocator_type = typename std::allocator_traits< + allocator_type>::template rebind_alloc; + using aggregator_allocator_type = typename std::allocator_traits< + allocator_type>::template rebind_alloc>; + using row_type = std::vector; + using backend_type = std::vector; + + using iterator = typename backend_type::iterator; + using const_iterator = typename backend_type::const_iterator; + + csr_matrix_base(dr::index shape, std::size_t nnz) : shape_(shape) { + auto average_size = nnz / shape.first / 2; + for (std::size_t i = 0; i < shape.first; i++) { + tuples_.push_back(row_type()); + tuples_.back().reserve(average_size); + } + } + + dr::index shape() const noexcept { return shape_; } + + size_type size() const noexcept { return size_; } + + iterator begin() noexcept { return tuples_.begin(); } + + const_iterator begin() const noexcept { return tuples_.begin(); } + + iterator end() noexcept { return tuples_.end(); } + + const_iterator end() const noexcept { return tuples_.end(); } + + template void push_back(InputIt first, InputIt last) { + for (auto iter = first; iter != last; ++iter) { + push_back(*iter); + } + } + + void push_back(index_type row, const value_type &value) { + tuples_[row].push_back(value); + size_++; + } + + void sort() { + auto comparator = [](auto &one, auto &two) { + return one.second < two.second; + }; + for (auto &elem : tuples_) { + std::sort(elem.begin(), elem.end(), comparator); + } + } + + csr_matrix_base() = default; + ~csr_matrix_base() = default; + csr_matrix_base(const csr_matrix_base &) = default; + csr_matrix_base(csr_matrix_base &&) = default; + csr_matrix_base &operator=(const csr_matrix_base &) = default; + csr_matrix_base &operator=(csr_matrix_base &&) = default; + +private: + std::size_t size_ = 0; + dr::index shape_; + backend_type tuples_; +}; + +} // namespace __detail + +} // namespace dr diff --git a/include/dr/detail/generate_csr.hpp b/include/dr/detail/generate_csr.hpp new file mode 100644 index 0000000000..c8ef7202a2 --- /dev/null +++ b/include/dr/detail/generate_csr.hpp @@ -0,0 +1,155 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +#include +#include +#include +#include +#include + +namespace dr { + +namespace { + +template struct uniform_distribution { + using type = std::uniform_int_distribution; +}; + +template struct uniform_distribution { + using type = std::uniform_real_distribution; +}; + +template +using uniform_distribution_t = typename uniform_distribution::type; + +struct pair_hash { + template + inline std::size_t operator()(const std::pair &v) const { + return v.first * 31 + v.second; + } +}; + +} // namespace + +// it returns matrix view of randomly generated matrix +// the memory is owned by the view, so it needs to be released using +// destroy_csr_matrix_view +template +auto generate_random_csr(dr::index shape, double density = 0.01, + unsigned int seed = 0) { + + assert(density >= 0.0 && density < 1.0); + + std::unordered_set, pair_hash> tuples{}; + std::vector, T>> entries; + std::size_t nnz = density * shape[0] * shape[1]; + entries.reserve(nnz); + + std::mt19937 gen(seed); + std::uniform_int_distribution row(0, shape[0] - 1); + std::uniform_int_distribution column(0, shape[1] - 1); + + uniform_distribution_t value_gen(0, 1); + + while (tuples.size() < nnz) { + auto i = row(gen); + auto j = column(gen); + if (tuples.find({i, j}) == tuples.end()) { + T value = value_gen(gen); + tuples.insert({i, j}); + entries.push_back({{i, j}, value}); + } + } + T *values = new T[nnz]; + I *rowptr = new I[shape[0] + 1]; + I *colind = new I[nnz]; + + rowptr[0] = 0; + + std::size_t r = 0; + std::size_t c = 0; + std::sort(entries.begin(), entries.end()); + for (auto iter = entries.begin(); iter != entries.end(); ++iter) { + auto &&[index, value] = *iter; + auto &&[i, j] = index; + + values[c] = value; + colind[c] = j; + + while (r < i) { + if (r + 1 > shape[0]) { + // TODO: exception? + // throw std::runtime_error("csr_matrix_impl_: given invalid matrix"); + } + rowptr[r + 1] = c; + r++; + } + c++; + + if (c > nnz) { + // TODO: exception? + // throw std::runtime_error("csr_matrix_impl_: given invalid matrix"); + } + } + + for (; r < shape[0]; r++) { + rowptr[r + 1] = nnz; + } + + return dr::views::csr_matrix_view(values, rowptr, colind, shape, nnz, 0); +} + +// it returns matrix view of band matrix +// the memory is owned by the view, so it needs to be released using +// destroy_csr_matrix_view +template +auto generate_band_csr(I size, std::size_t up_band = 3, + std::size_t down_band = 3) { + std::size_t nnz = (1 + up_band + down_band) * size - + (up_band * (up_band + 1) / 2) - + (down_band * (down_band + 1) / 2); + + T *values = new T[nnz]; + I *rowptr = new I[size + 1]; + I *colind = new I[nnz]; + + rowptr[0] = 0; + + std::size_t r = 0; + std::size_t c = 0; + for (auto i = 0; i < size; i++) { + for (auto j = std::max(static_cast(i) - + static_cast(down_band), + static_cast(0)); + j < i; j++) { + values[c] = 1; + colind[c] = static_cast(j); + c++; + } + values[c] = 1; + colind[c] = i; + c++; + for (auto j = i + 1; j <= i + up_band; j++) { + if (j >= size) { + continue; + } + values[c] = 1; + colind[c] = j; + c++; + } + rowptr[r + 1] = c; + r++; + } + + for (; r < size; r++) { + rowptr[r + 1] = nnz; + } + + return dr::views::csr_matrix_view(values, rowptr, colind, {size, size}, + nnz, 0); +} + +} // namespace dr diff --git a/include/dr/sp/containers/matrix_entry.hpp b/include/dr/detail/matrix_entry.hpp similarity index 83% rename from include/dr/sp/containers/matrix_entry.hpp rename to include/dr/detail/matrix_entry.hpp index b5d2885499..2875b35d9b 100644 --- a/include/dr/sp/containers/matrix_entry.hpp +++ b/include/dr/detail/matrix_entry.hpp @@ -4,14 +4,19 @@ #pragma once +#include #include #include #include #include -namespace dr::sp { - +namespace dr { +template +concept getable = requires(T x) { + std::get<0>(x); + std::get<1>(x); +}; template class matrix_entry { public: using index_type = I; @@ -28,6 +33,7 @@ template class matrix_entry { : index_(index), value_(std::forward(value)) {} template + requires(getable) matrix_entry(Entry &&entry) : index_(std::get<0>(entry)), value_(std::get<1>(entry)) {} @@ -62,14 +68,11 @@ template class matrix_entry { return matrix_entry, U>(index_, value_); } - bool operator<(const matrix_entry &other) const noexcept { - if (index()[0] < other.index()[0]) { - return true; - } else if (index()[0] == other.index()[0] && - index()[1] < other.index()[1]) { - return true; + inline bool operator<(const matrix_entry &other) const noexcept { + if (index_.first != other.index_.first) { + return index_.first < other.index_.first; } - return false; + return index_.second < other.index_.second; } matrix_entry() = default; @@ -85,28 +88,28 @@ template class matrix_entry { map_type value_; }; -} // namespace dr::sp +} // namespace dr namespace std { template requires(!std::is_const_v) -void swap(dr::sp::matrix_entry a, dr::sp::matrix_entry b) { - dr::sp::matrix_entry other = a; +void swap(dr::matrix_entry a, dr::matrix_entry b) { + dr::matrix_entry other = a; a = b; b = other; } template -struct tuple_element> +struct tuple_element> : tuple_element, T>> {}; template -struct tuple_size> : integral_constant {}; +struct tuple_size> : integral_constant {}; } // namespace std -namespace dr::sp { +namespace dr { template class matrix_ref { @@ -119,7 +122,7 @@ class matrix_ref { using scalar_reference = TRef; - using value_type = dr::sp::matrix_entry; + using value_type = dr::matrix_entry; matrix_ref(dr::index index, scalar_reference value) : index_(index), value_(value) {} @@ -183,28 +186,28 @@ class matrix_ref { scalar_reference value_; }; -} // namespace dr::sp +} // namespace dr namespace std { template requires(!std::is_const_v) -void swap(dr::sp::matrix_ref a, dr::sp::matrix_ref b) { - dr::sp::matrix_entry other = a; +void swap(dr::matrix_ref a, dr::matrix_ref b) { + dr::matrix_entry other = a; a = b; b = other; } template -struct tuple_element> +struct tuple_element> : tuple_element, TRef>> {}; template -struct tuple_size> +struct tuple_size> : integral_constant {}; template -inline decltype(auto) get(dr::sp::matrix_ref ref) +inline decltype(auto) get(dr::matrix_ref ref) requires(Index <= 1) { if constexpr (Index == 0) { @@ -216,7 +219,7 @@ inline decltype(auto) get(dr::sp::matrix_ref ref) } template -inline decltype(auto) get(dr::sp::matrix_entry entry) +inline decltype(auto) get(dr::matrix_entry entry) requires(Index <= 1) { if constexpr (Index == 0) { diff --git a/include/dr/detail/matrix_io.hpp b/include/dr/detail/matrix_io.hpp new file mode 100644 index 0000000000..1a845008b1 --- /dev/null +++ b/include/dr/detail/matrix_io.hpp @@ -0,0 +1,257 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace dr { + +namespace __detail { + +// Preconditions: +// 1) `tuples` sorted by row, column +// 2) `tuples` has shape `shape` +// 3) `tuples` has `nnz` elements +template +auto convert_to_csr(Tuples &&tuples, dr::index<> shape, std::size_t nnz, + Allocator &&allocator) { + auto &&[index, v] = *tuples.begin(); + auto &&[i, j] = index; + + using T = std::remove_reference_t; + using I = std::remove_reference_t; + + typename std::allocator_traits::template rebind_alloc + i_allocator(allocator); + + T *values = allocator.allocate(nnz); + I *rowptr = i_allocator.allocate(shape[0] + 1); + I *colind = i_allocator.allocate(nnz); + + rowptr[0] = 0; + + std::size_t r = 0; + std::size_t c = 0; + for (auto iter = tuples.begin(); iter != tuples.end(); ++iter) { + auto &&[index, value] = *iter; + + auto &&[i, j] = index; + + values[c] = value; + colind[c] = j; + + while (r < i) { + assert(r + 1 <= shape[0]); + // throw std::runtime_error("csr_matrix_impl_: given invalid matrix"); + rowptr[r + 1] = c; + r++; + } + c++; + + assert(c <= nnz); + // throw std::runtime_error("csr_matrix_impl_: given invalid matrix"); + } + + for (; r < shape[0]; r++) { + rowptr[r + 1] = nnz; + } + + return dr::views::csr_matrix_view(values, rowptr, colind, + dr::index(shape[0], shape[1]), nnz, 0); +} + +template +auto convert_csr_base_to_csr(Tuples &&csr_matrix, dr::index<> shape, + std::size_t nnz, Allocator &&allocator) { + auto &&[v, j] = *csr_matrix.begin()->begin(); + + using T = std::remove_reference_t; + using I = std::remove_reference_t; + + typename std::allocator_traits::template rebind_alloc + i_allocator(allocator); + + T *values = allocator.allocate(nnz); + I *rowptr = i_allocator.allocate(shape[0] + 1); + I *colind = i_allocator.allocate(nnz); + + rowptr[0] = 0; + + std::size_t r = 0; + std::size_t c = 0; + for (auto iter = csr_matrix.begin(); iter != csr_matrix.end(); ++iter) { + for (auto iter2 = iter->begin(); iter2 != iter->end(); ++iter2) { + auto &&[value, j] = *iter2; + + values[c] = value; + colind[c] = j; + c++; + } + assert(r + 1 <= shape[0]); + rowptr[r + 1] = c; + r++; + + assert(c <= nnz); + // throw std::runtime_error("csr_matrix_impl_: given invalid matrix"); + } + + for (; r < shape[0]; r++) { + rowptr[r + 1] = nnz; + } + + return dr::views::csr_matrix_view(values, rowptr, colind, + dr::index(shape[0], shape[1]), nnz, 0); +} + +/// Read in the Matrix Market file at location `file_path` and a return +/// a coo_matrix data structure with its contents. +template +inline csr_matrix_base read_csr_matrix_base(std::string file_path, + bool one_indexed = true) { + using size_type = std::size_t; + + std::ifstream f; + + f.open(file_path.c_str()); + + if (!f.is_open()) { + // TODO better choice of exception. + throw std::runtime_error("mmread: cannot open " + file_path); + } + + std::string buf; + + // Make sure the file is matrix market matrix, coordinate, and check whether + // it is symmetric. If the matrix is symmetric, non-diagonal elements will + // be inserted in both (i, j) and (j, i). Error out if skew-symmetric or + // Hermitian. + std::getline(f, buf); + std::istringstream ss(buf); + std::string item; + ss >> item; + if (item != "%%MatrixMarket") { + throw std::runtime_error(file_path + + " could not be parsed as a Matrix Market file."); + } + ss >> item; + if (item != "matrix") { + throw std::runtime_error(file_path + + " could not be parsed as a Matrix Market file."); + } + ss >> item; + if (item != "coordinate") { + throw std::runtime_error(file_path + + " could not be parsed as a Matrix Market file."); + } + bool pattern; + ss >> item; + if (item == "pattern") { + pattern = true; + } else { + pattern = false; + } + // TODO: do something with real vs. integer vs. pattern? + ss >> item; + bool symmetric; + if (item == "general") { + symmetric = false; + } else if (item == "symmetric") { + symmetric = true; + } else { + throw std::runtime_error(file_path + " has an unsupported matrix type"); + } + + bool outOfComments = false; + while (!outOfComments) { + std::getline(f, buf); + + if (buf[0] != '%') { + outOfComments = true; + } + } + + I m, n, nnz; + // std::istringstream ss(buf); + ss.clear(); + ss.str(buf); + ss >> m >> n >> nnz; + + // NOTE for symmetric matrices: `nnz` holds the number of stored values in + // the matrix market file, while `matrix.nnz_` will hold the total number of + // stored values (including "mirrored" symmetric values). + csr_matrix_base matrix({m, n}, nnz); + + size_type c = 0; + while (std::getline(f, buf)) { + I i, j; + T v; + std::istringstream ss(buf); + if (!pattern) { + ss >> i >> j >> v; + } else { + ss >> i >> j; + v = T(1); + } + if (one_indexed) { + i--; + j--; + } + + if (i >= m || j >= n) { + throw std::runtime_error( + "read_MatrixMarket: file has nonzero out of bounds."); + } + + matrix.push_back(i, {v, j}); + + if (symmetric && i != j) { + matrix.push_back(j, {v, i}); + } + + c++; + if (c > nnz) { + throw std::runtime_error("read_MatrixMarket: error reading Matrix Market " + "file, file has more nonzeros than reported."); + } + } + + matrix.sort(); + f.close(); + + return matrix; +} + +template +void destroy_csr_matrix_view(dr::views::csr_matrix_view view, + Allocator &&alloc) { + alloc.deallocate(view.values_data(), view.size()); + typename std::allocator_traits::template rebind_alloc i_alloc( + alloc); + i_alloc.deallocate(view.colind_data(), view.size()); + i_alloc.deallocate(view.rowptr_data(), view.shape()[0] + 1); +} + +} // namespace __detail + +template +auto read_csr(std::string file_path, bool one_indexed = true) { + auto m = __detail::read_csr_matrix_base(file_path, one_indexed); + auto shape = m.shape(); + auto nnz = m.size(); + auto t = + __detail::convert_csr_base_to_csr(m, shape, nnz, std::allocator{}); + + return t; +} +} // namespace dr diff --git a/include/dr/detail/multiply_view.hpp b/include/dr/detail/multiply_view.hpp new file mode 100644 index 0000000000..28d124ba50 --- /dev/null +++ b/include/dr/detail/multiply_view.hpp @@ -0,0 +1,144 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +#include +#include +#include + +#include +#include + +namespace dr::__detail { + +template class multiply_iterator { +public: + using value_type = std::iter_value_t; + using difference_type = long long; + using iterator = multiply_iterator; + using reference = value_type; + + using pointer = iterator; + + using iterator_category = std::random_access_iterator_tag; + + multiply_iterator(Iter iter, std::size_t len, long long pos) noexcept + : iter_(iter), len_(len), pos_(pos) {} + multiply_iterator() noexcept = default; + ~multiply_iterator() noexcept = default; + multiply_iterator(const multiply_iterator &) noexcept = default; + multiply_iterator &operator=(const multiply_iterator &) noexcept = default; + + bool operator==(const multiply_iterator &other) const noexcept { + return iter_ == other.iter_ && pos_ == other.pos_ && len_ == other.len_; + } + + bool operator!=(const multiply_iterator &other) const noexcept { + return iter_ != other.iter_ || pos_ != other.pos_ || len_ != other.len_; + } + + iterator operator+(difference_type offset) const noexcept { + return iterator(iter_, len_, pos_ + offset); + } + + iterator operator-(difference_type offset) const noexcept { + return iterator(iter_, len_, pos_ + offset); + } + + difference_type operator-(iterator other) const noexcept { + return pos_ - other.pos_; + } + + bool operator<(iterator other) const noexcept { return pos_ < other.pos_; } + + bool operator>(iterator other) const noexcept { return pos_ > other.pos_; } + + bool operator<=(iterator other) const noexcept { return pos_ <= other.pos_; } + + bool operator>=(iterator other) const noexcept { return pos_ >= other.pos_; } + + iterator &operator++() noexcept { + ++pos_; + return *this; + } + + iterator operator++(int) noexcept { + iterator other = *this; + ++(*this); + return other; + } + + iterator &operator--() noexcept { + --pos_; + return *this; + } + + iterator operator--(int) noexcept { + iterator other = *this; + --(*this); + return other; + } + + iterator &operator+=(difference_type offset) noexcept { + pos_ += offset; + return *this; + } + + iterator &operator-=(difference_type offset) noexcept { + pos_ -= offset; + return *this; + } + + reference operator*() const noexcept { return *(iter_ + (pos_ % len_)); } + + reference operator[](difference_type offset) const noexcept { + return *(*this + offset); + } + + friend iterator operator+(difference_type n, iterator iter) { + return iter.pos_ + n; + } + + auto local() const + requires(dr::ranges::__detail::has_local) + { + auto iter = dr::ranges::__detail::local(iter_); + return multiply_iterator(std::move(iter), len_, pos_); + } + +private: + Iter iter_; + std::size_t len_; + long long pos_; +}; + +template + requires(rng::sized_range) +class multiply_view : public rng::view_interface> { +public: + template + multiply_view(R &&r, std::size_t n) + : base_(rng::views::all(std::forward(r))), n_(n) {} + + auto begin() const { + return multiply_iterator(rng::begin(base_), base_.size(), 0); + } + + auto end() const { + return multiply_iterator(rng::begin(base_), base_.size(), + n_ * base_.size()); + } + + auto size() const { return rng::size(base_); } + +private: + V base_; + std::size_t n_; +}; + +template +multiply_view(R &&r, std::size_t n) -> multiply_view>; + +} // namespace dr::__detail diff --git a/include/dr/mp.hpp b/include/dr/mp.hpp index f9598bbcd8..35c98eb8f7 100644 --- a/include/dr/mp.hpp +++ b/include/dr/mp.hpp @@ -48,6 +48,10 @@ #include #include #include +#include +#include +#include +#include #include #include @@ -77,5 +81,11 @@ #include #include #include +#include #include +#include +#include +#include #include +#include +#include diff --git a/include/dr/mp/algorithms/equal.hpp b/include/dr/mp/algorithms/equal.hpp index 3901a10483..b2b6e278fc 100644 --- a/include/dr/mp/algorithms/equal.hpp +++ b/include/dr/mp/algorithms/equal.hpp @@ -26,11 +26,7 @@ bool equal(std::size_t root, bool root_provided, R1 &&r1, R2 &&r2) { }; auto zipped_views = views::zip(r1, r2); - - // we are using mp::transform instead of mp::views::transform due to - // compilation error refer to DRA-192 and test/gtest/mp/reduce.cpp - mp::distributed_vector compared(rng::distance(r1)); - mp::transform(zipped_views, compared.begin(), compare); + auto compared = dr::mp::views::transform(zipped_views, compare); auto min = [](double x, double y) { return std::min(x, y); }; if (root_provided) { diff --git a/include/dr/mp/algorithms/matrix/gemv.hpp b/include/dr/mp/algorithms/matrix/gemv.hpp new file mode 100644 index 0000000000..14c61b41d2 --- /dev/null +++ b/include/dr/mp/algorithms/matrix/gemv.hpp @@ -0,0 +1,73 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +namespace dr::mp { + +template C, typename Alloc, + typename Backend, typename MatDistr> + requires(vector_multiplicable) +void gemv(int root, C &res, + distributed_sparse_matrix &a, + broadcasted_vector b) { + if (default_comm().rank() == root) { + assert(a.shape().first == res.size()); + assert(a.shape().second == b.size()); + } + a.local_gemv_and_collect(root, res, b.broadcasted_data(), 1); +} + +template C, typename Alloc, + typename Backend, typename MatDistr> + requires(vector_multiplicable) +void gemv(int root, C &res, + distributed_sparse_matrix &a, + broadcasted_slim_matrix b) { + if (default_comm().rank() == root) { + assert(a.shape().first * b.width() == res.size()); + } + a.local_gemv_and_collect(root, res, b.broadcasted_data(), b.width()); +} + +template C, typename Alloc, + typename Backend, typename MatDistr> + requires(vector_multiplicable) +void gemv(C &res, distributed_sparse_matrix &a, + broadcasted_vector b) { + std::vector workspace(res.size()); + gemv(0, workspace, a, b); + auto tmp = new T[res.size()]; + if (default_comm().rank() == 0) { + std::copy(workspace.begin(), workspace.end(), tmp); + } + default_comm().bcast(tmp, sizeof(T) * res.size(), 0); + std::copy(tmp, tmp + res.size(), res.begin()); + delete[] tmp; +} + +template C, typename Alloc, + typename Backend, typename MatDistr> + requires(vector_multiplicable) +void gemv(C &res, distributed_sparse_matrix &a, + broadcasted_slim_matrix b) { + std::vector workspace(res.size()); + gemv(0, workspace, a, b); + auto tmp = new T[res.size()]; + if (default_comm().rank() == 0) { + std::copy(workspace.begin(), workspace.end(), tmp); + } + default_comm().bcast(tmp, sizeof(T) * res.size(), 0); + std::copy(tmp, tmp + res.size(), res.begin()); + delete[] tmp; +} + +} // namespace dr::mp diff --git a/include/dr/mp/algorithms/reduce.hpp b/include/dr/mp/algorithms/reduce.hpp index 839dc83a4a..dd2a16dd68 100644 --- a/include/dr/mp/algorithms/reduce.hpp +++ b/include/dr/mp/algorithms/reduce.hpp @@ -38,7 +38,14 @@ inline auto dpl_reduce(rng::forward_range auto &&r, auto &&binary_op) { return std::reduce(dpl_policy(), dr::__detail::direct_iterator(rng::begin(r) + 1), dr::__detail::direct_iterator(rng::end(r)), - sycl_get(*rng::begin(r)), binary_op); + sycl_get_deref(rng::begin(r)), binary_op); + // We are not using below code, because we don't want to dereference + // rng::begin(r) beyond SYCL environment - the * operator may require + // complex operation that relies on GPU memory access (for example + // transform view iterator) return std::reduce(dpl_policy(), + // dr::__detail::direct_iterator(rng::begin(r) + 1), + // dr::__detail::direct_iterator(rng::end(r)), + // sycl_get(*rng::begin(r)), binary_op); } } #else diff --git a/include/dr/mp/containers/broadcasted_slim_matrix.hpp b/include/dr/mp/containers/broadcasted_slim_matrix.hpp new file mode 100644 index 0000000000..ed16695ae4 --- /dev/null +++ b/include/dr/mp/containers/broadcasted_slim_matrix.hpp @@ -0,0 +1,83 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +namespace dr::mp { + +template > +class broadcasted_slim_matrix { +public: + broadcasted_slim_matrix() = default; + + void broadcast_data(std::size_t height, std::size_t width, std::size_t root, + T **root_data, dr::communicator comm) { + if (_data != nullptr) { + destroy_data(); + } + _data_size = height * width; + _height = height; + _width = width; + _data = alloc.allocate(_data_size); + if (comm.rank() == root) { + for (auto i = 0; i < width; i++) { + if (use_sycl()) { + __detail::sycl_copy(root_data[i], root_data[i] + height, + _data + height * i); + } else { + rng::copy(root_data[i], root_data[i] + height, _data + height * i); + } + } + } + comm.bcast(_data, sizeof(T) * _data_size, root); + } + + template + void broadcast_data(std::size_t height, std::size_t width, std::size_t root, + R root_data, dr::communicator comm) { + if (_data != nullptr) { + destroy_data(); + } + _data_size = height * width; + _height = height; + _width = width; + _data = alloc.allocate(_data_size); + if (comm.rank() == root) { + if (use_sycl()) { + __detail::sycl_copy(std::to_address(root_data.begin()), + std::to_address(root_data.end()), _data); + } else { + rng::copy(root_data.begin(), root_data.end(), _data); + } + } + std::size_t position = 0; + std::size_t reminder = sizeof(T) * _data_size; + while (reminder > INT_MAX) { + comm.bcast(((uint8_t *)_data) + position, INT_MAX, root); + position += INT_MAX; + reminder -= INT_MAX; + } + comm.bcast(((uint8_t *)_data) + position, reminder, root); + } + + void destroy_data() { + alloc.deallocate(_data, _data_size); + _data_size = 0; + _data = nullptr; + } + + T *operator[](std::size_t index) { return _data + _height * index; } + + T *broadcasted_data() { return _data; } + auto width() { return _width; } + +private: + T *_data = nullptr; + std::size_t _data_size = 0; + std::size_t _width = 0; + std::size_t _height = 0; + + Allocator alloc; +}; +} // namespace dr::mp diff --git a/include/dr/mp/containers/broadcasted_vector.hpp b/include/dr/mp/containers/broadcasted_vector.hpp new file mode 100644 index 0000000000..6058067563 --- /dev/null +++ b/include/dr/mp/containers/broadcasted_vector.hpp @@ -0,0 +1,53 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +namespace dr::mp { + +template > +class broadcasted_vector { +public: + broadcasted_vector() = default; + + template + void broadcast_data(std::size_t data_size, std::size_t root, R root_data, + dr::communicator comm) { + if (_data != nullptr) { + destroy_data(); + } + _data_size = data_size; + _data = alloc.allocate(_data_size); + if (comm.rank() == root) { + if (use_sycl()) { + __detail::sycl_copy(std::to_address(root_data.begin()), + std::to_address(root_data.end()), _data); + } else { + rng::copy(root_data.begin(), root_data.end(), _data); + } + } + comm.bcast(_data, sizeof(T) * _data_size, root); + } + + void destroy_data() { + alloc.deallocate(_data, _data_size); + _data_size = 0; + _data = nullptr; + } + + T &operator[](std::size_t index) { return _data[index]; } + + T *broadcasted_data() { return _data; } + + auto size() { return _data_size; } + + auto begin() const { return _data; } + auto end() const { return begin() + _data_size; } + +private: + T *_data = nullptr; + std::size_t _data_size = 0; + Allocator alloc; +}; +} // namespace dr::mp diff --git a/include/dr/mp/containers/distributed_sparse_matrix.hpp b/include/dr/mp/containers/distributed_sparse_matrix.hpp new file mode 100644 index 0000000000..f03a5d09e6 --- /dev/null +++ b/include/dr/mp/containers/distributed_sparse_matrix.hpp @@ -0,0 +1,163 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause +#pragma once +#include +#include +#include + +namespace dr::mp { +template +concept matrix_distibution = requires(T t, std::vector res, int *input) { + { t.fence() } -> std::same_as; + { t.segments() } -> rng::random_access_range; + { t.shape().first } -> std::convertible_to; + { t.shape().second } -> std::convertible_to; + { t.nnz() } -> std::same_as; + { t.get_segment_from_offset(int()) } -> std::same_as; + { t.get_id_in_segment(int()) } -> std::same_as; + T(dr::views::csr_matrix_view(), + distribution()); +}; + +template +concept vector_multiplicable = + requires(T t, std::vector res, T::elem_type *input) { + t.local_gemv_and_collect(int(), res, input, 1); + }; + +template > + requires(matrix_distibution) +class distributed_sparse_matrix { + +public: + using value_type = dr::matrix_entry; + using elem_type = T; + using index_type = I; + using key_type = dr::index; + using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + using backend_type = BackendT; + + class iterator { + public: + using iterator_category = std::random_access_iterator_tag; + using value_type = typename distributed_sparse_matrix::value_type; + using difference_type = typename distributed_sparse_matrix::difference_type; + + iterator() {} + iterator(const distributed_sparse_matrix *parent, difference_type offset) + : parent_(parent), offset_(offset) {} + + auto operator+(difference_type n) const { + return iterator(parent_, offset_ + n); + } + friend auto operator+(difference_type n, const iterator &other) { + return other + n; + } + auto operator-(difference_type n) const { + return iterator(parent_, offset_ - n); + } + auto operator-(iterator other) const { return offset_ - other.offset_; } + + auto &operator+=(difference_type n) { + offset_ += n; + return *this; + } + auto &operator-=(difference_type n) { + offset_ -= n; + return *this; + } + auto &operator++() { + offset_++; + return *this; + } + auto operator++(int) { + auto old = *this; + offset_++; + return old; + } + auto &operator--() { + offset_--; + return *this; + } + auto operator--(int) { + auto old = *this; + offset_--; + return old; + } + + bool operator==(iterator other) const { + if (parent_ == nullptr || other.parent_ == nullptr) { + return false; + } else { + return offset_ == other.offset_; + } + } + auto operator<=>(iterator other) const { + assert(parent_ == other.parent_); + return offset_ <=> other.offset_; + } + + auto operator*() const { + auto segment_id = parent_->distribution_.get_segment_from_offset(offset_); + auto id_in_segment = parent_->distribution_.get_id_in_segment(offset_); + return parent_->segments()[segment_id][id_in_segment]; + } + auto operator[](difference_type n) const { return *(*this + n); } + + auto local() { + auto segment_id = parent_->distribution_.get_segment_from_offset(offset_); + auto id_in_segment = parent_->distribution_.get_id_in_segment(offset_); + return (parent_->segments()[segment_id].begin() + id_in_segment).local(); + } + + auto segments() { + return dr::__detail::drop_segments(parent_->segments(), offset_); + } + + private: + const distributed_sparse_matrix *parent_ = nullptr; + difference_type offset_; + }; + + distributed_sparse_matrix(const distributed_sparse_matrix &) = delete; + distributed_sparse_matrix & + operator=(const distributed_sparse_matrix &) = delete; + distributed_sparse_matrix(distributed_sparse_matrix &&) { assert(false); } + + /// Constructor + distributed_sparse_matrix(dr::views::csr_matrix_view csr_view, + std::size_t root = 0, + distribution dist = distribution()) + : distribution_(csr_view, dist, root) {} + + /// Returns iterator to beginning + auto begin() const { return iterator(this, 0); } + /// Returns iterator to end + auto end() const { return begin() + distribution_.nnz(); } + + /// Returns size + auto size() const { return distribution_.nnz(); } + + auto shape() const { return distribution_.shape(); } + /// Returns reference using index + auto operator[](difference_type n) const { return *(begin() + n); } + // auto &halo() const { return *halo_; } TODO + + auto segments() const { return distribution_.segments(); } + + void fence() { distribution_.fence(); } + + template + requires(vector_multiplicable) + auto local_gemv_and_collect(std::size_t root, C &res, T *vals, + std::size_t val_width) const { + distribution_.local_gemv_and_collect(root, res, vals, val_width); + } + +private: + MatrixDistrT distribution_; +}; +} // namespace dr::mp diff --git a/include/dr/mp/containers/distributed_vector.hpp b/include/dr/mp/containers/distributed_vector.hpp index 2611963064..d63c4c084f 100644 --- a/include/dr/mp/containers/distributed_vector.hpp +++ b/include/dr/mp/containers/distributed_vector.hpp @@ -63,8 +63,20 @@ class MpiBackend { #if (MPI_VERSION >= 4) || \ (defined(I_MPI_NUMVERSION) && (I_MPI_NUMVERSION > 20211200000)) - // 64-bit API inside - win_.put(src, datalen, segment_index, offset); + if (mp::use_sycl()) { + // 32-bit API inside for sycl based buffers + for (std::size_t remainder = datalen, off = 0UL; remainder > 0;) { + std::size_t s = std::min(remainder, (std::size_t)INT_MAX); + DRLOG("{}:{} win_.put {} bytes at off {}, dst offset {}", + default_comm().rank(), __LINE__, s, off, offset + off); + win_.put((uint8_t *)src + off, s, segment_index, offset + off); + off += s; + remainder -= s; + } + } else { + // 64-bit API inside + win_.put(src, datalen, segment_index, offset); + } #else for (std::size_t remainder = datalen, off = 0UL; remainder > 0;) { std::size_t s = std::min(remainder, (std::size_t)INT_MAX); @@ -77,9 +89,9 @@ class MpiBackend { #endif } - std::size_t getrank() { return win_.communicator().rank(); } + std::size_t getrank() const { return win_.communicator().rank(); } - void fence() { win_.fence(); } + void fence() const { win_.fence(); } }; #ifdef DRISHMEM @@ -121,13 +133,13 @@ class IshmemBackend { ishmem_putmem(dst, src, datalen, segment_index); } - std::size_t getrank() { + std::size_t getrank() const { auto my_process_segment_index = ishmem_my_pe(); DRLOG("called ishmem_my_pe() -> {}", my_process_segment_index); return my_process_segment_index; } - void fence() { + void fence() const { // TODO: to have locality use ishmemx_fence_work_group ishmem_fence(); } @@ -276,6 +288,12 @@ template class distributed_vector { void fence() { backend.fence(); } + auto segment_size() const { return segment_size_; } + + auto get_segment_offset(std::size_t segment_id) const { + return segment_id * segment_size_; + } + private: void init(auto size, auto dist) { size_ = size; diff --git a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp new file mode 100644 index 0000000000..aeb0461115 --- /dev/null +++ b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp @@ -0,0 +1,390 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause +#pragma once +#include +#include +#include +#include + +namespace dr::mp { + +template +class csr_eq_distribution { + using view_tuple = std::tuple; + +public: + using value_type = dr::matrix_entry; + using segment_type = csr_eq_segment; + using elem_type = T; + using index_type = I; + using difference_type = std::ptrdiff_t; + + csr_eq_distribution(const csr_eq_distribution &) = delete; + csr_eq_distribution &operator=(const csr_eq_distribution &) = delete; + csr_eq_distribution(csr_eq_distribution &&) { assert(false); } + + csr_eq_distribution(dr::views::csr_matrix_view csr_view, + distribution dist = distribution(), + std::size_t root = 0) { + init(csr_view, dist, root); + } + + ~csr_eq_distribution() { + if (!finalized()) { + fence(); + if (rows_data_ != nullptr) { + rows_backend_.deallocate(rows_data_, row_size_ * sizeof(index_type)); + tuple_alloc.deallocate(view_helper_const, 1); + } + } + } + std::size_t get_id_in_segment(std::size_t offset) const { + return offset % segment_size_; + } + std::size_t get_segment_from_offset(std::size_t offset) const { + return offset / segment_size_; + } + auto segments() const { return rng::views::all(segments_); } + auto nnz() const { return nnz_; } + auto shape() const { return shape_; } + void fence() const { rows_backend_.fence(); } + + template + auto local_gemv(C &res, T *vals, std::size_t vals_width) const { + auto rank = rows_backend_.getrank(); + if (nnz_ <= segment_size_ * rank) { + return; + } + auto vals_len = shape_[1]; + auto size = row_sizes_[rank]; + auto res_col_len = row_sizes_[default_comm().rank()]; + if (dr::mp::use_sycl()) { + auto localVals = dr::__detail::direct_iterator( + dr::mp::local_segment(*vals_data_).begin()); + auto localCols = dr::__detail::direct_iterator( + dr::mp::local_segment(*cols_data_).begin()); + auto offset = rank * segment_size_; + auto real_segment_size = + std::min(nnz_ - rank * segment_size_, segment_size_); + auto local_data = rows_data_; + auto division = std::max(1ul, real_segment_size / 50); + auto one_computation_size = (real_segment_size + division - 1) / division; + auto row_size = row_size_; + dr::__detail::parallel_for_workaround( + dr::mp::sycl_queue(), sycl::range<1>{division}, + [=](auto idx) { + std::size_t lower_bound = one_computation_size * idx; + std::size_t upper_bound = + std::min(one_computation_size * (idx + 1), real_segment_size); + std::size_t position = lower_bound + offset; + std::size_t first_row = rng::distance( + local_data, + std::upper_bound(local_data, local_data + row_size, position) - + 1); + for (auto j = 0; j < vals_width; j++) { + auto row = first_row; + T sum = 0; + + for (auto i = lower_bound; i < upper_bound; i++) { + while (row + 1 < row_size && + local_data[row + 1] <= offset + i) { + sycl::atomic_ref + c_ref(res[row + j * res_col_len]); + c_ref += sum; + row++; + sum = 0; + } + auto colNum = localCols[i] + j * vals_len; + auto matrixVal = vals[colNum]; + auto vectorVal = localVals[i]; + + sum += matrixVal * vectorVal; + } + sycl::atomic_ref + c_ref(res[row + j * res_col_len]); + c_ref += sum; + } + }) + .wait(); + } else { + auto row_i = -1; + auto position = segment_size_ * rank; + auto elem_count = std::min(segment_size_, nnz_ - segment_size_ * rank); + auto current_row_position = rows_data_[0]; + auto local_vals = dr::mp::local_segment(*vals_data_); + auto local_cols = dr::mp::local_segment(*cols_data_); + + for (int i = 0; i < elem_count; i++) { + while (row_i + 1 < size && position + i >= current_row_position) { + row_i++; + current_row_position = rows_data_[row_i + 1]; + } + for (int j = 0; j < vals_width; j++) { + res[row_i + j * res_col_len] += + local_vals[i] * vals[local_cols[i] + j * vals_len]; + } + } + } + } + + template + auto local_gemv_and_collect(std::size_t root, C &res, T *vals, + std::size_t vals_width) const { + assert(res.size() == shape_.first * vals_width); + __detail::allocator alloc; + auto res_alloc = + alloc.allocate(row_sizes_[default_comm().rank()] * vals_width); + if (use_sycl()) { + sycl_queue() + .fill(res_alloc, 0, row_sizes_[default_comm().rank()] * vals_width) + .wait(); + } else { + std::fill(res_alloc, + res_alloc + row_sizes_[default_comm().rank()] * vals_width, 0); + } + + local_gemv(res_alloc, vals, vals_width); + gather_gemv_vector(root, res, res_alloc, vals_width); + fence(); + alloc.deallocate(res_alloc, row_sizes_[default_comm().rank()] * vals_width); + } + +private: + friend csr_eq_segment_iterator; + + template + void gather_gemv_vector(std::size_t root, C &res, A &partial_res, + std::size_t vals_width) const { + auto communicator = default_comm(); + __detail::allocator alloc; + long long *counts = new long long[communicator.size()]; + for (auto i = 0; i < communicator.size(); i++) { + counts[i] = row_sizes_[i] * sizeof(T) * vals_width; + } + + if (communicator.rank() == root) { + long *offsets = new long[communicator.size()]; + offsets[0] = 0; + for (auto i = 0; i < communicator.size() - 1; i++) { + offsets[i + 1] = offsets[i] + counts[i]; + } + auto gathered_res = alloc.allocate(max_row_size_ * vals_width); + communicator.gatherv(partial_res, counts, offsets, gathered_res, root); + T *gathered_res_host; + + if (use_sycl()) { + gathered_res_host = new T[max_row_size_ * vals_width]; + __detail::sycl_copy(gathered_res, gathered_res_host, + max_row_size_ * vals_width); + } else { + gathered_res_host = gathered_res; + } + rng::fill(res, 0); + + for (auto k = 0; k < vals_width; k++) { + auto current_offset = 0; + for (auto i = 0; i < communicator.size(); i++) { + auto first_row = row_offsets_[i]; + auto last_row = row_offsets_[i] + row_sizes_[i]; + auto row_size = row_sizes_[i]; + if (first_row < last_row) { + res[first_row + k * shape_[0]] += + gathered_res_host[vals_width * current_offset + k * row_size]; + } + if (first_row < last_row - 1) { + auto piece_start = gathered_res_host + vals_width * current_offset + + k * row_size + 1; + std::copy(piece_start, piece_start + last_row - first_row - 1, + res.begin() + first_row + k * shape_[0] + 1); + } + current_offset += row_sizes_[i]; + } + } + + if (use_sycl()) { + delete[] gathered_res_host; + } + delete[] offsets; + alloc.deallocate(gathered_res, max_row_size_ * vals_width); + } else { + communicator.gatherv(partial_res, counts, nullptr, nullptr, root); + } + delete[] counts; + } + + std::size_t get_row_size(std::size_t rank) { return row_sizes_[rank]; } + + void init(dr::views::csr_matrix_view csr_view, auto dist, + std::size_t root) { + distribution_ = dist; + auto rank = rows_backend_.getrank(); + + std::size_t initial_data[3]; + if (root == rank) { + initial_data[0] = csr_view.size(); + initial_data[1] = csr_view.shape().first; + initial_data[2] = csr_view.shape().second; + default_comm().bcast(initial_data, sizeof(std::size_t) * 3, root); + } else { + default_comm().bcast(initial_data, sizeof(std::size_t) * 3, root); + } + + nnz_ = initial_data[0]; + shape_ = {initial_data[1], initial_data[2]}; + vals_data_ = std::make_shared>(nnz_); + cols_data_ = std::make_shared>(nnz_); + dr::mp::copy(root, + std::ranges::subrange(csr_view.values_data(), + csr_view.values_data() + nnz_), + vals_data_->begin()); + dr::mp::copy(root, + std::ranges::subrange(csr_view.colind_data(), + csr_view.colind_data() + nnz_), + cols_data_->begin()); + + auto row_info_size = default_comm().size() * 2 + 1; + __detail::allocator alloc; + std::size_t *row_information = new std::size_t[row_info_size]; + row_offsets_.reserve(default_comm().size()); + row_sizes_.reserve(default_comm().size()); + if (root == default_comm().rank()) { + for (int i = 0; i < default_comm().size(); i++) { + auto first_index = vals_data_->get_segment_offset(i); + auto last_index = vals_data_->get_segment_offset(i + 1) - 1; + auto lower_limit = + rng::distance(csr_view.rowptr_data(), + std::upper_bound(csr_view.rowptr_data(), + csr_view.rowptr_data() + shape_[0], + first_index)) - + 1; + auto higher_limit = rng::distance( + csr_view.rowptr_data(), + std::upper_bound(csr_view.rowptr_data(), + csr_view.rowptr_data() + shape_[0], last_index)); + row_offsets_.push_back(lower_limit); + row_sizes_.push_back(higher_limit - lower_limit); + row_information[i] = lower_limit; + row_information[default_comm().size() + i] = higher_limit - lower_limit; + max_row_size_ = max_row_size_ + row_sizes_.back(); + } + row_information[default_comm().size() * 2] = max_row_size_; + default_comm().bcast(row_information, sizeof(std::size_t) * row_info_size, + root); + } else { + default_comm().bcast(row_information, sizeof(std::size_t) * row_info_size, + root); + for (int i = 0; i < default_comm().size(); i++) { + row_offsets_.push_back(row_information[i]); + row_sizes_.push_back(row_information[default_comm().size() + i]); + } + max_row_size_ = row_information[default_comm().size() * 2]; + } + delete[] row_information; + row_size_ = std::max(row_sizes_[rank], static_cast(1)); + rows_data_ = + static_cast(rows_backend_.allocate(row_size_ * sizeof(I))); + + fence(); + if (rank == root) { + for (std::size_t i = 0; i < default_comm().size(); i++) { + auto lower_limit = row_offsets_[i]; + auto row_size = row_sizes_[i]; + if (row_size > 0) { + rows_backend_.putmem(csr_view.rowptr_data() + lower_limit, 0, + row_size * sizeof(I), i); + } + } + } + + std::size_t segment_index = 0; + segment_size_ = vals_data_->segment_size(); + assert(segment_size_ == cols_data_->segment_size()); + for (std::size_t i = 0; i < nnz_; i += segment_size_) { + segments_.emplace_back(this, segment_index++, + std::min(segment_size_, nnz_ - i), segment_size_); + } + auto local_rows = rows_data_; + auto real_val_size = std::min(vals_data_->segment_size(), + nnz_ - vals_data_->segment_size() * rank); + auto my_tuple = std::make_tuple(row_size_, row_offsets_[rank], + segment_size_ * rank, local_rows); + view_helper_const = tuple_alloc.allocate(1); + + if (use_sycl()) { + sycl_queue() + .memcpy(view_helper_const, &my_tuple, sizeof(view_tuple)) + .wait(); + } else { + view_helper_const[0] = my_tuple; + } + + auto local_cols = static_cast(nullptr); + auto local_vals = static_cast(nullptr); + if (cols_data_->segments().size() > rank) { + local_cols = cols_data_->segments()[rank].begin().local(); + local_vals = vals_data_->segments()[rank].begin().local(); + local_view = std::make_shared(get_elem_view( + real_val_size, view_helper_const, local_cols, local_vals, rank)); + } + fence(); + } + + static auto get_elem_view(std::size_t vals_size, view_tuple *helper_tuple, + index_type *local_cols, elem_type *local_vals, + std::size_t rank) { + auto local_vals_range = rng::subrange(local_vals, local_vals + vals_size); + auto local_cols_range = rng::subrange(local_cols, local_cols + vals_size); + auto zipped_results = rng::views::zip(local_vals_range, local_cols_range); + auto enumerated_zipped = rng::views::enumerate(zipped_results); + // we need to use multiply_view here, + // because lambda is not properly copied to sycl environment + // when we use variable capture + auto multiply_range = dr::__detail::multiply_view( + rng::subrange(helper_tuple, helper_tuple + 1), vals_size); + auto enumerted_with_data = + rng::views::zip(enumerated_zipped, multiply_range); + + auto transformer = [=](auto x) { + auto [entry, tuple] = x; + auto [row_size, row_offset, offset, local_rows] = tuple; + auto [index, pair] = entry; + auto [val, column] = pair; + auto row = + rng::distance(local_rows, + std::upper_bound(local_rows, local_rows + row_size, + offset + index) - + 1) + + row_offset; + dr::index index_obj(row, column); + value_type entry_obj(index_obj, val); + return entry_obj; + }; + return rng::transform_view(enumerted_with_data, std::move(transformer)); + } + + using view_type = decltype(get_elem_view(0, nullptr, nullptr, nullptr, 0)); + + dr::mp::__detail::allocator tuple_alloc; + view_tuple *view_helper_const; + std::shared_ptr local_view = nullptr; + + std::size_t segment_size_ = 0; + std::size_t row_size_ = 0; + std::size_t max_row_size_ = 0; + std::vector row_offsets_; + std::vector row_sizes_; + + index_type *rows_data_ = nullptr; + BackendT rows_backend_; + + distribution distribution_; + dr::index shape_; + std::size_t nnz_; + std::vector segments_; + std::shared_ptr> vals_data_; + std::shared_ptr> cols_data_; +}; +} // namespace dr::mp diff --git a/include/dr/mp/containers/matrix_formats/csr_eq_segment.hpp b/include/dr/mp/containers/matrix_formats/csr_eq_segment.hpp new file mode 100644 index 0000000000..2caa8af843 --- /dev/null +++ b/include/dr/mp/containers/matrix_formats/csr_eq_segment.hpp @@ -0,0 +1,304 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +namespace dr::mp { + +template class csr_eq_segment_iterator; + +template class csr_eq_segment_reference { + using iterator = csr_eq_segment_iterator; + +public: + using value_type = typename DSM::value_type; + using index_type = typename DSM::index_type; + using elem_type = typename DSM::elem_type; + + csr_eq_segment_reference(const iterator it) : iterator_(it) {} + + operator value_type() const { return iterator_.get(); } + operator std::pair, elem_type>() const { + return iterator_.get(); + } + + template auto get() const noexcept { + if constexpr (Index == 0) { + return iterator_.get_index(); + } + if constexpr (Index == 1) { + return iterator_.get_value(); + } + } + + auto operator=(const value_type &value) const { + iterator_.put(value); + return *this; + } + auto operator=(const csr_eq_segment_reference &other) const { + *this = value_type(other); + return *this; + } + auto operator&() const { return iterator_; } + +private: + const iterator iterator_; +}; // csr_eq_segment_reference + +template class csr_eq_segment_iterator { +public: + using value_type = typename DSM::value_type; + using index_type = typename DSM::index_type; + using elem_type = typename DSM::elem_type; + using difference_type = typename DSM::difference_type; + + csr_eq_segment_iterator() = default; + csr_eq_segment_iterator(DSM *dsm, std::size_t segment_index, + std::size_t index) { + dsm_ = dsm; + segment_index_ = segment_index; + index_ = index; + } + + auto operator<=>(const csr_eq_segment_iterator &other) const noexcept { + // assertion below checks against compare dereferenceable iterator to a + // singular iterator and against attempt to compare iterators from different + // sequences like _Safe_iterator does + assert(dsm_ == other.dsm_); + return segment_index_ == other.segment_index_ + ? index_ <=> other.index_ + : segment_index_ <=> other.segment_index_; + } + + // Comparison + bool operator==(const csr_eq_segment_iterator &other) const noexcept { + return (*this <=> other) == 0; + } + + // Only this arithmetic manipulate internal state + auto &operator+=(difference_type n) { + assert(dsm_ != nullptr); + assert(n >= 0 || static_cast(index_) >= -n); + index_ += n; + return *this; + } + + auto &operator-=(difference_type n) { return *this += (-n); } + + difference_type + operator-(const csr_eq_segment_iterator &other) const noexcept { + assert(dsm_ != nullptr && dsm_ == other.dsm_); + assert(index_ >= other.index_); + return index_ - other.index_; + } + + // prefix + auto &operator++() { + *this += 1; + return *this; + } + auto &operator--() { + *this -= 1; + return *this; + } + + // postfix + auto operator++(int) { + auto prev = *this; + *this += 1; + return prev; + } + auto operator--(int) { + auto prev = *this; + *this -= 1; + return prev; + } + + auto operator+(difference_type n) const { + auto p = *this; + p += n; + return p; + } + auto operator-(difference_type n) const { + auto p = *this; + p -= n; + return p; + } + + // When *this is not first in the expression + friend auto operator+(difference_type n, + const csr_eq_segment_iterator &other) { + return other + n; + } + + // dereference + auto operator*() const { + assert(dsm_ != nullptr); + return csr_eq_segment_reference{*this}; + } + auto operator[](difference_type n) const { + assert(dsm_ != nullptr); + return *(*this + n); + } + + void get(value_type *dst, std::size_t size) const { + auto elems = new elem_type[size]; + auto indexes = new dr::index[size]; + get_value(elems, size); + get_index(indexes, size); + for (std::size_t i = 0; i < size; i++) { + *(dst + i) = {indexes[i], elems[i]}; + } + } + + value_type get() const { + value_type val; + get(&val, 1); + return val; + } + + void get_value(elem_type *dst, std::size_t size) const { + assert(dsm_ != nullptr); + assert(segment_index_ * dsm_->segment_size_ + index_ < dsm_->nnz_); + (dsm_->vals_data_->segments()[segment_index_].begin() + index_) + .get(dst, size); + } + + elem_type get_value() const { + elem_type val; + get_value(&val, 1); + return val; + } + + void get_index(dr::index *dst, std::size_t size) const { + assert(dsm_ != nullptr); + assert(segment_index_ * dsm_->segment_size_ + index_ < dsm_->nnz_); + auto col_data = new index_type[size]; + (dsm_->cols_data_->segments()[segment_index_].begin() + index_) + .get(col_data, size); + index_type *rows; + std::size_t rows_length = dsm_->get_row_size(segment_index_); + + if (rank() == dsm_->rows_backend_.getrank()) { + rows = dsm_->rows_data_; + } else { + rows = new index_type[rows_length]; + dsm_->rows_backend_.getmem(rows, 0, rows_length * sizeof(index_type), + segment_index_); + } + auto position = + dsm_->cols_data_->get_segment_offset(segment_index_) + index_; + auto rows_iter = rows + 1; + auto cols_iter = col_data; + auto iter = dst; + std::size_t current_row = dsm_->row_offsets_[segment_index_]; + std::size_t last_row = current_row + rows_length - 1; + for (int i = 0; i < size; i++) { + while (current_row < last_row && *rows_iter <= position + i) { + rows_iter++; + current_row++; + } + iter->first = current_row; + iter->second = *cols_iter; + cols_iter++; + iter++; + } + if (rank() != dsm_->rows_backend_.getrank()) { + delete[] rows; + } + delete[] col_data; + } + + dr::index get_index() const { + dr::index val; + get_index(&val, 1); + return val; + } + + void put(const value_type *dst, std::size_t size) const { + assert(dsm_ != nullptr); + assert(segment_index_ * dsm_->segment_size_ + index_ < dsm_->nnz_); + (dsm_->vals_data_->segments()[segment_index_].begin() + index_) + .put(dst, size); + } + + void put(const value_type &value) const { put(&value, 1); } + + auto rank() const { + assert(dsm_ != nullptr); + return segment_index_; + } + + auto segments() const { + assert(dsm_ != nullptr); + return dr::__detail::drop_segments(dsm_->segments(), segment_index_, + index_); + } + + auto local() const { + const auto my_process_segment_index = dsm_->rows_backend_.getrank(); + assert(my_process_segment_index == segment_index_); + if (dsm_->local_view == nullptr) { + throw std::runtime_error("Requesting not existing local segment"); + } + return dsm_->local_view->begin(); + } + +private: + // all fields need to be initialized by default ctor so every default + // constructed iter is equal to any other default constructed iter + DSM *dsm_ = nullptr; + std::size_t segment_index_ = 0; + std::size_t index_ = 0; +}; // csr_eq_segment_iterator + +template class csr_eq_segment { +private: + using iterator = csr_eq_segment_iterator; + +public: + using difference_type = std::ptrdiff_t; + csr_eq_segment() = default; + csr_eq_segment(DSM *dsm, std::size_t segment_index, std::size_t size, + std::size_t reserved) { + dsm_ = dsm; + segment_index_ = segment_index; + size_ = size; + reserved_ = reserved; + assert(dsm_ != nullptr); + } + + auto size() const { + assert(dsm_ != nullptr); + return size_; + } + + auto begin() const { return iterator(dsm_, segment_index_, 0); } + auto end() const { return begin() + size(); } + auto reserved() const { return reserved_; } + + auto operator[](difference_type n) const { return *(begin() + n); } + + bool is_local() const { return segment_index_ == default_comm().rank(); } + +private: + DSM *dsm_ = nullptr; + std::size_t segment_index_; + std::size_t size_; + std::size_t reserved_; +}; // csr_eq_segment + +} // namespace dr::mp + +namespace std { +template +struct tuple_size> + : std::integral_constant {}; + +template +struct tuple_element> + : tuple_element, + typename DSM::elem_type>> {}; + +} // namespace std diff --git a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp new file mode 100644 index 0000000000..70fd7f02d7 --- /dev/null +++ b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp @@ -0,0 +1,376 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause +#pragma once +#include +#include +#include +#include +#include + +namespace dr::mp { +template +class csr_row_distribution { + using view_tuple = std::tuple; + +public: + using value_type = dr::matrix_entry; + using segment_type = csr_row_segment; + using elem_type = T; + using index_type = I; + using difference_type = std::ptrdiff_t; + + csr_row_distribution(const csr_row_distribution &) = delete; + csr_row_distribution &operator=(const csr_row_distribution &) = delete; + csr_row_distribution(csr_row_distribution &&) { assert(false); } + + csr_row_distribution(dr::views::csr_matrix_view csr_view, + distribution dist = distribution(), + std::size_t root = 0) { + init(csr_view, dist, root); + } + + ~csr_row_distribution() { + if (!finalized()) { + fence(); + if (vals_data_ != nullptr) { + vals_backend_.deallocate(vals_data_, vals_size_ * sizeof(index_type)); + cols_backend_.deallocate(cols_data_, vals_size_ * sizeof(index_type)); + alloc.deallocate(view_helper_const, 1); + } + } + } + std::size_t get_id_in_segment(std::size_t offset) const { + assert(offset < nnz_); + auto pos_iter = + std::upper_bound(val_offsets_.begin(), val_offsets_.end(), offset) - 1; + return offset - *pos_iter; + } + std::size_t get_segment_from_offset(std::size_t offset) const { + assert(offset < nnz_); + auto pos_iter = + std::upper_bound(val_offsets_.begin(), val_offsets_.end(), offset); + return rng::distance(val_offsets_.begin(), pos_iter) - 1; + } + auto segments() const { return rng::views::all(segments_); } + auto nnz() const { return nnz_; } + auto shape() const { return shape_; } + void fence() const { + vals_backend_.fence(); + cols_backend_.fence(); + } + template + auto local_gemv(C &res, T *vals, std::size_t vals_width) const { + auto rank = cols_backend_.getrank(); + if (shape_[0] <= segment_size_ * rank) + return; + auto size = std::min(segment_size_, shape_[0] - segment_size_ * rank); + auto vals_len = shape_[1]; + if (dr::mp::use_sycl()) { + auto local_vals = vals_data_; + auto local_cols = cols_data_; + auto offset = val_offsets_[rank]; + auto real_segment_size = std::min(nnz_ - offset, val_sizes_[rank]); + auto rows_data = dr::__detail::direct_iterator( + dr::mp::local_segment(*rows_data_).begin()); + auto res_col_len = segment_size_; + std::size_t wg = 32; + while (vals_width * size * wg > INT_MAX) { + // this check is necessary, because sycl does not permit ranges + // exceeding integer limit + wg /= 2; + } + assert(wg > 0); + dr::mp::sycl_queue() + .submit([&](auto &&h) { + h.parallel_for( + sycl::nd_range<1>(vals_width * size * wg, wg), [=](auto item) { + auto input_j = item.get_group(0) / size; + auto idx = item.get_group(0) % size; + auto local_id = item.get_local_id(); + auto group_size = item.get_local_range(0); + std::size_t lower_bound = 0; + if (rows_data[idx] > offset) { + lower_bound = rows_data[idx] - offset; + } + std::size_t upper_bound = real_segment_size; + if (idx < size - 1) { + upper_bound = rows_data[idx + 1] - offset; + } + T sum = 0; + for (auto i = lower_bound + local_id; i < upper_bound; + i += group_size) { + auto colNum = local_cols[i]; + auto matrixVal = vals[colNum + input_j * vals_len]; + auto vectorVal = local_vals[i]; + sum += matrixVal * vectorVal; + } + + sycl::atomic_ref + c_ref(res[idx + input_j * res_col_len]); + c_ref += sum; + }); + }) + .wait(); + } else { + auto local_rows = dr::mp::local_segment(*rows_data_); + auto val_count = val_sizes_[rank]; + auto row_i = 0; + auto position = val_offsets_[rank]; + auto current_row_position = local_rows[1]; + + for (int i = 0; i < val_count; i++) { + while (row_i + 1 < size && position + i >= current_row_position) { + row_i++; + current_row_position = local_rows[row_i + 1]; + } + for (auto j = 0; j < vals_width; j++) { + res[row_i + j * segment_size_] += + vals_data_[i] * vals[cols_data_[i] + j * vals_len]; + } + } + } + } + + template + auto local_gemv_and_collect(std::size_t root, C &res, T *&vals, + std::size_t vals_width) const { + assert(res.size() == shape_.first * vals_width); + __detail::allocator alloc; + auto res_alloc = alloc.allocate(segment_size_ * vals_width); + if (use_sycl()) { + sycl_queue().fill(res_alloc, 0, segment_size_ * vals_width).wait(); + } else { + std::fill(res_alloc, res_alloc + segment_size_ * vals_width, 0); + } + + local_gemv(res_alloc, vals, vals_width); + + gather_gemv_vector(root, res, res_alloc, vals_width); + fence(); + alloc.deallocate(res_alloc, segment_size_ * vals_width); + } + +private: + friend csr_row_segment_iterator; + + template + void gather_gemv_vector(std::size_t root, C &res, A &partial_res, + std::size_t vals_width) const { + auto communicator = default_comm(); + __detail::allocator alloc; + + if (communicator.rank() == root) { + auto scratch = + alloc.allocate(segment_size_ * communicator.size() * vals_width); + communicator.gather(partial_res, scratch, segment_size_ * vals_width, + root); + T *temp = nullptr; + if (use_sycl()) { + temp = new T[res.size()]; + } + for (auto j = 0; j < communicator.size(); j++) { + if (j * segment_size_ >= shape_.first) { + break; + } + auto comm_segment_size = + std::min(segment_size_, shape_.first - j * segment_size_); + + for (auto i = 0; i < vals_width; i++) { + auto piece_start = + scratch + j * vals_width * segment_size_ + i * segment_size_; + + if (use_sycl()) { + __detail::sycl_copy(piece_start, + temp + shape_.first * i + j * segment_size_, + comm_segment_size); + } else { + std::copy(piece_start, piece_start + comm_segment_size, + res.begin() + shape_.first * i + j * segment_size_); + } + } + } + if (use_sycl()) { + std::copy(temp, temp + res.size(), res.begin()); + delete[] temp; + } + alloc.deallocate(scratch, + segment_size_ * communicator.size() * vals_width); + } else { + communicator.gather(partial_res, static_cast(nullptr), + segment_size_ * vals_width, root); + } + } + void init(dr::views::csr_matrix_view csr_view, auto dist, + std::size_t root) { + distribution_ = dist; + auto rank = vals_backend_.getrank(); + + std::size_t initial_data[3]; + if (root == rank) { + initial_data[0] = csr_view.size(); + initial_data[1] = csr_view.shape().first; + initial_data[2] = csr_view.shape().second; + default_comm().bcast(initial_data, sizeof(std::size_t) * 3, root); + } else { + default_comm().bcast(initial_data, sizeof(std::size_t) * 3, root); + } + + nnz_ = initial_data[0]; + shape_ = {initial_data[1], initial_data[2]}; + + rows_data_ = std::make_shared>(shape_.first); + + dr::mp::copy(root, + std::ranges::subrange(csr_view.rowptr_data(), + csr_view.rowptr_data() + shape_.first), + rows_data_->begin()); + + auto row_info_size = default_comm().size() * 2; + std::size_t *val_information = new std::size_t[row_info_size]; + val_offsets_.reserve(row_info_size); + val_sizes_.reserve(row_info_size); + if (rank == root) { + for (int i = 0; i < default_comm().size(); i++) { + auto first_index = rows_data_->get_segment_offset(i); + if (first_index > shape_.first) { + val_offsets_.push_back(nnz_); + val_sizes_.push_back(0); + continue; + } + std::size_t lower_limit = csr_view.rowptr_data()[first_index]; + std::size_t higher_limit = nnz_; + if (rows_data_->get_segment_offset(i + 1) < shape_.first) { + auto last_index = rows_data_->get_segment_offset(i + 1); + higher_limit = csr_view.rowptr_data()[last_index]; + } + val_offsets_.push_back(lower_limit); + val_sizes_.push_back(higher_limit - lower_limit); + val_information[i] = lower_limit; + val_information[i + default_comm().size()] = higher_limit - lower_limit; + } + default_comm().bcast(val_information, sizeof(std::size_t) * row_info_size, + root); + } else { + default_comm().bcast(val_information, sizeof(std::size_t) * row_info_size, + root); + for (int i = 0; i < default_comm().size(); i++) { + val_offsets_.push_back(val_information[i]); + val_sizes_.push_back(val_information[default_comm().size() + i]); + } + } + delete[] val_information; + vals_size_ = std::max(val_sizes_[rank], static_cast(1)); + + cols_data_ = + static_cast(cols_backend_.allocate(vals_size_ * sizeof(I))); + vals_data_ = + static_cast(vals_backend_.allocate(vals_size_ * sizeof(T))); + + fence(); + if (rank == root) { + for (std::size_t i = 0; i < default_comm().size(); i++) { + auto lower_limit = val_offsets_[i]; + auto row_size = val_sizes_[i]; + if (row_size > 0) { + vals_backend_.putmem(csr_view.values_data() + lower_limit, 0, + row_size * sizeof(T), i); + cols_backend_.putmem(csr_view.colind_data() + lower_limit, 0, + row_size * sizeof(I), i); + } + } + } + + std::size_t segment_index = 0; + segment_size_ = rows_data_->segment_size(); + for (std::size_t i = 0; i < default_comm().size(); i++) { + segments_.emplace_back( + this, segment_index++, val_sizes_[i], + std::max(val_sizes_[i], static_cast(1))); + } + + auto local_rows = static_cast(nullptr); + if (rows_data_->segments().size() > rank) { + local_rows = rows_data_->segments()[rank].begin().local(); + } + auto offset = val_offsets_[rank]; + auto real_row_size = + std::min(rows_data_->segment_size(), + shape_.first - rows_data_->segment_size() * rank); + auto my_tuple = std::make_tuple(real_row_size, segment_size_ * rank, offset, + local_rows); + view_helper_const = alloc.allocate(1); + + if (use_sycl()) { + sycl_queue() + .memcpy(view_helper_const, &my_tuple, sizeof(view_tuple)) + .wait(); + } else { + view_helper_const[0] = my_tuple; + } + + if (rows_data_->segments().size() > rank) { + local_view = std::make_shared(get_elem_view( + vals_size_, view_helper_const, cols_data_, vals_data_, rank)); + } + fence(); + } + + static auto get_elem_view(std::size_t vals_size, view_tuple *helper_tuple, + index_type *local_cols, elem_type *local_vals, + std::size_t rank) { + auto local_vals_range = rng::subrange(local_vals, local_vals + vals_size); + auto local_cols_range = rng::subrange(local_cols, local_cols + vals_size); + auto zipped_results = rng::views::zip(local_vals_range, local_cols_range); + auto enumerated_zipped = rng::views::enumerate(zipped_results); + // we need to use multiply_view here, + // because lambda is not properly copied to sycl environment + // when we use variable capture + auto multiply_range = dr::__detail::multiply_view( + rng::subrange(helper_tuple, helper_tuple + 1), vals_size); + auto enumerted_with_data = + rng::views::zip(enumerated_zipped, multiply_range); + + auto transformer = [=](auto x) { + auto [entry, tuple] = x; + auto [row_size, row_offset, offset, local_rows] = tuple; + auto [index, pair] = entry; + auto [val, column] = pair; + auto row = + rng::distance(local_rows, + std::upper_bound(local_rows, local_rows + row_size, + offset + index) - + 1) + + row_offset; + dr::index index_obj(row, column); + value_type entry_obj(index_obj, val); + return entry_obj; + }; + return rng::transform_view(enumerted_with_data, std::move(transformer)); + } + + using view_type = decltype(get_elem_view(0, nullptr, nullptr, nullptr, 0)); + + dr::mp::__detail::allocator alloc; + view_tuple *view_helper_const; + std::shared_ptr local_view; + + std::size_t segment_size_ = 0; + std::size_t vals_size_ = 0; + std::vector val_offsets_; + std::vector val_sizes_; + + index_type *cols_data_ = nullptr; + BackendT cols_backend_; + + elem_type *vals_data_ = nullptr; + BackendT vals_backend_; + + distribution distribution_; + dr::index shape_; + std::size_t nnz_; + std::vector segments_; + std::shared_ptr> rows_data_ = nullptr; +}; +} // namespace dr::mp diff --git a/include/dr/mp/containers/matrix_formats/csr_row_segment.hpp b/include/dr/mp/containers/matrix_formats/csr_row_segment.hpp new file mode 100644 index 0000000000..ce0f627e3e --- /dev/null +++ b/include/dr/mp/containers/matrix_formats/csr_row_segment.hpp @@ -0,0 +1,292 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +namespace dr::mp { +template class csr_row_segment_iterator; + +template class csr_row_segment_reference { + using iterator = csr_row_segment_iterator; + +public: + using value_type = typename DSM::value_type; + using index_type = typename DSM::index_type; + using elem_type = typename DSM::elem_type; + + csr_row_segment_reference(const iterator it) : iterator_(it) {} + + operator value_type() const { return iterator_.get(); } + operator std::pair, elem_type>() const { + return iterator_.get(); + } + + template auto get() const noexcept { + if constexpr (Index == 0) { + return iterator_.get_index(); + } + if constexpr (Index == 1) { + return iterator_.get_value(); + } + } + + auto operator=(const csr_row_segment_reference &other) const { + *this = value_type(other); + return *this; + } + auto operator&() const { return iterator_; } + +private: + const iterator iterator_; +}; // csr_row_segment_reference + +template class csr_row_segment_iterator { +public: + using value_type = typename DSM::value_type; + using index_type = typename DSM::index_type; + using elem_type = typename DSM::elem_type; + using difference_type = typename DSM::difference_type; + + csr_row_segment_iterator() = default; + csr_row_segment_iterator(DSM *dsm, std::size_t segment_index, + std::size_t index) { + dsm_ = dsm; + segment_index_ = segment_index; + index_ = index; + } + + auto operator<=>(const csr_row_segment_iterator &other) const noexcept { + // assertion below checks against compare dereferenceable iterator to a + // singular iterator and against attempt to compare iterators from different + // sequences like _Safe_iterator does + assert(dsm_ == other.dsm_); + return segment_index_ == other.segment_index_ + ? index_ <=> other.index_ + : segment_index_ <=> other.segment_index_; + } + + // Comparison + bool operator==(const csr_row_segment_iterator &other) const noexcept { + return (*this <=> other) == 0; + } + + // Only this arithmetic manipulate internal state + auto &operator+=(difference_type n) { + assert(dsm_ != nullptr); + assert(n >= 0 || static_cast(index_) >= -n); + index_ += n; + return *this; + } + + auto &operator-=(difference_type n) { return *this += (-n); } + + difference_type + operator-(const csr_row_segment_iterator &other) const noexcept { + assert(dsm_ != nullptr && dsm_ == other.dsm_); + assert(index_ >= other.index_); + return index_ - other.index_; + } + + // prefix + auto &operator++() { + *this += 1; + return *this; + } + auto &operator--() { + *this -= 1; + return *this; + } + + // postfix + auto operator++(int) { + auto prev = *this; + *this += 1; + return prev; + } + auto operator--(int) { + auto prev = *this; + *this -= 1; + return prev; + } + + auto operator+(difference_type n) const { + auto p = *this; + p += n; + return p; + } + auto operator-(difference_type n) const { + auto p = *this; + p -= n; + return p; + } + + // When *this is not first in the expression + friend auto operator+(difference_type n, + const csr_row_segment_iterator &other) { + return other + n; + } + + // dereference + auto operator*() const { + assert(dsm_ != nullptr); + return csr_row_segment_reference{*this}; + } + auto operator[](difference_type n) const { + assert(dsm_ != nullptr); + return *(*this + n); + } + + void get(value_type *dst, std::size_t size) const { + auto elems = new elem_type[size]; + auto indexes = new dr::index[size]; + get_value(elems, size); + get_index(indexes, size); + for (std::size_t i = 0; i < size; i++) { + *(dst + i) = {indexes[i], elems[i]}; + } + } + + value_type get() const { + value_type val; + get(&val, 1); + return val; + } + + void get_value(elem_type *dst, std::size_t size) const { + assert(dsm_ != nullptr); + assert(segment_index_ * dsm_->segment_size_ + index_ < dsm_->nnz_); + dsm_->vals_backend_.getmem(dst, index_ * sizeof(elem_type), + size * sizeof(elem_type), segment_index_); + } + + elem_type get_value() const { + elem_type val; + get_value(&val, 1); + return val; + } + + void get_index(dr::index *dst, std::size_t size) const { + assert(dsm_ != nullptr); + assert(segment_index_ * dsm_->segment_size_ + index_ < dsm_->nnz_); + index_type *col_data; + if (rank() == dsm_->cols_backend_.getrank()) { + col_data = dsm_->cols_data_ + index_; + } else { + col_data = new index_type[size]; + dsm_->cols_backend_.getmem(col_data, index_ * sizeof(index_type), + size * sizeof(index_type), segment_index_); + } + index_type *rows; + std::size_t rows_length = dsm_->segment_size_; + rows = new index_type[rows_length]; + (dsm_->rows_data_->segments()[segment_index_].begin()) + .get(rows, rows_length); + + auto position = dsm_->val_offsets_[segment_index_] + index_; + auto rows_iter = rows + 1; + index_type *cols_iter = col_data; + auto iter = dst; + std::size_t current_row = dsm_->segment_size_ * segment_index_; + std::size_t last_row = + std::min(current_row + rows_length - 1, dsm_->shape_[0] - 1); + + for (int i = 0; i < size; i++) { + while (current_row < last_row && *rows_iter <= position + i) { + rows_iter++; + current_row++; + } + iter->first = current_row; + iter->second = *cols_iter; + cols_iter++; + iter++; + } + if (rank() != dsm_->cols_backend_.getrank()) { + delete[] col_data; + } + delete[] rows; + } + + dr::index get_index() const { + dr::index val; + get_index(&val, 1); + return val; + } + + auto rank() const { + assert(dsm_ != nullptr); + return segment_index_; + } + + auto segments() const { + assert(dsm_ != nullptr); + return dr::__detail::drop_segments(dsm_->segments(), segment_index_, + index_); + } + + auto local() const { + const auto my_process_segment_index = dsm_->vals_backend_.getrank(); + assert(my_process_segment_index == segment_index_); + if (dsm_->local_view == nullptr) { + throw std::runtime_error("Requesting not existing local segment"); + } + return dsm_->local_view->begin(); + } + +private: + // all fields need to be initialized by default ctor so every default + // constructed iter is equal to any other default constructed iter + DSM *dsm_ = nullptr; + std::size_t segment_index_ = 0; + std::size_t index_ = 0; +}; // csr_row_segment_iterator + +template class csr_row_segment { +private: + using iterator = csr_row_segment_iterator; + +public: + using difference_type = std::ptrdiff_t; + csr_row_segment() = default; + csr_row_segment(DSM *dsm, std::size_t segment_index, std::size_t size, + std::size_t reserved) { + dsm_ = dsm; + segment_index_ = segment_index; + size_ = size; + reserved_ = reserved; + assert(dsm_ != nullptr); + } + + auto size() const { + assert(dsm_ != nullptr); + return size_; + } + + auto begin() const { return iterator(dsm_, segment_index_, 0); } + auto end() const { return begin() + size(); } + auto reserved() const { return reserved_; } + + auto operator[](difference_type n) const { return *(begin() + n); } + + bool is_local() const { return segment_index_ == default_comm().rank(); } + +private: + DSM *dsm_ = nullptr; + std::size_t segment_index_; + std::size_t size_; + std::size_t reserved_; +}; // csr_row_segment + +} // namespace dr::mp + +namespace std { +template +struct tuple_size> + : std::integral_constant {}; + +template +struct tuple_element> + : tuple_element, + typename DSM::elem_type>> {}; + +} // namespace std diff --git a/include/dr/mp/sycl_support.hpp b/include/dr/mp/sycl_support.hpp index 791999fe5c..33d34159a8 100644 --- a/include/dr/mp/sycl_support.hpp +++ b/include/dr/mp/sycl_support.hpp @@ -17,6 +17,22 @@ sycl::queue &sycl_queue(); namespace dr::mp::__detail { +// sometimes we only want to dereference iterator inside SYCL +template auto sycl_get_deref(T v) { + using deref_type = std::remove_reference::type; + deref_type temp; + { + sycl::buffer buff(&temp, 1); + sycl_queue() + .submit([&](auto &&h) { + sycl::accessor access(buff, h, sycl::write_only, sycl::no_init); + h.single_task([=](auto i) { access[0] = *v; }); + }) + .wait(); + } + return temp; +} + template T sycl_get(T &v) { T temp; sycl_queue().memcpy(&temp, &v, sizeof(v)).wait(); diff --git a/include/dr/sp/algorithms/matrix/local_gemv.hpp b/include/dr/sp/algorithms/matrix/local_gemv.hpp index 302b14c789..7c3c32baa5 100644 --- a/include/dr/sp/algorithms/matrix/local_gemv.hpp +++ b/include/dr/sp/algorithms/matrix/local_gemv.hpp @@ -19,8 +19,9 @@ namespace __detail { template requires(std::is_same_v, T>) -auto custom_gemv(sycl::queue &q, csr_matrix_view a, Iter b, - Iter c, const std::vector &dependencies = {}) { +auto custom_gemv(sycl::queue &q, dr::views::csr_matrix_view a, + Iter b, Iter c, + const std::vector &dependencies = {}) { std::size_t wg = 32; auto event = q.submit([&](auto &&h) { @@ -55,7 +56,8 @@ auto custom_gemv(sycl::queue &q, csr_matrix_view a, Iter b, template requires(std::is_same_v, T>) -auto mkl_gemv(sycl::queue &q, csr_matrix_view a, Iter b, Iter c, +auto mkl_gemv(sycl::queue &q, dr::views::csr_matrix_view a, + Iter b, Iter c, const std::vector &dependencies = {}) { oneapi::mkl::sparse::matrix_handle_t a_handle; @@ -78,8 +80,9 @@ auto mkl_gemv(sycl::queue &q, csr_matrix_view a, Iter b, Iter c, template requires(std::is_same_v, T>) -auto local_gemv(sycl::queue &q, csr_matrix_view a, Iter b, - Iter c, const std::vector &dependencies = {}) { +auto local_gemv(sycl::queue &q, dr::views::csr_matrix_view a, + Iter b, Iter c, + const std::vector &dependencies = {}) { return mkl_gemv(q, a, b, c, dependencies); } @@ -88,8 +91,9 @@ auto local_gemv(sycl::queue &q, csr_matrix_view a, Iter b, template requires(std::is_same_v, T>) -auto local_gemv(sycl::queue &q, csr_matrix_view a, Iter b, - Iter c, const std::vector &dependencies = {}) { +auto local_gemv(sycl::queue &q, dr::views::csr_matrix_view a, + Iter b, Iter c, + const std::vector &dependencies = {}) { return custom_gemv(q, a, b, c, dependencies); } diff --git a/include/dr/sp/containers/distributed_dense_matrix.hpp b/include/dr/sp/containers/distributed_dense_matrix.hpp index de6b1fa332..0c53534cbf 100644 --- a/include/dr/sp/containers/distributed_dense_matrix.hpp +++ b/include/dr/sp/containers/distributed_dense_matrix.hpp @@ -7,8 +7,8 @@ #include #include +#include #include -#include #include #include #include @@ -25,9 +25,9 @@ template class distributed_dense_matrix_accessor { using scalar_value_type = rng::range_value_t; using scalar_reference = rng::range_reference_t; - using value_type = dr::sp::matrix_entry; + using value_type = dr::matrix_entry; - using reference = dr::sp::matrix_ref; + using reference = dr::matrix_ref; using iterator_category = std::random_access_iterator_tag; @@ -138,15 +138,15 @@ template class distributed_dense_matrix { using size_type = std::size_t; using difference_type = std::ptrdiff_t; - using value_type = dr::sp::matrix_entry; + using value_type = dr::matrix_entry; using scalar_reference = rng::range_reference_t< dr::sp::device_vector>>; using const_scalar_reference = rng::range_reference_t< const dr::sp::device_vector>>; - using reference = dr::sp::matrix_ref; - using const_reference = dr::sp::matrix_ref; + using reference = dr::matrix_ref; + using const_reference = dr::matrix_ref; using key_type = dr::index<>; diff --git a/include/dr/sp/containers/sequential/dense_matrix.hpp b/include/dr/sp/containers/sequential/dense_matrix.hpp index 5e4ccb8723..58d77c7ecd 100644 --- a/include/dr/sp/containers/sequential/dense_matrix.hpp +++ b/include/dr/sp/containers/sequential/dense_matrix.hpp @@ -8,7 +8,7 @@ #include #include -#include +#include #include #include #include @@ -26,7 +26,7 @@ class dense_matrix { using scalar_pointer = typename std::allocator_traits::pointer; using scalar_reference = std::iter_reference_t; - using reference = dr::sp::matrix_ref; + using reference = dr::matrix_ref; using key_type = dr::index<>; using map_type = T; diff --git a/include/dr/sp/containers/sparse_matrix.hpp b/include/dr/sp/containers/sparse_matrix.hpp index a8184633cb..83adb869c0 100644 --- a/include/dr/sp/containers/sparse_matrix.hpp +++ b/include/dr/sp/containers/sparse_matrix.hpp @@ -4,15 +4,15 @@ #pragma once +#include #include +#include #include -#include #include #include #include #include -#include -#include +#include #include namespace dr::sp { @@ -137,19 +137,19 @@ template class sparse_matrix { using size_type = std::size_t; using difference_type = std::ptrdiff_t; - using value_type = dr::sp::matrix_entry; + using value_type = dr::matrix_entry; using scalar_reference = rng::range_reference_t< dr::sp::device_vector>>; using const_scalar_reference = rng::range_reference_t< const dr::sp::device_vector>>; - using reference = dr::sp::matrix_ref; - using const_reference = dr::sp::matrix_ref; + using reference = dr::matrix_ref; + using const_reference = dr::matrix_ref; using key_type = dr::index; - using segment_type = dr::sp::csr_matrix_view< + using segment_type = dr::views::csr_matrix_view< T, I, rng::iterator_t>>, rng::iterator_t>>>; @@ -210,7 +210,7 @@ template class sparse_matrix { // in `gemv_benchmark`. I believe this is a SYCL bug. template auto copy_tile_async(key_type tile_index, - csr_matrix_view tile_view) { + dr::views::csr_matrix_view tile_view) { std::size_t tile_idx = tile_index[0] * grid_shape_[1] + tile_index[1]; auto &&values = values_[tile_idx]; auto &&colind = colind_[tile_idx]; @@ -250,7 +250,7 @@ template class sparse_matrix { template void copy_tile(key_type tile_index, - csr_matrix_view tile_view) { + dr::views::csr_matrix_view tile_view) { copy_tile_async(tile_index, tile_view).wait(); } diff --git a/include/dr/sp/util/generate_random.hpp b/include/dr/sp/util/generate_random.hpp deleted file mode 100644 index a284ce927a..0000000000 --- a/include/dr/sp/util/generate_random.hpp +++ /dev/null @@ -1,92 +0,0 @@ -// SPDX-FileCopyrightText: Intel Corporation -// -// SPDX-License-Identifier: BSD-3-Clause - -#pragma once - -#include -#include -#include -#include - -namespace dr::sp { - -namespace { - -template struct uniform_distribution { - using type = std::uniform_int_distribution; -}; - -template struct uniform_distribution { - using type = std::uniform_real_distribution; -}; - -template -using uniform_distribution_t = typename uniform_distribution::type; - -} // namespace - -template -auto generate_random_csr(dr::index shape, double density = 0.01, - unsigned int seed = 0) { - - assert(density >= 0.0 && density < 1.0); - - std::map, T> tuples; - - std::size_t nnz = density * shape[0] * shape[1]; - - std::mt19937 gen(seed); - std::uniform_int_distribution row(0, shape[0] - 1); - std::uniform_int_distribution column(0, shape[1] - 1); - - uniform_distribution_t value_gen(0, 1); - - while (tuples.size() < nnz) { - auto i = row(gen); - auto j = column(gen); - if (tuples.find({i, j}) == tuples.end()) { - T value = value_gen(gen); - tuples.insert({{i, j}, value}); - } - } - - T *values = new T[nnz]; - I *rowptr = new I[shape[0] + 1]; - I *colind = new I[nnz]; - - rowptr[0] = 0; - - std::size_t r = 0; - std::size_t c = 0; - for (auto iter = tuples.begin(); iter != tuples.end(); ++iter) { - auto &&[index, value] = *iter; - auto &&[i, j] = index; - - values[c] = value; - colind[c] = j; - - while (r < i) { - if (r + 1 > shape[0]) { - // TODO: exception? - // throw std::runtime_error("csr_matrix_impl_: given invalid matrix"); - } - rowptr[r + 1] = c; - r++; - } - c++; - - if (c > nnz) { - // TODO: exception? - // throw std::runtime_error("csr_matrix_impl_: given invalid matrix"); - } - } - - for (; r < shape[0]; r++) { - rowptr[r + 1] = nnz; - } - - return csr_matrix_view(values, rowptr, colind, shape, nnz, 0); -} - -} // namespace dr::sp diff --git a/include/dr/sp/util/matrix_io.hpp b/include/dr/sp/util/matrix_io.hpp index d34fd132f7..21561b52c4 100644 --- a/include/dr/sp/util/matrix_io.hpp +++ b/include/dr/sp/util/matrix_io.hpp @@ -12,220 +12,17 @@ #include #include -#include -#include +#include +#include namespace dr::sp { -namespace __detail { - -// Preconditions: -// 1) `tuples` sorted by row, column -// 2) `tuples` has shape `shape` -// 3) `tuples` has `nnz` elements -template -auto convert_to_csr(Tuples &&tuples, dr::index<> shape, std::size_t nnz, - Allocator &&allocator) { - auto &&[index, v] = *tuples.begin(); - auto &&[i, j] = index; - - using T = std::remove_reference_t; - using I = std::remove_reference_t; - - typename std::allocator_traits::template rebind_alloc - i_allocator(allocator); - - T *values = allocator.allocate(nnz); - I *rowptr = i_allocator.allocate(shape[0] + 1); - I *colind = i_allocator.allocate(nnz); - - rowptr[0] = 0; - - std::size_t r = 0; - std::size_t c = 0; - for (auto iter = tuples.begin(); iter != tuples.end(); ++iter) { - auto &&[index, value] = *iter; - auto &&[i, j] = index; - - values[c] = value; - colind[c] = j; - - while (r < i) { - assert(r + 1 <= shape[0]); - // throw std::runtime_error("csr_matrix_impl_: given invalid matrix"); - rowptr[r + 1] = c; - r++; - } - c++; - - assert(c <= nnz); - // throw std::runtime_error("csr_matrix_impl_: given invalid matrix"); - } - - for (; r < shape[0]; r++) { - rowptr[r + 1] = nnz; - } - - return csr_matrix_view(values, rowptr, colind, - dr::index(shape[0], shape[1]), nnz, 0); -} - -/// Read in the Matrix Market file at location `file_path` and a return -/// a coo_matrix data structure with its contents. -template -inline coo_matrix mmread(std::string file_path, bool one_indexed = true) { - using size_type = std::size_t; - - std::ifstream f; - - f.open(file_path.c_str()); - - if (!f.is_open()) { - // TODO better choice of exception. - throw std::runtime_error("mmread: cannot open " + file_path); - } - - std::string buf; - - // Make sure the file is matrix market matrix, coordinate, and check whether - // it is symmetric. If the matrix is symmetric, non-diagonal elements will - // be inserted in both (i, j) and (j, i). Error out if skew-symmetric or - // Hermitian. - std::getline(f, buf); - std::istringstream ss(buf); - std::string item; - ss >> item; - if (item != "%%MatrixMarket") { - throw std::runtime_error(file_path + - " could not be parsed as a Matrix Market file."); - } - ss >> item; - if (item != "matrix") { - throw std::runtime_error(file_path + - " could not be parsed as a Matrix Market file."); - } - ss >> item; - if (item != "coordinate") { - throw std::runtime_error(file_path + - " could not be parsed as a Matrix Market file."); - } - bool pattern; - ss >> item; - if (item == "pattern") { - pattern = true; - } else { - pattern = false; - } - // TODO: do something with real vs. integer vs. pattern? - ss >> item; - bool symmetric; - if (item == "general") { - symmetric = false; - } else if (item == "symmetric") { - symmetric = true; - } else { - throw std::runtime_error(file_path + " has an unsupported matrix type"); - } - - bool outOfComments = false; - while (!outOfComments) { - std::getline(f, buf); - - if (buf[0] != '%') { - outOfComments = true; - } - } - - I m, n, nnz; - // std::istringstream ss(buf); - ss.clear(); - ss.str(buf); - ss >> m >> n >> nnz; - - // NOTE for symmetric matrices: `nnz` holds the number of stored values in - // the matrix market file, while `matrix.nnz_` will hold the total number of - // stored values (including "mirrored" symmetric values). - coo_matrix matrix({m, n}); - if (symmetric) { - matrix.reserve(2 * nnz); - } else { - matrix.reserve(nnz); - } - - size_type c = 0; - while (std::getline(f, buf)) { - I i, j; - T v; - std::istringstream ss(buf); - if (!pattern) { - ss >> i >> j >> v; - } else { - ss >> i >> j; - v = T(1); - } - if (one_indexed) { - i--; - j--; - } - - if (i >= m || j >= n) { - throw std::runtime_error( - "read_MatrixMarket: file has nonzero out of bounds."); - } - - matrix.push_back({{i, j}, v}); - - if (symmetric && i != j) { - matrix.push_back({{j, i}, v}); - } - - c++; - if (c > nnz) { - throw std::runtime_error("read_MatrixMarket: error reading Matrix Market " - "file, file has more nonzeros than reported."); - } - } - - auto sort_fn = [](const auto &a, const auto &b) { - auto &&[a_index, a_value] = a; - auto &&[b_index, b_value] = b; - auto &&[a_i, a_j] = a_index; - auto &&[b_i, b_j] = b_index; - if (a_i < b_i) { - return true; - } else if (a_i == b_i) { - if (a_j < b_j) { - return true; - } - } - return false; - }; - - std::sort(matrix.begin(), matrix.end(), sort_fn); - - f.close(); - - return matrix; -} - -template -void destroy_csr_matrix_view(dr::sp::csr_matrix_view view, - Allocator &&alloc) { - alloc.deallocate(view.values_data(), view.size()); - typename std::allocator_traits::template rebind_alloc i_alloc( - alloc); - i_alloc.deallocate(view.colind_data(), view.size()); - i_alloc.deallocate(view.rowptr_data(), view.shape()[0] + 1); -} - -} // namespace __detail - template -auto create_distributed(dr::sp::csr_matrix_view local_mat, +auto create_distributed(dr::views::csr_matrix_view local_mat, const matrix_partition &partition) { dr::sp::sparse_matrix a(local_mat.shape(), partition); - std::vector> views; + std::vector> views; std::vector events; views.reserve(a.grid_shape()[0] * a.grid_shape()[1]); @@ -242,7 +39,7 @@ auto create_distributed(dr::sp::csr_matrix_view local_mat, auto submatrix_shape = dr::index(row_bounds[1] - row_bounds[0], column_bounds[1] - column_bounds[0]); - auto copied_submat = __detail::convert_to_csr( + auto copied_submat = dr::__detail::convert_to_csr( local_submat, submatrix_shape, rng::distance(local_submat), std::allocator{}); @@ -255,24 +52,27 @@ auto create_distributed(dr::sp::csr_matrix_view local_mat, __detail::wait(events); for (auto &&view : views) { - __detail::destroy_csr_matrix_view(view, std::allocator{}); + dr::__detail::destroy_csr_matrix_view(view, std::allocator{}); } return a; } +template +auto create_distributed(dr::views::csr_matrix_view local_mat) { + return create_distributed( + local_mat, dr::sp::block_cyclic({dr::sp::tile::div, dr::sp::tile::div}, + {dr::sp::nprocs(), 1})); +} + template auto mmread(std::string file_path, const matrix_partition &partition, bool one_indexed = true) { - auto m = __detail::mmread(file_path, one_indexed); - auto shape = m.shape(); - auto nnz = m.size(); - - auto local_mat = __detail::convert_to_csr(m, shape, nnz, std::allocator{}); + auto local_mat = read_csr(file_path, one_indexed); auto a = create_distributed(local_mat, partition); - __detail::destroy_csr_matrix_view(local_mat, std::allocator{}); + dr::__detail::destroy_csr_matrix_view(local_mat, std::allocator{}); return a; } diff --git a/include/dr/sp/views/dense_column_view.hpp b/include/dr/sp/views/dense_column_view.hpp index 627c4faebf..25c2b607ea 100644 --- a/include/dr/sp/views/dense_column_view.hpp +++ b/include/dr/sp/views/dense_column_view.hpp @@ -5,7 +5,7 @@ #pragma once #include -#include +#include #include namespace dr::sp { @@ -17,9 +17,9 @@ template class dense_matrix_column_accessor { using scalar_value_type = std::iter_value_t; using scalar_reference = std::iter_reference_t; - using value_type = dr::sp::matrix_entry; + using value_type = dr::matrix_entry; - using reference = dr::sp::matrix_ref; + using reference = dr::matrix_ref; using iterator_category = std::random_access_iterator_tag; diff --git a/include/dr/sp/views/dense_matrix_iterator.hpp b/include/dr/sp/views/dense_matrix_iterator.hpp index 85c5274357..8c5e9e929a 100644 --- a/include/dr/sp/views/dense_matrix_iterator.hpp +++ b/include/dr/sp/views/dense_matrix_iterator.hpp @@ -8,7 +8,7 @@ #include #include -#include +#include #include #include @@ -22,9 +22,9 @@ template class dense_matrix_accessor { using scalar_type = std::iter_value_t; using scalar_reference = std::iter_reference_t; - using value_type = dr::sp::matrix_entry; + using value_type = dr::matrix_entry; - using reference = dr::sp::matrix_ref; + using reference = dr::matrix_ref; using iterator_category = std::random_access_iterator_tag; diff --git a/include/dr/sp/views/dense_matrix_view.hpp b/include/dr/sp/views/dense_matrix_view.hpp index bc2bd86d30..f7f2930f2d 100644 --- a/include/dr/sp/views/dense_matrix_view.hpp +++ b/include/dr/sp/views/dense_matrix_view.hpp @@ -8,7 +8,7 @@ #include #include -#include +#include #include #include #include @@ -24,7 +24,7 @@ class dense_matrix_view using difference_type = std::ptrdiff_t; using scalar_reference = std::iter_reference_t; - using reference = dr::sp::matrix_ref; + using reference = dr::matrix_ref; using key_type = dr::index<>; using map_type = T; diff --git a/include/dr/sp/views/dense_row_view.hpp b/include/dr/sp/views/dense_row_view.hpp index d3ccee2c1a..09b008aeee 100644 --- a/include/dr/sp/views/dense_row_view.hpp +++ b/include/dr/sp/views/dense_row_view.hpp @@ -6,7 +6,7 @@ #include #include -#include +#include #include namespace dr::sp { @@ -18,9 +18,9 @@ template class dense_matrix_row_accessor { using scalar_value_type = std::iter_value_t; using scalar_reference = std::iter_reference_t; - using value_type = dr::sp::matrix_entry; + using value_type = dr::matrix_entry; - using reference = dr::sp::matrix_ref; + using reference = dr::matrix_ref; using iterator_category = std::random_access_iterator_tag; diff --git a/include/dr/sp/views/csr_matrix_view.hpp b/include/dr/views/csr_matrix_view.hpp similarity index 95% rename from include/dr/sp/views/csr_matrix_view.hpp rename to include/dr/views/csr_matrix_view.hpp index c698cc9c9d..e7bd4ed1f6 100644 --- a/include/dr/sp/views/csr_matrix_view.hpp +++ b/include/dr/views/csr_matrix_view.hpp @@ -5,10 +5,10 @@ #pragma once #include -#include +#include #include -namespace dr::sp { +namespace dr::views { template class csr_matrix_view_accessor { @@ -21,9 +21,9 @@ class csr_matrix_view_accessor { using index_type = I; - using value_type = dr::sp::matrix_entry; + using value_type = dr::matrix_entry; - using reference = dr::sp::matrix_ref; + using reference = dr::matrix_ref; using iterator_category = std::random_access_iterator_tag; @@ -129,7 +129,7 @@ class csr_matrix_view using difference_type = std::ptrdiff_t; using scalar_reference = std::iter_reference_t; - using reference = dr::sp::matrix_ref; + using reference = dr::matrix_ref; using scalar_type = T; using index_type = I; @@ -138,7 +138,7 @@ class csr_matrix_view using map_type = T; using iterator = csr_matrix_view_iterator; - + csr_matrix_view() = default; csr_matrix_view(TIter values, IIter rowptr, IIter colind, key_type shape, size_type nnz, size_type rank) : values_(values), rowptr_(rowptr), colind_(colind), shape_(shape), @@ -222,4 +222,4 @@ csr_matrix_view(TIter, IIter, IIter, Args &&...) -> csr_matrix_view, std::iter_value_t, TIter, IIter>; -} // namespace dr::sp +} // namespace dr::views diff --git a/include/dr/views/transform.hpp b/include/dr/views/transform.hpp index b99a07ad32..a379f11f90 100644 --- a/include/dr/views/transform.hpp +++ b/include/dr/views/transform.hpp @@ -53,7 +53,7 @@ class transform_iterator { bool operator<(iterator other) const noexcept { return iter_ < other.iter_; } - bool operator>(iterator other) const noexcept { return iter_ > iter_; } + bool operator>(iterator other) const noexcept { return iter_ > other.iter_; } bool operator<=(iterator other) const noexcept { return iter_ <= other.iter_; diff --git a/test/gtest/mp/CMakeLists.txt b/test/gtest/mp/CMakeLists.txt index 32f26d120a..f4f65e4776 100644 --- a/test/gtest/mp/CMakeLists.txt +++ b/test/gtest/mp/CMakeLists.txt @@ -36,6 +36,7 @@ add_executable( communicator.cpp copy.cpp distributed_vector.cpp + gemv.cpp halo.cpp mdstar.cpp mpsort.cpp @@ -43,6 +44,7 @@ add_executable( stencil.cpp segments.cpp slide_view.cpp + sparse_matrix.cpp wave_kernel.cpp) add_executable( diff --git a/test/gtest/mp/broadcasted_vector.cpp b/test/gtest/mp/broadcasted_vector.cpp new file mode 100644 index 0000000000..a8e27289f0 --- /dev/null +++ b/test/gtest/mp/broadcasted_vector.cpp @@ -0,0 +1,72 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "xp-tests.hpp" + +TEST(BroadcastedVector, BroadcastData) { + std::size_t n = 100; + auto rank = dr::mp::default_comm().rank(); + std::vector data(n); + if (rank == 0) { + for (int i = 0; i < n; i++) { + data[i] = i; + } + } + dr::mp::broadcasted_vector broadcasted; + if (rank == 0) { + broadcasted.broadcast_data(n, 0, data, dr::mp::default_comm()); + } else { + broadcasted.broadcast_data(n, 0, rng::empty_view(), + dr::mp::default_comm()); + } + + std::vector ref(n); + for (int i = 0; i < n; i++) { + ref[i] = i; + } + + EXPECT_EQ(rng::subrange(broadcasted.broadcasted_data(), + broadcasted.broadcasted_data() + n), + ref); + broadcasted.destroy_data(); +} + +TEST(BroadcastedVector, BroadcastDataReuse) { + std::size_t n = 100; + auto rank = dr::mp::default_comm().rank(); + std::vector data(n); + if (rank == 0) { + for (int i = 0; i < n; i++) { + data[i] = i; + } + } + dr::mp::broadcasted_vector broadcasted; + if (rank == 0) { + broadcasted.broadcast_data(n, 0, data, dr::mp::default_comm()); + } else { + broadcasted.broadcast_data(n, 0, rng::empty_view(), + dr::mp::default_comm()); + } + + std::vector ref(n); + for (int i = 0; i < n; i++) { + ref[i] = i; + } + + EXPECT_EQ(rng::subrange(broadcasted.broadcasted_data(), + broadcasted.broadcasted_data() + n), + ref); + broadcasted.destroy_data(); + EXPECT_EQ(broadcasted.broadcasted_data(), nullptr); + if (rank == 0) { + broadcasted.broadcast_data(n, 0, data, dr::mp::default_comm()); + } else { + broadcasted.broadcast_data(n, 0, rng::empty_view(), + dr::mp::default_comm()); + } + EXPECT_EQ(rng::subrange(broadcasted.broadcasted_data(), + broadcasted.broadcasted_data() + n), + ref); + broadcasted.destroy_data(); +} diff --git a/test/gtest/mp/gemv.cpp b/test/gtest/mp/gemv.cpp new file mode 100644 index 0000000000..cf397ad8ed --- /dev/null +++ b/test/gtest/mp/gemv.cpp @@ -0,0 +1,184 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "xp-tests.hpp" +auto testMatrixGemv(std::size_t m, std::size_t k, auto &a) { + std::vector base_b(k, 1.f); + std::vector c(m, 0.f); + + dr::mp::broadcasted_vector allocated_b; + allocated_b.broadcast_data(k, 0, base_b, dr::mp::default_comm()); + + dr::mp::gemv(c, a, allocated_b); + + std::vector c_ref(m, 0.f); + + for (auto &&[index, v] : a) { + auto &&[i, k] = index; + + c_ref[i] += v; + } + + EXPECT_TRUE(fp_equal(c_ref, c)) + << fmt::format("Reference:\n {}\nActual:\n {}\n", c_ref, c); +} + +auto testMatrixGemm(std::size_t m, std::size_t n, auto &a, std::size_t width) { + std::vector base_b(n * width); + std::vector c(m * width, 0.f); + + for (auto i = 0; i < n * width; i++) { + base_b[i] = i; + } + + dr::mp::broadcasted_slim_matrix allocated_b; + allocated_b.broadcast_data(n, width, 0, base_b, dr::mp::default_comm()); + + dr::mp::gemv(c, a, allocated_b); + + std::vector c_ref(m * width, 0.f); + + for (auto &&[index, v] : a) { + auto &&[i, k] = index; + + for (auto j = 0; j < width; j++) { + c_ref[i + j * m] += v * base_b[k + j * n]; + } + } + + EXPECT_TRUE(fp_equal(c_ref, c)) + << fmt::format("Reference:\n {}\nActual:\n {}\n", c_ref, c); +} + +TEST(SparseMatrix, GemvRow) { + std::size_t m = 100; + std::size_t k = 100; + auto csr = dr::generate_random_csr({m, k}, 0.1f); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_row_distribution> + a(csr, 0); + testMatrixGemv(m, k, a); +} + +TEST(SparseMatrix, GemvEq) { + std::size_t m = 100; + std::size_t k = 100; + auto csr = dr::generate_random_csr({m, k}, 0.1f); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_eq_distribution> + a(csr, 0); + testMatrixGemv(m, k, a); +} + +TEST(SparseMatrix, GemvRowNotSquare) { + std::size_t m = 1000; + std::size_t k = 10; + auto csr = dr::generate_random_csr({m, k}, 0.1f); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_row_distribution> + a(csr, 0); + testMatrixGemv(m, k, a); +} + +TEST(SparseMatrix, GemvEqNotSquare) { + std::size_t m = 1000; + std::size_t k = 10; + auto csr = dr::generate_random_csr({m, k}, 0.1f); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_eq_distribution> + a(csr, 0); + testMatrixGemv(m, k, a); +} + +TEST(SparseMatrix, GemvRowNotSquareDifferentAxis) { + std::size_t m = 10; + std::size_t k = 1000; + auto csr = dr::generate_random_csr({m, k}, 0.1f); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_row_distribution> + a(csr, 0); + testMatrixGemv(m, k, a); +} + +TEST(SparseMatrix, GemvEqNotSquareDifferentAxis) { + std::size_t m = 10; + std::size_t k = 1000; + auto csr = dr::generate_random_csr({m, k}, 0.1f); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_eq_distribution> + a(csr, 0); + testMatrixGemv(m, k, a); +} + +TEST(SparseMatrix, GemmRow) { + std::size_t m = 100; + std::size_t k = 100; + auto csr = dr::generate_random_csr({m, k}, 0.1f); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_row_distribution> + a(csr, 0); + testMatrixGemm(m, k, a, 20); +} + +TEST(SparseMatrix, GemmEq) { + std::size_t m = 100; + std::size_t k = 100; + auto csr = dr::generate_random_csr({m, k}, 0.1f); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_eq_distribution> + a(csr, 0); + testMatrixGemm(m, k, a, 20); +} + +TEST(SparseMatrix, GemmRowNotSquare) { + std::size_t m = 1000; + std::size_t k = 10; + auto csr = dr::generate_random_csr({m, k}, 0.1f); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_row_distribution> + a(csr, 0); + testMatrixGemm(m, k, a, 20); +} + +TEST(SparseMatrix, GemmEqNotSquare) { + std::size_t m = 1000; + std::size_t k = 10; + auto csr = dr::generate_random_csr({m, k}, 0.1f); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_eq_distribution> + a(csr, 0); + testMatrixGemm(m, k, a, 20); +} + +TEST(SparseMatrix, GemmRowNotSquareDifferentAxis) { + std::size_t m = 10; + std::size_t k = 1000; + auto csr = dr::generate_random_csr({m, k}, 0.1f); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_row_distribution> + a(csr, 0); + testMatrixGemm(m, k, a, 20); +} + +TEST(SparseMatrix, GemmEqNotSquareDifferentAxis) { + std::size_t m = 10; + std::size_t k = 1000; + auto csr = dr::generate_random_csr({m, k}, 0.1f); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_eq_distribution> + a(csr, 0); + testMatrixGemm(m, k, a, 20); +} diff --git a/test/gtest/mp/reduce.cpp b/test/gtest/mp/reduce.cpp index e663188bbd..c7a00d7323 100644 --- a/test/gtest/mp/reduce.cpp +++ b/test/gtest/mp/reduce.cpp @@ -38,16 +38,15 @@ TYPED_TEST(ReduceMP, RootIterators) { } } -// Example of code that should be compiling, but does not, described in issue -// DRA-192 TYPED_TEST(ReduceMP, NotCompiling) { -// dr::mp::distributed_vector r1(10); - -// auto add = [](auto &&elem) { -// return elem + 1; -// }; - -// auto added = dr::mp::views::transform(r1, add); -// auto min = [](double x, double y) { return std::min(x, y); }; -// auto result = dr::mp::reduce(root, added, 1, min); -// EXPECT_EQ(result, 1); -// } +TYPED_TEST(ReduceMP, TransformReduce) { + Ops1 ops(10); + + auto add = [](auto &&elem) { return elem + 1; }; + + auto added = dr::mp::views::transform(ops.dist_vec, add); + auto min = [](double x, double y) { return std::min(x, y); }; + auto result = dr::mp::reduce(root, added, 1, min); + if (comm_rank == root) { + EXPECT_EQ(result, 1); + } +} diff --git a/test/gtest/mp/sparse_matrix.cpp b/test/gtest/mp/sparse_matrix.cpp new file mode 100644 index 0000000000..0f98b0cb48 --- /dev/null +++ b/test/gtest/mp/sparse_matrix.cpp @@ -0,0 +1,114 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "xp-tests.hpp" +auto testMatrixIter(auto &src, auto &matrix) { + EXPECT_TRUE(src.size() == matrix.size()); + std::map, double> entries; + for (auto [index, val] : src) { + entries[{index.first, index.second}] = val; + } + for (auto [index, val] : matrix) { + EXPECT_TRUE((val == entries[{index.first, index.second}])); + } +} + +auto testMatrixReduce(auto &src, auto &matrix) { + EXPECT_TRUE(src.size() == matrix.size()); + long sum = 0; + for (auto [index, val] : src) { + auto [x, y] = index; + sum += (long)(val + x + y); + } + auto transformer = [](auto entry) { + auto [index, val] = entry; + auto [x, y] = index; + return (long)(val + x + y); + }; + auto transformed = dr::transform_view(matrix, transformer); + long reduced = dr::mp::reduce(transformed, 0, std::plus{}); + EXPECT_TRUE((sum == reduced)); +} + +TEST(SparseMatrix, staticAssertEq) { + std::size_t m = 100; + std::size_t k = 100; + using Dist = + dr::mp::csr_eq_distribution; + static_assert(dr::mp::matrix_distibution); + static_assert(dr::mp::vector_multiplicable); + auto csr = dr::generate_random_csr({m, k}, 0.1f); + dr::mp::distributed_sparse_matrix a( + csr, 0); + static_assert(std::forward_iterator); + static_assert(std::forward_iterator); + static_assert(std::forward_iterator); + static_assert(std::forward_iterator); + using Matrix = decltype(a); + static_assert(rng::forward_range); + static_assert(dr::distributed_range); +} + +TEST(SparseMatrix, staticAssertRow) { + std::size_t m = 100; + std::size_t k = 100; + using Dist = + dr::mp::csr_row_distribution; + static_assert(dr::mp::matrix_distibution); + static_assert(dr::mp::vector_multiplicable); + auto csr = dr::generate_random_csr({m, k}, 0.1f); + dr::mp::distributed_sparse_matrix a( + csr, 0); + static_assert(std::forward_iterator); + static_assert(std::forward_iterator); + static_assert(std::forward_iterator); + static_assert(std::forward_iterator); + using Matrix = decltype(a); + static_assert(rng::forward_range); + static_assert(dr::distributed_range); +} + +TEST(SparseMatrix, IterRow) { + std::size_t m = 100; + std::size_t k = 100; + auto csr = dr::generate_random_csr({m, k}, 0.1f); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_row_distribution> + a(csr, 0); + testMatrixIter(csr, a); +} + +TEST(SparseMatrix, IterEq) { + std::size_t m = 100; + std::size_t k = 100; + auto csr = dr::generate_random_csr({m, k}, 0.1f); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_eq_distribution> + a(csr, 0); + testMatrixIter(csr, a); +} + +TEST(SparseMatrix, ReduceRow) { + std::size_t m = 100; + std::size_t k = 100; + auto csr = dr::generate_random_csr({m, k}, 0.1f); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_row_distribution> + a(csr, 0); + testMatrixReduce(csr, a); +} + +TEST(SparseMatrix, ReduceEq) { + std::size_t m = 100; + std::size_t k = 100; + auto csr = dr::generate_random_csr({m, k}, 0.1f); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_eq_distribution> + a(csr, 0); + testMatrixReduce(csr, a); +} diff --git a/test/gtest/sp/gemv.cpp b/test/gtest/sp/gemv.cpp index 2b207be926..f573d6bc0e 100644 --- a/test/gtest/sp/gemv.cpp +++ b/test/gtest/sp/gemv.cpp @@ -2,11 +2,11 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "dr/detail/coo_matrix.hpp" #include "xp-tests.hpp" - TEST(SparseMatrix, Gemv) { - std::size_t m = 100; - std::size_t k = 100; + long m = 100; + long k = 100; dr::sp::sparse_matrix a( {m, k}, 0.1f, @@ -40,9 +40,9 @@ TEST(SparseMatrix, EmptyGemv) { using T = float; using I = int; - dr::sp::__detail::coo_matrix base; - auto csr = dr::sp::__detail::convert_to_csr(base, {m, k}, base.size(), - std::allocator{}); + dr::__detail::coo_matrix base; + auto csr = dr::__detail::convert_to_csr(base, {m, k}, base.size(), + std::allocator{}); dr::sp::sparse_matrix a = dr::sp::create_distributed(csr, dr::sp::row_cyclic()); @@ -73,8 +73,8 @@ TEST(SparseMatrix, ZeroVector) { } } - auto csr = dr::sp::__detail::convert_to_csr(base, {m, k}, base.size(), - std::allocator{}); + auto csr = dr::__detail::convert_to_csr(base, {m, k}, base.size(), + std::allocator{}); dr::sp::sparse_matrix a = dr::sp::create_distributed(csr, dr::sp::row_cyclic()); @@ -94,8 +94,8 @@ TEST(SparseMatrix, ZeroVector) { } TEST(SparseMatrix, NotSquareMatrix) { - std::size_t m = 10; - std::size_t k = 1000; + long m = 10; + long k = 1000; dr::sp::sparse_matrix a( {m, k}, 0.1f, @@ -124,8 +124,8 @@ TEST(SparseMatrix, NotSquareMatrix) { } TEST(SparseMatrix, NotSquareMatrixOtherAxis) { - std::size_t m = 1000; - std::size_t k = 10; + long m = 1000; + long k = 10; dr::sp::sparse_matrix a( {m, k}, 0.1f, @@ -154,8 +154,8 @@ TEST(SparseMatrix, NotSquareMatrixOtherAxis) { } TEST(SparseMatrix, VerySparseMatrix) { - std::size_t m = 100; - std::size_t k = 100; + long m = 100; + long k = 100; dr::sp::sparse_matrix a( {m, k}, 0.001f, diff --git a/test/gtest/sp/sparse.cpp b/test/gtest/sp/sparse.cpp index 2e30fee2a0..bf0f1d7b17 100644 --- a/test/gtest/sp/sparse.cpp +++ b/test/gtest/sp/sparse.cpp @@ -16,8 +16,8 @@ TEST(SparseMatrix, IterationForward) { } std::vector, T>> reference(base.size()); std::copy(base.begin(), base.end(), reference.begin()); - auto csr = dr::sp::__detail::convert_to_csr(base, {m, k}, base.size(), - std::allocator{}); + auto csr = dr::__detail::convert_to_csr(base, {m, k}, base.size(), + std::allocator{}); dr::sp::sparse_matrix a = dr::sp::create_distributed(csr, dr::sp::row_cyclic()); int i = 0; @@ -48,8 +48,8 @@ TEST(SparseMatrix, IterationReverse) { } std::vector, T>> reference(base.size()); std::copy(base.begin(), base.end(), reference.begin()); - auto csr = dr::sp::__detail::convert_to_csr(base, {m, k}, base.size(), - std::allocator{}); + auto csr = dr::__detail::convert_to_csr(base, {m, k}, base.size(), + std::allocator{}); dr::sp::sparse_matrix a = dr::sp::create_distributed(csr, dr::sp::row_cyclic()); int i = base.size();