From 00f1e39c3225ebdeccebd20f2a1769ad0aeb02ab Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Wed, 14 Aug 2024 12:07:21 +0200 Subject: [PATCH 01/68] Add initial implementation of sparse matrix in mp --- examples/mp/CMakeLists.txt | 2 + examples/mp/sparse_matrix.cpp | 47 +++ include/dr/mp.hpp | 1 + .../containers/distributed_sparse_matrix.hpp | 219 +++++++++++++ .../dr/mp/containers/distributed_vector.hpp | 8 + .../mp/containers/sparse_matrix_segment.hpp | 288 ++++++++++++++++++ include/dr/sp/containers/matrix_entry.hpp | 8 +- include/dr/sp/util/matrix_io.hpp | 11 +- 8 files changed, 580 insertions(+), 4 deletions(-) create mode 100644 examples/mp/sparse_matrix.cpp create mode 100644 include/dr/mp/containers/distributed_sparse_matrix.hpp create mode 100644 include/dr/mp/containers/sparse_matrix_segment.hpp diff --git a/examples/mp/CMakeLists.txt b/examples/mp/CMakeLists.txt index 41cac86ecc..4d51ceb5f3 100644 --- a/examples/mp/CMakeLists.txt +++ b/examples/mp/CMakeLists.txt @@ -26,6 +26,8 @@ add_mp_example(stencil-1d) add_mp_example(stencil-1d-array) add_mp_example(stencil-1d-pointer) add_mp_example(hello_world) +add_mp_example(sparse_matrix) + if(OpenMP_FOUND) add_executable(vector-add-ref vector-add-ref.cpp) diff --git a/examples/mp/sparse_matrix.cpp b/examples/mp/sparse_matrix.cpp new file mode 100644 index 0000000000..bd24f07876 --- /dev/null +++ b/examples/mp/sparse_matrix.cpp @@ -0,0 +1,47 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#include +#include +#include + +namespace mp = dr::mp; + +int main(int argc, char **argv) { + + if (argc != 2) { + fmt::print("usage: ./gemv_benchmark [matrix market file]\n"); + return 1; + } + + std::string fname(argv[1]); + auto local_data = dr::sp::read_csr(fname); +#ifdef SYCL_LANGUAGE_VERSION + mp::init(sycl::default_selector_v); +#else + mp::init(); +#endif + + { + mp::distributed_sparse_matrix m(local_data); + for (int i = 0; i < dr::mp::default_comm().size(); i++) { 
+ if (dr::mp::default_comm().rank() == i) { + auto csr_iter = local_data.begin(); + for (auto [index, val]: m) { + auto [m, n] = index; + + auto [index_csr, val_csr] = *csr_iter; + auto [m_csr, n_csr] = index; + + assert(m == m_csr && n_csr == n && val == val_csr); + } + } + m.fence(); + } + dr::sp::__detail::destroy_csr_matrix_view(local_data, std::allocator{}); + } + mp::finalize(); + + return 0; +} diff --git a/include/dr/mp.hpp b/include/dr/mp.hpp index 5e9729b16a..353bbb21f6 100644 --- a/include/dr/mp.hpp +++ b/include/dr/mp.hpp @@ -77,4 +77,5 @@ #include #include #include +#include #include diff --git a/include/dr/mp/containers/distributed_sparse_matrix.hpp b/include/dr/mp/containers/distributed_sparse_matrix.hpp new file mode 100644 index 0000000000..27d3eb64e9 --- /dev/null +++ b/include/dr/mp/containers/distributed_sparse_matrix.hpp @@ -0,0 +1,219 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause +#pragma once +#include +#include +#include + + +namespace dr::mp { + + +template class distributed_sparse_matrix { + +public: + using value_type = dr::sp::matrix_entry; + using elem_type = T; + using index_type = I; + using key_type = dr::index; + using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + using backend_type = BackendT; + + class iterator { + public: + using iterator_category = std::random_access_iterator_tag; + using value_type = typename distributed_sparse_matrix::value_type; + using difference_type = typename distributed_sparse_matrix::difference_type; + + iterator() {} + iterator(const distributed_sparse_matrix *parent, difference_type offset) + : parent_(parent), offset_(offset) {} + + auto operator+(difference_type n) const { + return iterator(parent_, offset_ + n); + } + friend auto operator+(difference_type n, const iterator &other) { + return other + n; + } + auto operator-(difference_type n) const { + return iterator(parent_, offset_ - n); + } + auto operator-(iterator other) 
const { return offset_ - other.offset_; } + + auto &operator+=(difference_type n) { + offset_ += n; + return *this; + } + auto &operator-=(difference_type n) { + offset_ -= n; + return *this; + } + auto &operator++() { + offset_++; + return *this; + } + auto operator++(int) { + auto old = *this; + offset_++; + return old; + } + auto &operator--() { + offset_--; + return *this; + } + auto operator--(int) { + auto old = *this; + offset_--; + return old; + } + + bool operator==(iterator other) const { + if (parent_ == nullptr || other.parent_ == nullptr) { + return false; + } else { + return offset_ == other.offset_; + } + } + auto operator<=>(iterator other) const { + assert(parent_ == other.parent_); + return offset_ <=> other.offset_; + } + + auto operator*() const { + auto segment_size = parent_->segment_size_; + return parent_ + ->segments()[offset_ / segment_size][offset_ % segment_size]; + } + auto operator[](difference_type n) const { return *(*this + n); } + + auto local() { + auto segment_size = parent_->segment_size_; + return (parent_->segments()[offset_ / segment_size].begin() + + offset_ % segment_size) + .local(); + } + + auto segments() { + return dr::__detail::drop_segments(parent_->segments(), offset_); + } + + private: + const distributed_sparse_matrix *parent_ = nullptr; + difference_type offset_; + }; + + distributed_sparse_matrix(const distributed_sparse_matrix &) = delete; + distributed_sparse_matrix &operator=(const distributed_sparse_matrix &) = delete; + distributed_sparse_matrix(distributed_sparse_matrix &&) { assert(false); } + + /// Constructor + distributed_sparse_matrix(dr::sp::csr_matrix_view csr_view, distribution dist = distribution()) { + init(csr_view, dist); + } + + ~distributed_sparse_matrix() { + if (!finalized()) { + fence(); + if (rows_data_ != nullptr) { + rows_backend_.deallocate(rows_data_, row_size_ * sizeof(index_type)); + } + + // delete halo_; TODO + } + } + + /// Returns iterator to beginning + auto begin() const { 
return iterator(this, 0); } + /// Returns iterator to end + auto end() const { return begin() + nnz_; } + + /// Returns size + auto size() const { return nnz_; } + + auto shape() const { return shape_; } + /// Returns reference using index + auto operator[](difference_type n) const { return *(begin() + n); } +// auto &halo() const { return *halo_; } TODO + + auto segments() const { return rng::views::all(segments_); } + + void fence() { + rows_backend_.fence(); // it does not matter which backend we choose, since all of them share comm + } + +private: + + friend dsm_segment_iterator; + std::size_t get_row_size(std::size_t rank) { + std::size_t start_index = row_offsets_[rank]; + std::size_t end_index = nnz_; + if (rank + 1 < row_offsets_.size()) { + end_index = row_offsets_[rank + 1]; + } + return end_index - start_index; + } + + void init(dr::sp::csr_matrix_view csr_view, auto dist) { + nnz_ = csr_view.size(); + distribution_ = dist; + shape_ = csr_view.shape(); + // determine the distribution of data + // auto hb = dist.halo(); + std::size_t gran = dist.granularity(); + // TODO: make this an error that is reported back to user + assert(nnz_ % gran == 0 && "size must be a multiple of the granularity"); + // assert(hb.prev % gran == 0 && "size must be a multiple of the granularity"); + // assert(hb.next % gran == 0 && "size must be a multiple of the granularity"); + + + auto rank = rows_backend_.getrank(); + vals_data_ = std::make_shared>(nnz_); + cols_data_ = std::make_shared>(nnz_); + + dr::mp::copy(std::ranges::subrange(csr_view.values_data(), csr_view.values_data() + nnz_), vals_data_->begin()); + dr::mp::copy(std::ranges::subrange(csr_view.colind_data(), csr_view.colind_data() + nnz_), cols_data_->begin()); + + assert(*csr_view.rowptr_data() == 0); + for (int i = 0; i < default_comm().size(); i++) { + auto first_index = vals_data_->get_segment_offset(i); + auto lower_limit = std::distance(csr_view.rowptr_data(), std::upper_bound(csr_view.rowptr_data(), 
csr_view.rowptr_data() + shape_[0], first_index)) - 1; + row_offsets_.push_back(lower_limit); + } + + auto last_index = vals_data_->get_segment_offset(rank + 1) - 1; + + auto lower_limit = row_offsets_[rank]; + auto higher_limit = std::distance(csr_view.rowptr_data(), std::upper_bound(csr_view.rowptr_data(), csr_view.rowptr_data() + shape_[0], last_index)); + row_size_ = higher_limit - lower_limit; + + rows_data_ = static_cast(rows_backend_.allocate(row_size_ * sizeof(I))); + std::copy(csr_view.rowptr_data() + lower_limit, csr_view.rowptr_data() + higher_limit, rows_data_); + std::size_t segment_index = 0; + segment_size_ = vals_data_->segment_size(); + assert(segment_size_ == cols_data_->segment_size()); + for (std::size_t i = 0; i < nnz_; i += segment_size_) { + segments_.emplace_back(this, segment_index++, + std::min(segment_size_, nnz_ - i), segment_size_); + } + + fence(); + } + + + std::size_t segment_size_ = 0; + std::size_t row_size_ = 0; + std::vector row_offsets_; + + index_type *rows_data_ = nullptr; + BackendT rows_backend_; + + distribution distribution_; + dr::index shape_; + std::size_t nnz_; + std::vector> segments_; + std::shared_ptr> vals_data_; + std::shared_ptr> cols_data_; +}; +} \ No newline at end of file diff --git a/include/dr/mp/containers/distributed_vector.hpp b/include/dr/mp/containers/distributed_vector.hpp index 2611963064..9bf8349678 100644 --- a/include/dr/mp/containers/distributed_vector.hpp +++ b/include/dr/mp/containers/distributed_vector.hpp @@ -276,6 +276,14 @@ template class distributed_vector { void fence() { backend.fence(); } + auto segment_size() const { + return segment_size_; + } + + auto get_segment_offset(std::size_t segment_id) const { + return segment_id * segment_size_; + } + private: void init(auto size, auto dist) { size_ = size; diff --git a/include/dr/mp/containers/sparse_matrix_segment.hpp b/include/dr/mp/containers/sparse_matrix_segment.hpp new file mode 100644 index 0000000000..85a23df43d --- /dev/null +++ 
b/include/dr/mp/containers/sparse_matrix_segment.hpp @@ -0,0 +1,288 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +namespace dr::mp { + +template class dsm_segment_iterator; + +template class dsm_segment_reference { + using iterator = dsm_segment_iterator; + +public: + using value_type = typename DSM::value_type; + using index_type = typename DSM::index_type; + using elem_type = typename DSM::elem_type; + + dsm_segment_reference(const iterator it) : iterator_(it) {} + + operator value_type() const { return iterator_.get(); } + operator std::pair, elem_type>() const { + return iterator_.get(); + } + + template auto get() const noexcept { + if constexpr (Index == 0) { + return iterator_.get_index(); + } + if constexpr (Index == 1) { + return iterator_.get_value(); + } + } + + auto operator=(const value_type &value) const { + iterator_.put(value); + return *this; + } + auto operator=(const dsm_segment_reference &other) const { + *this = value_type(other); + return *this; + } + auto operator&() const { return iterator_; } + +private: + const iterator iterator_; +}; // dsm_segment_reference + +template class dsm_segment_iterator { +public: + using value_type = typename DSM::value_type; + using index_type = typename DSM::index_type; + using elem_type = typename DSM::elem_type; + using size_type = typename DSM::size_type; + using difference_type = typename DSM::difference_type; + + dsm_segment_iterator() = default; + dsm_segment_iterator(DSM *dsm, std::size_t segment_index, std::size_t index) { + dsm_ = dsm; + segment_index_ = segment_index; + index_ = index; + } + + auto operator<=>(const dsm_segment_iterator &other) const noexcept { + // assertion below checks against compare dereferenceable iterator to a + // singular iterator and against attempt to compare iterators from different + // sequences like _Safe_iterator does + assert(dsm_ == other.dsm_); + return segment_index_ == other.segment_index_ + ? 
index_ <=> other.index_ + : segment_index_ <=> other.segment_index_; + } + + // Comparison + bool operator==(const dsm_segment_iterator &other) const noexcept { + return (*this <=> other) == 0; + } + + // Only this arithmetic manipulate internal state + auto &operator+=(difference_type n) { + assert(dsm_ != nullptr); + assert(n >= 0 || static_cast(index_) >= -n); + index_ += n; + return *this; + } + + auto &operator-=(difference_type n) { return *this += (-n); } + + difference_type operator-(const dsm_segment_iterator &other) const noexcept { + assert(dsm_ != nullptr && dsm_ == other.dsm_); + assert(index_ >= other.index_); + return index_ - other.index_; + } + + // prefix + auto &operator++() { + *this += 1; + return *this; + } + auto &operator--() { + *this -= 1; + return *this; + } + + // postfix + auto operator++(int) { + auto prev = *this; + *this += 1; + return prev; + } + auto operator--(int) { + auto prev = *this; + *this -= 1; + return prev; + } + + auto operator+(difference_type n) const { + auto p = *this; + p += n; + return p; + } + auto operator-(difference_type n) const { + auto p = *this; + p -= n; + return p; + } + + // When *this is not first in the expression + friend auto operator+(difference_type n, const dsm_segment_iterator &other) { + return other + n; + } + + // dereference + auto operator*() const { + assert(dsm_ != nullptr); + return dsm_segment_reference{*this}; + } + auto operator[](difference_type n) const { + assert(dsm_ != nullptr); + return *(*this + n); + } + + void get(value_type *dst, std::size_t size) const { + auto elems = new elem_type[size]; + auto indexes = new dr::index[size]; + get_value(elems, size); + get_index(indexes, size); + for (std::size_t i = 0; i < size; i++) { + *(dst + i) = {indexes[i], elems[i]}; + } + } + + value_type get() const { + value_type val; + get(&val, 1); + return val; + } + + void get_value(elem_type *dst, std::size_t size) const { + assert(dsm_ != nullptr); + assert(segment_index_ * 
dsm_->segment_size_ + index_ < dsm_->size()); + (dsm_->vals_data_->segments()[segment_index_].begin() + index_).get(dst, size); + } + + elem_type get_value() const { + elem_type val; + get_value(&val, 1); + return val; + } + + void get_index(dr::index *dst, std::size_t size) const { + assert(dsm_ != nullptr); + assert(segment_index_ * dsm_->segment_size_ + index_ < dsm_->size()); + auto col_data = new index_type[size]; + (dsm_->cols_data_->segments()[segment_index_].begin() + index_).get(col_data, size); + index_type *rows; + std::size_t rows_length = dsm_->get_row_size(segment_index_); + + if (rank() == dsm_->rows_backend_.getrank()) { + rows = dsm_->rows_data_; + } + else { + rows = new index_type[rows_length]; + dsm_->rows_backend_.getmem(rows, 0, rows_length * sizeof(index_type), segment_index_); + } + auto position = dsm_->cols_data_->get_segment_offset(segment_index_) + index_; + auto rows_iter = rows + 1; + auto cols_iter = col_data; + auto iter = dst; + std::size_t current_row = dsm_->row_offsets_[segment_index_]; + std::size_t last_row = current_row + rows_length - 1; + for (int i = 0; i < size; i++) { + while (current_row < last_row && *rows_iter <= position + i ) { + rows_iter++; + current_row++; + } + iter->first = current_row; + iter->second = *cols_iter; + cols_iter++; + iter++; + } + if (rank() != dsm_->rows_backend_.getrank()) { + delete[] rows; + } + delete[] col_data; + + } + + dr::index get_index() const { + dr::index val; + get_index(&val, 1); + return val; + } + + void put(const value_type *dst, std::size_t size) const { + assert(dsm_ != nullptr); + assert(segment_index_ * dsm_->segment_size_ + index_ < dsm_->size()); + (dsm_->vals_data_->segments()[segment_index_].begin() + index_).put(dst, size); + } + + void put(const value_type &value) const { put(&value, 1); } + + auto rank() const { + assert(dsm_ != nullptr); + return segment_index_; + } + + auto segments() const { + assert(dsm_ != nullptr); + return 
dr::__detail::drop_segments(dsm_->segments(), segment_index_, index_); + } + +private: + // all fields need to be initialized by default ctor so every default + // constructed iter is equal to any other default constructed iter + DSM *dsm_ = nullptr; + std::size_t segment_index_ = 0; + std::size_t index_ = 0; + std::size_t segment_size_ = 0; +}; // dsm_segment_iterator + +template class dsm_segment { +private: + using iterator = dsm_segment_iterator; + +public: + using difference_type = std::ptrdiff_t; + dsm_segment() = default; + dsm_segment(DSM *dsm, std::size_t segment_index, std::size_t size, + std::size_t reserved) { + dsm_ = dsm; + segment_index_ = segment_index; + size_ = size; + reserved_ = reserved; + assert(dsm_ != nullptr); + } + + auto size() const { + assert(dsm_ != nullptr); + return size_; + } + + auto begin() const { return iterator(dsm_, segment_index_, 0); } + auto end() const { return begin() + size(); } + auto reserved() const { return reserved_; } + + auto operator[](difference_type n) const { return *(begin() + n); } + + bool is_local() const { return segment_index_ == default_comm().rank(); } + +private: + DSM *dsm_ = nullptr; + std::size_t segment_index_; + std::size_t size_; + std::size_t reserved_; +}; // dsm_segment + +} // namespace dr::mp + +namespace std { + template + struct tuple_size> : std::integral_constant {}; + + template + struct tuple_element> + : tuple_element, typename DSM::elem_type>> {}; + +} // namespace std diff --git a/include/dr/sp/containers/matrix_entry.hpp b/include/dr/sp/containers/matrix_entry.hpp index b5d2885499..f9fceba8ad 100644 --- a/include/dr/sp/containers/matrix_entry.hpp +++ b/include/dr/sp/containers/matrix_entry.hpp @@ -11,7 +11,12 @@ #include namespace dr::sp { - +template + concept getable = requires(T x) + { + std::get<0>(x); + std::get<1>(x); + }; template class matrix_entry { public: using index_type = I; @@ -28,6 +33,7 @@ template class matrix_entry { : index_(index), value_(std::forward(value)) 
{} template + requires(getable) matrix_entry(Entry &&entry) : index_(std::get<0>(entry)), value_(std::get<1>(entry)) {} diff --git a/include/dr/sp/util/matrix_io.hpp b/include/dr/sp/util/matrix_io.hpp index d0403535e3..cf26fcf2d3 100644 --- a/include/dr/sp/util/matrix_io.hpp +++ b/include/dr/sp/util/matrix_io.hpp @@ -262,13 +262,18 @@ auto create_distributed(dr::sp::csr_matrix_view local_mat, } template -auto mmread(std::string file_path, const matrix_partition &partition, - bool one_indexed = true) { +auto read_csr(std::string file_path, bool one_indexed = true) { auto m = __detail::mmread(file_path, one_indexed); auto shape = m.shape(); auto nnz = m.size(); - auto local_mat = __detail::convert_to_csr(m, shape, nnz, std::allocator{}); + return __detail::convert_to_csr(m, shape, nnz, std::allocator{}); +} + +template +auto mmread(std::string file_path, const matrix_partition &partition, + bool one_indexed = true) { + auto local_mat = read_csr(file_path, one_indexed); auto a = create_distributed(local_mat, partition); From 36d7f387f955229c924d042b6671c8ab20a22582 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Wed, 14 Aug 2024 16:54:19 +0200 Subject: [PATCH 02/68] Fixed row shape calculation --- examples/mp/sparse_matrix.cpp | 13 ++++++++--- .../containers/distributed_sparse_matrix.hpp | 22 +++++++++---------- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/examples/mp/sparse_matrix.cpp b/examples/mp/sparse_matrix.cpp index bd24f07876..a778bba9fc 100644 --- a/examples/mp/sparse_matrix.cpp +++ b/examples/mp/sparse_matrix.cpp @@ -28,13 +28,20 @@ int main(int argc, char **argv) { for (int i = 0; i < dr::mp::default_comm().size(); i++) { if (dr::mp::default_comm().rank() == i) { auto csr_iter = local_data.begin(); + int j = 0; + fmt::print("{}\n", i); for (auto [index, val]: m) { auto [m, n] = index; auto [index_csr, val_csr] = *csr_iter; - auto [m_csr, n_csr] = index; - - assert(m == m_csr && n_csr == n && val == val_csr); + auto [m_csr, n_csr] = 
index_csr; + auto check = m == m_csr && n_csr == n && val == val_csr; + if (!check) { + fmt::print("{} {} {} {} {} {} {}\n", j, m, m_csr, n, n_csr, val, val_csr); + } + assert(check); + csr_iter++; + j++; } } m.fence(); diff --git a/include/dr/mp/containers/distributed_sparse_matrix.hpp b/include/dr/mp/containers/distributed_sparse_matrix.hpp index 27d3eb64e9..f80c2bb8ca 100644 --- a/include/dr/mp/containers/distributed_sparse_matrix.hpp +++ b/include/dr/mp/containers/distributed_sparse_matrix.hpp @@ -147,12 +147,7 @@ template class distributed friend dsm_segment_iterator; std::size_t get_row_size(std::size_t rank) { - std::size_t start_index = row_offsets_[rank]; - std::size_t end_index = nnz_; - if (rank + 1 < row_offsets_.size()) { - end_index = row_offsets_[rank + 1]; - } - return end_index - start_index; + return row_sizes_[rank]; } void init(dr::sp::csr_matrix_view csr_view, auto dist) { @@ -178,18 +173,21 @@ template class distributed assert(*csr_view.rowptr_data() == 0); for (int i = 0; i < default_comm().size(); i++) { auto first_index = vals_data_->get_segment_offset(i); + auto last_index = vals_data_->get_segment_offset(i + 1) - 1; auto lower_limit = std::distance(csr_view.rowptr_data(), std::upper_bound(csr_view.rowptr_data(), csr_view.rowptr_data() + shape_[0], first_index)) - 1; + auto higher_limit = std::distance(csr_view.rowptr_data(), std::upper_bound(csr_view.rowptr_data(), csr_view.rowptr_data() + shape_[0], last_index)); row_offsets_.push_back(lower_limit); + row_sizes_.push_back(higher_limit - lower_limit); } - auto last_index = vals_data_->get_segment_offset(rank + 1) - 1; - auto lower_limit = row_offsets_[rank]; - auto higher_limit = std::distance(csr_view.rowptr_data(), std::upper_bound(csr_view.rowptr_data(), csr_view.rowptr_data() + shape_[0], last_index)); - row_size_ = higher_limit - lower_limit; + row_size_ = row_sizes_[rank]; + if (row_size_ != get_row_size(rank)) { + fmt::print("hmmmm? 
{} {} {} {}\n", rank, lower_limit, row_size_, get_row_size(rank)); + } rows_data_ = static_cast(rows_backend_.allocate(row_size_ * sizeof(I))); - std::copy(csr_view.rowptr_data() + lower_limit, csr_view.rowptr_data() + higher_limit, rows_data_); + std::copy(csr_view.rowptr_data() + lower_limit, csr_view.rowptr_data() + lower_limit + row_size_, rows_data_); std::size_t segment_index = 0; segment_size_ = vals_data_->segment_size(); assert(segment_size_ == cols_data_->segment_size()); @@ -205,6 +203,8 @@ template class distributed std::size_t segment_size_ = 0; std::size_t row_size_ = 0; std::vector row_offsets_; + std::vector row_sizes_; + index_type *rows_data_ = nullptr; BackendT rows_backend_; From dd57bb191fb9b266de2e859c5e9b60d4b0ccb373 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Mon, 19 Aug 2024 13:05:37 +0200 Subject: [PATCH 03/68] Extract matrix format from matrix implementation --- examples/mp/sparse_matrix.cpp | 3 +- .../containers/distributed_sparse_matrix.hpp | 130 ++++-------------- .../csr_matrix_distribution.hpp | 122 ++++++++++++++++ .../csr_matrix_segment.hpp} | 52 ++++--- include/dr/sp/views/csr_matrix_view.hpp | 2 +- 5 files changed, 180 insertions(+), 129 deletions(-) create mode 100644 include/dr/mp/containers/matrix_formats/csr_matrix_distribution.hpp rename include/dr/mp/containers/{sparse_matrix_segment.hpp => matrix_formats/csr_matrix_segment.hpp} (84%) diff --git a/examples/mp/sparse_matrix.cpp b/examples/mp/sparse_matrix.cpp index a778bba9fc..d1cf4216a5 100644 --- a/examples/mp/sparse_matrix.cpp +++ b/examples/mp/sparse_matrix.cpp @@ -25,11 +25,12 @@ int main(int argc, char **argv) { { mp::distributed_sparse_matrix m(local_data); + fmt::print("{}\n", m.size()); for (int i = 0; i < dr::mp::default_comm().size(); i++) { if (dr::mp::default_comm().rank() == i) { auto csr_iter = local_data.begin(); int j = 0; - fmt::print("{}\n", i); + // fmt::print("{}\n", i); for (auto [index, val]: m) { auto [m, n] = index; diff --git 
a/include/dr/mp/containers/distributed_sparse_matrix.hpp b/include/dr/mp/containers/distributed_sparse_matrix.hpp index f80c2bb8ca..51ae4a2ffb 100644 --- a/include/dr/mp/containers/distributed_sparse_matrix.hpp +++ b/include/dr/mp/containers/distributed_sparse_matrix.hpp @@ -2,15 +2,28 @@ // // SPDX-License-Identifier: BSD-3-Clause #pragma once -#include +#include #include #include namespace dr::mp { - - -template class distributed_sparse_matrix { +template +concept matrix_distibution = + requires(T t) { + {t.fence()} -> std::same_as; + { t.segments() } -> rng::random_access_range; + {t.shape().first} -> std::convertible_to; + {t.shape().second} -> std::convertible_to; + {t.nnz()} -> std::same_as; + {t.get_segment_from_offset(int())} -> std::same_as; + {t.get_id_in_segment(int())} -> std::same_as; + T(dr::sp::csr_matrix_view(), distribution()); + }; + +template > +requires(matrix_distibution) +class distributed_sparse_matrix { public: using value_type = dr::sp::matrix_entry; @@ -82,17 +95,16 @@ template class distributed } auto operator*() const { - auto segment_size = parent_->segment_size_; - return parent_ - ->segments()[offset_ / segment_size][offset_ % segment_size]; + auto segment_id = parent_->distribution_.get_segment_from_offset(offset_); + auto id_in_segment = parent_->distribution_.get_id_in_segment(offset_); + return parent_->segments()[segment_id][id_in_segment]; } auto operator[](difference_type n) const { return *(*this + n); } auto local() { - auto segment_size = parent_->segment_size_; - return (parent_->segments()[offset_ / segment_size].begin() + - offset_ % segment_size) - .local(); + auto segment_id = parent_->distribution_.get_segment_from_offset(offset_); + auto id_in_segment = parent_->distribution_.get_id_in_segment(offset_); + return (parent_->segments()[segment_id].begin() + id_in_segment).local(); } auto segments() { @@ -109,111 +121,29 @@ template class distributed distributed_sparse_matrix(distributed_sparse_matrix &&) { assert(false); 
} /// Constructor - distributed_sparse_matrix(dr::sp::csr_matrix_view csr_view, distribution dist = distribution()) { - init(csr_view, dist); - } - - ~distributed_sparse_matrix() { - if (!finalized()) { - fence(); - if (rows_data_ != nullptr) { - rows_backend_.deallocate(rows_data_, row_size_ * sizeof(index_type)); - } - - // delete halo_; TODO - } - } + distributed_sparse_matrix(dr::sp::csr_matrix_view csr_view, distribution dist = distribution()): distribution_(csr_view, dist) {} /// Returns iterator to beginning auto begin() const { return iterator(this, 0); } /// Returns iterator to end - auto end() const { return begin() + nnz_; } + auto end() const { return begin() + distribution_.nnz(); } /// Returns size - auto size() const { return nnz_; } + auto size() const { return distribution_.nnz(); } - auto shape() const { return shape_; } + auto shape() const { return distribution_.shape(); } /// Returns reference using index auto operator[](difference_type n) const { return *(begin() + n); } // auto &halo() const { return *halo_; } TODO - auto segments() const { return rng::views::all(segments_); } + auto segments() const { return distribution_.segments(); } void fence() { - rows_backend_.fence(); // it does not matter which backend we choose, since all of them share comm + distribution_.fence(); } private: + MatrixDistrT distribution_; - friend dsm_segment_iterator; - std::size_t get_row_size(std::size_t rank) { - return row_sizes_[rank]; - } - - void init(dr::sp::csr_matrix_view csr_view, auto dist) { - nnz_ = csr_view.size(); - distribution_ = dist; - shape_ = csr_view.shape(); - // determine the distribution of data - // auto hb = dist.halo(); - std::size_t gran = dist.granularity(); - // TODO: make this an error that is reported back to user - assert(nnz_ % gran == 0 && "size must be a multiple of the granularity"); - // assert(hb.prev % gran == 0 && "size must be a multiple of the granularity"); - // assert(hb.next % gran == 0 && "size must be a multiple of 
the granularity"); - - - auto rank = rows_backend_.getrank(); - vals_data_ = std::make_shared>(nnz_); - cols_data_ = std::make_shared>(nnz_); - - dr::mp::copy(std::ranges::subrange(csr_view.values_data(), csr_view.values_data() + nnz_), vals_data_->begin()); - dr::mp::copy(std::ranges::subrange(csr_view.colind_data(), csr_view.colind_data() + nnz_), cols_data_->begin()); - - assert(*csr_view.rowptr_data() == 0); - for (int i = 0; i < default_comm().size(); i++) { - auto first_index = vals_data_->get_segment_offset(i); - auto last_index = vals_data_->get_segment_offset(i + 1) - 1; - auto lower_limit = std::distance(csr_view.rowptr_data(), std::upper_bound(csr_view.rowptr_data(), csr_view.rowptr_data() + shape_[0], first_index)) - 1; - auto higher_limit = std::distance(csr_view.rowptr_data(), std::upper_bound(csr_view.rowptr_data(), csr_view.rowptr_data() + shape_[0], last_index)); - row_offsets_.push_back(lower_limit); - row_sizes_.push_back(higher_limit - lower_limit); - } - - auto lower_limit = row_offsets_[rank]; - row_size_ = row_sizes_[rank]; - if (row_size_ != get_row_size(rank)) { - fmt::print("hmmmm? 
{} {} {} {}\n", rank, lower_limit, row_size_, get_row_size(rank)); - } - - rows_data_ = static_cast(rows_backend_.allocate(row_size_ * sizeof(I))); - std::copy(csr_view.rowptr_data() + lower_limit, csr_view.rowptr_data() + lower_limit + row_size_, rows_data_); - std::size_t segment_index = 0; - segment_size_ = vals_data_->segment_size(); - assert(segment_size_ == cols_data_->segment_size()); - for (std::size_t i = 0; i < nnz_; i += segment_size_) { - segments_.emplace_back(this, segment_index++, - std::min(segment_size_, nnz_ - i), segment_size_); - } - - fence(); - } - - - std::size_t segment_size_ = 0; - std::size_t row_size_ = 0; - std::vector row_offsets_; - std::vector row_sizes_; - - - index_type *rows_data_ = nullptr; - BackendT rows_backend_; - - distribution distribution_; - dr::index shape_; - std::size_t nnz_; - std::vector> segments_; - std::shared_ptr> vals_data_; - std::shared_ptr> cols_data_; }; } \ No newline at end of file diff --git a/include/dr/mp/containers/matrix_formats/csr_matrix_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_matrix_distribution.hpp new file mode 100644 index 0000000000..dd7d58c1f5 --- /dev/null +++ b/include/dr/mp/containers/matrix_formats/csr_matrix_distribution.hpp @@ -0,0 +1,122 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause +#pragma once +#include +#include +#include + +namespace dr::mp { + +template +class csr_matrix_distribution { +public: + using value_type = dr::sp::matrix_entry; + using elem_type = T; + using index_type = I; + using difference_type = std::ptrdiff_t; + + csr_matrix_distribution(const csr_matrix_distribution &) = delete; + csr_matrix_distribution &operator=(const csr_matrix_distribution &) = delete; + csr_matrix_distribution(csr_matrix_distribution &&) { assert(false); } + + /// Constructor + csr_matrix_distribution(dr::sp::csr_matrix_view csr_view, distribution dist = distribution()) { + init(csr_view, dist); + } + + 
~csr_matrix_distribution() { + if (!finalized()) { + fence(); + if (rows_data_ != nullptr) { + rows_backend_.deallocate(rows_data_, row_size_ * sizeof(index_type)); + } + + // delete halo_; TODO + } + } + std::size_t get_id_in_segment(std::size_t offset) const { + return offset % segment_size_; + } + std::size_t get_segment_from_offset(std::size_t offset) const { + return offset / segment_size_; + } + auto segments() const { return rng::views::all(segments_); } + auto nnz() const {return nnz_;} + auto shape() const {return shape_;} + void fence() { + rows_backend_.fence(); + } +private: + friend csr_segment_iterator; + std::size_t get_row_size(std::size_t rank) { + return row_sizes_[rank]; + } + + void init(dr::sp::csr_matrix_view csr_view, auto dist) { + nnz_ = csr_view.size(); + distribution_ = dist; + shape_ = csr_view.shape(); + // determine the distribution of data + // auto hb = dist.halo(); + std::size_t gran = dist.granularity(); + // TODO: make this an error that is reported back to user + assert(nnz_ % gran == 0 && "size must be a multiple of the granularity"); + // assert(hb.prev % gran == 0 && "size must be a multiple of the granularity"); + // assert(hb.next % gran == 0 && "size must be a multiple of the granularity"); + + + auto rank = rows_backend_.getrank(); + vals_data_ = std::make_shared>(nnz_); + cols_data_ = std::make_shared>(nnz_); + + dr::mp::copy(std::ranges::subrange(csr_view.values_data(), csr_view.values_data() + nnz_), vals_data_->begin()); + dr::mp::copy(std::ranges::subrange(csr_view.colind_data(), csr_view.colind_data() + nnz_), cols_data_->begin()); + + assert(*csr_view.rowptr_data() == 0); + for (int i = 0; i < default_comm().size(); i++) { + auto first_index = vals_data_->get_segment_offset(i); + auto last_index = vals_data_->get_segment_offset(i + 1) - 1; + auto lower_limit = std::distance(csr_view.rowptr_data(), std::upper_bound(csr_view.rowptr_data(), csr_view.rowptr_data() + shape_[0], first_index)) - 1; + auto higher_limit = 
std::distance(csr_view.rowptr_data(), std::upper_bound(csr_view.rowptr_data(), csr_view.rowptr_data() + shape_[0], last_index)); + row_offsets_.push_back(lower_limit); + row_sizes_.push_back(higher_limit - lower_limit); + } + + auto lower_limit = row_offsets_[rank]; + row_size_ = row_sizes_[rank]; + if (row_size_ != get_row_size(rank)) { + fmt::print("hmmmm? {} {} {} {}\n", rank, lower_limit, row_size_, get_row_size(rank)); + } + + rows_data_ = static_cast(rows_backend_.allocate(row_size_ * sizeof(I))); + std::copy(csr_view.rowptr_data() + lower_limit, csr_view.rowptr_data() + lower_limit + row_size_, rows_data_); + std::size_t segment_index = 0; + segment_size_ = vals_data_->segment_size(); + assert(segment_size_ == cols_data_->segment_size()); + for (std::size_t i = 0; i < nnz_; i += segment_size_) { + segments_.emplace_back(this, segment_index++, + std::min(segment_size_, nnz_ - i), segment_size_); + } + + fence(); + } + + + std::size_t segment_size_ = 0; + std::size_t row_size_ = 0; + std::vector row_offsets_; + std::vector row_sizes_; + + + index_type *rows_data_ = nullptr; + BackendT rows_backend_; + + distribution distribution_; + dr::index shape_; + std::size_t nnz_; + std::vector> segments_; + std::shared_ptr> vals_data_; + std::shared_ptr> cols_data_; +}; +} \ No newline at end of file diff --git a/include/dr/mp/containers/sparse_matrix_segment.hpp b/include/dr/mp/containers/matrix_formats/csr_matrix_segment.hpp similarity index 84% rename from include/dr/mp/containers/sparse_matrix_segment.hpp rename to include/dr/mp/containers/matrix_formats/csr_matrix_segment.hpp index 85a23df43d..947d65da57 100644 --- a/include/dr/mp/containers/sparse_matrix_segment.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_matrix_segment.hpp @@ -6,17 +6,17 @@ namespace dr::mp { -template class dsm_segment_iterator; +template class csr_segment_iterator; -template class dsm_segment_reference { - using iterator = dsm_segment_iterator; +template class csr_segment_reference { 
+ using iterator = csr_segment_iterator; public: using value_type = typename DSM::value_type; using index_type = typename DSM::index_type; using elem_type = typename DSM::elem_type; - dsm_segment_reference(const iterator it) : iterator_(it) {} + csr_segment_reference(const iterator it) : iterator_(it) {} operator value_type() const { return iterator_.get(); } operator std::pair, elem_type>() const { @@ -36,7 +36,7 @@ template class dsm_segment_reference { iterator_.put(value); return *this; } - auto operator=(const dsm_segment_reference &other) const { + auto operator=(const csr_segment_reference &other) const { *this = value_type(other); return *this; } @@ -44,24 +44,23 @@ template class dsm_segment_reference { private: const iterator iterator_; -}; // dsm_segment_reference +}; // csr_segment_reference -template class dsm_segment_iterator { +template class csr_segment_iterator { public: using value_type = typename DSM::value_type; using index_type = typename DSM::index_type; using elem_type = typename DSM::elem_type; - using size_type = typename DSM::size_type; using difference_type = typename DSM::difference_type; - dsm_segment_iterator() = default; - dsm_segment_iterator(DSM *dsm, std::size_t segment_index, std::size_t index) { + csr_segment_iterator() = default; + csr_segment_iterator(DSM *dsm, std::size_t segment_index, std::size_t index) { dsm_ = dsm; segment_index_ = segment_index; index_ = index; } - auto operator<=>(const dsm_segment_iterator &other) const noexcept { + auto operator<=>(const csr_segment_iterator &other) const noexcept { // assertion below checks against compare dereferenceable iterator to a // singular iterator and against attempt to compare iterators from different // sequences like _Safe_iterator does @@ -72,7 +71,7 @@ template class dsm_segment_iterator { } // Comparison - bool operator==(const dsm_segment_iterator &other) const noexcept { + bool operator==(const csr_segment_iterator &other) const noexcept { return (*this <=> other) == 
0; } @@ -86,7 +85,7 @@ template class dsm_segment_iterator { auto &operator-=(difference_type n) { return *this += (-n); } - difference_type operator-(const dsm_segment_iterator &other) const noexcept { + difference_type operator-(const csr_segment_iterator &other) const noexcept { assert(dsm_ != nullptr && dsm_ == other.dsm_); assert(index_ >= other.index_); return index_ - other.index_; @@ -126,14 +125,14 @@ template class dsm_segment_iterator { } // When *this is not first in the expression - friend auto operator+(difference_type n, const dsm_segment_iterator &other) { + friend auto operator+(difference_type n, const csr_segment_iterator &other) { return other + n; } // dereference auto operator*() const { assert(dsm_ != nullptr); - return dsm_segment_reference{*this}; + return csr_segment_reference{*this}; } auto operator[](difference_type n) const { assert(dsm_ != nullptr); @@ -158,7 +157,7 @@ template class dsm_segment_iterator { void get_value(elem_type *dst, std::size_t size) const { assert(dsm_ != nullptr); - assert(segment_index_ * dsm_->segment_size_ + index_ < dsm_->size()); + assert(segment_index_ * dsm_->segment_size_ + index_ < dsm_->nnz_); (dsm_->vals_data_->segments()[segment_index_].begin() + index_).get(dst, size); } @@ -170,7 +169,7 @@ template class dsm_segment_iterator { void get_index(dr::index *dst, std::size_t size) const { assert(dsm_ != nullptr); - assert(segment_index_ * dsm_->segment_size_ + index_ < dsm_->size()); + assert(segment_index_ * dsm_->segment_size_ + index_ < dsm_->nnz_); auto col_data = new index_type[size]; (dsm_->cols_data_->segments()[segment_index_].begin() + index_).get(col_data, size); index_type *rows; @@ -214,7 +213,7 @@ template class dsm_segment_iterator { void put(const value_type *dst, std::size_t size) const { assert(dsm_ != nullptr); - assert(segment_index_ * dsm_->segment_size_ + index_ < dsm_->size()); + assert(segment_index_ * dsm_->segment_size_ + index_ < dsm_->nnz_); 
(dsm_->vals_data_->segments()[segment_index_].begin() + index_).put(dst, size); } @@ -236,17 +235,16 @@ template class dsm_segment_iterator { DSM *dsm_ = nullptr; std::size_t segment_index_ = 0; std::size_t index_ = 0; - std::size_t segment_size_ = 0; -}; // dsm_segment_iterator +}; // csr_segment_iterator -template class dsm_segment { +template class csr_segment { private: - using iterator = dsm_segment_iterator; + using iterator = csr_segment_iterator; public: using difference_type = std::ptrdiff_t; - dsm_segment() = default; - dsm_segment(DSM *dsm, std::size_t segment_index, std::size_t size, + csr_segment() = default; + csr_segment(DSM *dsm, std::size_t segment_index, std::size_t size, std::size_t reserved) { dsm_ = dsm; segment_index_ = segment_index; @@ -273,16 +271,16 @@ template class dsm_segment { std::size_t segment_index_; std::size_t size_; std::size_t reserved_; -}; // dsm_segment +}; // csr_segment } // namespace dr::mp namespace std { template - struct tuple_size> : std::integral_constant {}; + struct tuple_size> : std::integral_constant {}; template - struct tuple_element> + struct tuple_element> : tuple_element, typename DSM::elem_type>> {}; } // namespace std diff --git a/include/dr/sp/views/csr_matrix_view.hpp b/include/dr/sp/views/csr_matrix_view.hpp index d0705175be..ecc6350be5 100644 --- a/include/dr/sp/views/csr_matrix_view.hpp +++ b/include/dr/sp/views/csr_matrix_view.hpp @@ -138,7 +138,7 @@ class csr_matrix_view using map_type = T; using iterator = csr_matrix_view_iterator; - + csr_matrix_view() = default; csr_matrix_view(TIter values, IIter rowptr, IIter colind, key_type shape, size_type nnz, size_type rank) : values_(values), rowptr_(rowptr), colind_(colind), shape_(shape), From b84ecc0926a7ebf6d31fa8667497e9e0a47037d2 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Wed, 21 Aug 2024 09:08:40 +0200 Subject: [PATCH 04/68] Add initial gemv implementation --- examples/mp/sparse_matrix.cpp | 20 +++++++ include/dr/detail/communicator.hpp | 5 
+- include/dr/mp.hpp | 1 + include/dr/mp/algorithms/matrix/gemv.hpp | 60 +++++++++++++++++++ .../containers/distributed_sparse_matrix.hpp | 8 +++ .../dr/mp/containers/distributed_vector.hpp | 4 +- .../csr_matrix_distribution.hpp | 42 ++++++++++++- 7 files changed, 135 insertions(+), 5 deletions(-) create mode 100644 include/dr/mp/algorithms/matrix/gemv.hpp diff --git a/examples/mp/sparse_matrix.cpp b/examples/mp/sparse_matrix.cpp index d1cf4216a5..12b4210ee5 100644 --- a/examples/mp/sparse_matrix.cpp +++ b/examples/mp/sparse_matrix.cpp @@ -48,6 +48,26 @@ int main(int argc, char **argv) { m.fence(); } dr::sp::__detail::destroy_csr_matrix_view(local_data, std::allocator{}); + + std::vector res(m.shape().first); + std::vector a(m.shape().second); + for (int i = 0; i < a.size(); i++) { + a[i] = i; + } + gemv(0, res, m, a); + + std::vector ref(m.shape().first); + if (dr::mp::default_comm().rank() == 0) { + for (auto [index, val]: m) { + auto [m, n] = index; + ref[m] += n * val; + } + for (int i = 0; i < m.shape().first; i++) { + if (res[i] != ref[i]) { + fmt::print("mismatching outcome {} {}\n", res[i], ref[i]); + } + } + } } mp::finalize(); diff --git a/include/dr/detail/communicator.hpp b/include/dr/detail/communicator.hpp index 331253ab63..ceb7141171 100644 --- a/include/dr/detail/communicator.hpp +++ b/include/dr/detail/communicator.hpp @@ -61,9 +61,10 @@ class communicator { mpi_comm_); } - void gather(const void *src, void *dst, std::size_t count, + template + void gather(const T *src, T *dst, std::size_t count, std::size_t root) const { - MPI_Gather_c(src, count, MPI_BYTE, dst, count, MPI_BYTE, root, mpi_comm_); + MPI_Gather_c(src, count * sizeof(T), MPI_BYTE, dst, count * sizeof(T), MPI_BYTE, root, mpi_comm_); } template diff --git a/include/dr/mp.hpp b/include/dr/mp.hpp index 353bbb21f6..dbd19f679d 100644 --- a/include/dr/mp.hpp +++ b/include/dr/mp.hpp @@ -76,6 +76,7 @@ #include #include #include +#include #include #include #include diff --git 
a/include/dr/mp/algorithms/matrix/gemv.hpp b/include/dr/mp/algorithms/matrix/gemv.hpp new file mode 100644 index 0000000000..cf1d0c54a9 --- /dev/null +++ b/include/dr/mp/algorithms/matrix/gemv.hpp @@ -0,0 +1,60 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once +#include +#include +#include +#include +#include + +namespace dr::mp { + +template C, rng::input_range B, typename Backend> //TODO?:, typename MatDistr> +void gemv(int root, C &res, distributed_sparse_matrix &a, B &b) { + if (default_comm().rank() == root) { + assert(a.shape().first == res.size()); + assert(a.shape().second == b.size()); + } + // copy b to all machines + auto communicator = default_comm(); + __detail::allocator alloc; + auto broadcasted_b = alloc.allocate(a.shape().second); + if (communicator.rank() == root) { + rng::copy(b.begin(), b.end(), broadcasted_b); + } + communicator.bcast(broadcasted_b, a.shape().second * sizeof(T), root); + + // multiply b by local segment + auto res_alloc = alloc.allocate(a.shape().first); + a.local_gemv(res_alloc, broadcasted_b); + + // reduce result by adding partial results + if (default_comm().rank() == root) { + auto gathered_res = alloc.allocate(a.shape().first * communicator.size()); + communicator.gather(res_alloc, gathered_res, a.shape().first, root); + rng::fill(res, 0); + for (int i = 0; i < communicator.size(); i++) { + auto row_bounds = a.local_row_bounds(i); + for (int j = row_bounds.first; j < row_bounds.second; j++) { + res[j] += gathered_res[a.shape().first * i + j - row_bounds.first]; + } + } + alloc.deallocate(gathered_res, a.shape().first * communicator.size()); + } + else { + communicator.gather(res_alloc, static_cast(nullptr), a.shape().first, root); + } + alloc.deallocate(broadcasted_b, a.shape().second); + alloc.deallocate(res_alloc, a.shape().first); + // a.fence(); + // if (default_comm().rank() == root) { + // for (int i = 0; i < a.shape().first; i++) { + // fmt::print("Result 
{} {}\n", i, res[i]); + // } + // } + +} + +} \ No newline at end of file diff --git a/include/dr/mp/containers/distributed_sparse_matrix.hpp b/include/dr/mp/containers/distributed_sparse_matrix.hpp index 51ae4a2ffb..1db057245c 100644 --- a/include/dr/mp/containers/distributed_sparse_matrix.hpp +++ b/include/dr/mp/containers/distributed_sparse_matrix.hpp @@ -142,6 +142,14 @@ class distributed_sparse_matrix { distribution_.fence(); } + template + auto local_gemv(C &res, A &vals) const { + distribution_.local_gemv(res, vals); + } + + auto local_row_bounds(std::size_t rank) const { + return distribution_.local_row_bounds(rank); + } private: MatrixDistrT distribution_; diff --git a/include/dr/mp/containers/distributed_vector.hpp b/include/dr/mp/containers/distributed_vector.hpp index 9bf8349678..79c1845289 100644 --- a/include/dr/mp/containers/distributed_vector.hpp +++ b/include/dr/mp/containers/distributed_vector.hpp @@ -77,7 +77,7 @@ class MpiBackend { #endif } - std::size_t getrank() { return win_.communicator().rank(); } + std::size_t getrank() const { return win_.communicator().rank(); } void fence() { win_.fence(); } }; @@ -121,7 +121,7 @@ class IshmemBackend { ishmem_putmem(dst, src, datalen, segment_index); } - std::size_t getrank() { + std::size_t getrank() const { auto my_process_segment_index = ishmem_my_pe(); DRLOG("called ishmem_my_pe() -> {}", my_process_segment_index); return my_process_segment_index; diff --git a/include/dr/mp/containers/matrix_formats/csr_matrix_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_matrix_distribution.hpp index dd7d58c1f5..da12c07886 100644 --- a/include/dr/mp/containers/matrix_formats/csr_matrix_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_matrix_distribution.hpp @@ -47,6 +47,38 @@ class csr_matrix_distribution { void fence() { rows_backend_.fence(); } + + template + auto local_gemv(C &res, A &vals) const { + // if (dr::mp::use_sycl()) { + + // } + // else { + auto rank = 
rows_backend_.getrank(); + auto size = row_sizes_[rank]; + auto row_i = -1; + auto position = segment_size_ * rank; + auto current_row_position = rows_data_[0]; + auto local_vals = dr::mp::local_segment(*vals_data_); + auto local_cols = dr::mp::local_segment(*cols_data_); + + for (int i = 0; i < segment_size_; i++) { + while (row_i + 1 < size && position + i >= current_row_position) { + row_i++; + current_row_position = rows_data_[row_i + 1]; + } + res[row_i] += local_vals[i] * vals[local_cols[i]]; + } + + // fmt::print("offset, rank {} {}\n", row_offsets_[ rows_backend_.getrank()], rows_backend_.getrank()); + // for (int i = 0; i < size; i++) { + // fmt::print("ledata, rank, i {} {} {}\n", res[i], rows_backend_.getrank(), i); + // } + // } + } + auto local_row_bounds(std::size_t rank) const { + return std::pair(row_offsets_[rank], row_offsets_[rank] + row_sizes_[rank]); + } private: friend csr_segment_iterator; std::size_t get_row_size(std::size_t rank) { @@ -98,7 +130,15 @@ class csr_matrix_distribution { segments_.emplace_back(this, segment_index++, std::min(segment_size_, nnz_ - i), segment_size_); } - + + // for (int i = 0; i < row_size_; i++) { + // fmt::print("row, i, rank {} {} {}\n", rows_data_[i], i, rank); + // } + // fence(); + // for (int i = 0; i < vals_data_->segments()[rank].size(); i++) { + // fmt::print("val, col, i, rank {} {} {} {}\n", vals_data_->segments()[rank][i], cols_data_->segments()[rank][i],i, rank); + // } + fence(); } From 1c1dad7f1a8108f4bedb70292de58322fa4d6b9f Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Wed, 21 Aug 2024 11:16:03 +0200 Subject: [PATCH 05/68] Move matrix related files from sp to general module --- examples/mp/sparse_matrix.cpp | 59 +++-- examples/sp/gemv_benchmark.cpp | 2 +- include/dr/{sp/util => detail}/coo_matrix.hpp | 12 +- .../generate_random_csr.hpp} | 6 +- .../containers => detail}/matrix_entry.hpp | 30 +-- include/dr/detail/matrix_io.hpp | 231 ++++++++++++++++++ include/dr/mp.hpp | 4 + 
.../containers/distributed_sparse_matrix.hpp | 14 +- ...stribution.hpp => csr_eq_distribution.hpp} | 26 +- ..._matrix_segment.hpp => csr_eq_segment.hpp} | 44 ++-- .../dr/sp/algorithms/matrix/local_gemv.hpp | 8 +- .../containers/distributed_dense_matrix.hpp | 12 +- .../sp/containers/sequential/dense_matrix.hpp | 4 +- include/dr/sp/containers/sparse_matrix.hpp | 18 +- include/dr/sp/util/matrix_io.hpp | 225 +---------------- include/dr/sp/views/dense_column_view.hpp | 6 +- include/dr/sp/views/dense_matrix_iterator.hpp | 6 +- include/dr/sp/views/dense_matrix_view.hpp | 4 +- include/dr/sp/views/dense_row_view.hpp | 6 +- include/dr/{sp => }/views/csr_matrix_view.hpp | 12 +- 20 files changed, 379 insertions(+), 350 deletions(-) rename include/dr/{sp/util => detail}/coo_matrix.hpp (94%) rename include/dr/{sp/util/generate_random.hpp => detail/generate_random_csr.hpp} (93%) rename include/dr/{sp/containers => detail}/matrix_entry.hpp (88%) create mode 100644 include/dr/detail/matrix_io.hpp rename include/dr/mp/containers/matrix_formats/{csr_matrix_distribution.hpp => csr_eq_distribution.hpp} (87%) rename include/dr/mp/containers/matrix_formats/{csr_matrix_segment.hpp => csr_eq_segment.hpp} (83%) rename include/dr/{sp => }/views/csr_matrix_view.hpp (96%) diff --git a/examples/mp/sparse_matrix.cpp b/examples/mp/sparse_matrix.cpp index 12b4210ee5..8c1cc765e1 100644 --- a/examples/mp/sparse_matrix.cpp +++ b/examples/mp/sparse_matrix.cpp @@ -16,7 +16,7 @@ int main(int argc, char **argv) { } std::string fname(argv[1]); - auto local_data = dr::sp::read_csr(fname); + auto local_data = dr::read_csr(fname); #ifdef SYCL_LANGUAGE_VERSION mp::init(sycl::default_selector_v); #else @@ -24,41 +24,45 @@ int main(int argc, char **argv) { #endif { - mp::distributed_sparse_matrix m(local_data); + mp::distributed_sparse_matrix m(local_data); fmt::print("{}\n", m.size()); - for (int i = 0; i < dr::mp::default_comm().size(); i++) { - if (dr::mp::default_comm().rank() == i) { - auto csr_iter = 
local_data.begin(); - int j = 0; - // fmt::print("{}\n", i); - for (auto [index, val]: m) { - auto [m, n] = index; + // for (int i = 0; i < dr::mp::default_comm().size(); i++) { + // if (dr::mp::default_comm().rank() == i) { + // auto csr_iter = local_data.begin(); + // int j = 0; + // // fmt::print("{}\n", i); + // for (auto [index, val]: m) { + // auto [m, n] = index; - auto [index_csr, val_csr] = *csr_iter; - auto [m_csr, n_csr] = index_csr; - auto check = m == m_csr && n_csr == n && val == val_csr; - if (!check) { - fmt::print("{} {} {} {} {} {} {}\n", j, m, m_csr, n, n_csr, val, val_csr); - } - assert(check); - csr_iter++; - j++; - } - } - m.fence(); - } - dr::sp::__detail::destroy_csr_matrix_view(local_data, std::allocator{}); + // auto [index_csr, val_csr] = *csr_iter; + // auto [m_csr, n_csr] = index_csr; + // auto check = m == m_csr && n_csr == n && val == val_csr; + // if (!check) { + // fmt::print("{} {} {} {} {} {} {}\n", j, m, m_csr, n, n_csr, val, val_csr); + // } + // assert(check); + // csr_iter++; + // j++; + // } + // } + // m.fence(); + // } - std::vector res(m.shape().first); - std::vector a(m.shape().second); + std::vector res(m.shape().first); + std::vector a(m.shape().second); for (int i = 0; i < a.size(); i++) { a[i] = i; } + m.fence(); + fmt::print("gemv started\n"); gemv(0, res, m, a); + m.fence(); + fmt::print("gemv finished\n"); - std::vector ref(m.shape().first); + std::vector ref(m.shape().first); if (dr::mp::default_comm().rank() == 0) { - for (auto [index, val]: m) { + for (auto a : local_data) { + auto [index, val] = a; auto [m, n] = index; ref[m] += n * val; } @@ -69,6 +73,7 @@ int main(int argc, char **argv) { } } } + dr::__detail::destroy_csr_matrix_view(local_data, std::allocator{}); mp::finalize(); return 0; diff --git a/examples/sp/gemv_benchmark.cpp b/examples/sp/gemv_benchmark.cpp index 62b83f6b86..c29182dd60 100644 --- a/examples/sp/gemv_benchmark.cpp +++ b/examples/sp/gemv_benchmark.cpp @@ -252,7 +252,7 @@ int main(int 
argc, char **argv) { sp::vector> y(local_mat.shape()[1], 0, allocator); - sp::__detail::destroy_csr_matrix_view(local_mat, std::allocator{}); + dr::__detail::destroy_csr_matrix_view(local_mat, std::allocator{}); sp::csr_matrix_view a_view(values, rowptr, colind, shape, nnz, 0); diff --git a/include/dr/sp/util/coo_matrix.hpp b/include/dr/detail/coo_matrix.hpp similarity index 94% rename from include/dr/sp/util/coo_matrix.hpp rename to include/dr/detail/coo_matrix.hpp index 0e8b871504..1891510f04 100644 --- a/include/dr/sp/util/coo_matrix.hpp +++ b/include/dr/detail/coo_matrix.hpp @@ -4,18 +4,18 @@ #pragma once -#include +#include #include #include -namespace dr::sp { +namespace dr { namespace __detail { template > class coo_matrix { public: - using value_type = dr::sp::matrix_entry; + using value_type = dr::matrix_entry; using scalar_type = T; using index_type = I; using size_type = std::size_t; @@ -33,8 +33,8 @@ class coo_matrix { using iterator = typename backend_type::iterator; using const_iterator = typename backend_type::const_iterator; - using reference = dr::sp::matrix_ref; - using const_reference = dr::sp::matrix_ref, I>; + using reference = dr::matrix_ref; + using const_reference = dr::matrix_ref, I>; using scalar_reference = T &; @@ -167,4 +167,4 @@ class coo_matrix { } // namespace __detail -} // namespace dr::sp +} // namespace dr diff --git a/include/dr/sp/util/generate_random.hpp b/include/dr/detail/generate_random_csr.hpp similarity index 93% rename from include/dr/sp/util/generate_random.hpp rename to include/dr/detail/generate_random_csr.hpp index a284ce927a..45c175c3a8 100644 --- a/include/dr/sp/util/generate_random.hpp +++ b/include/dr/detail/generate_random_csr.hpp @@ -5,11 +5,11 @@ #pragma once #include -#include +#include #include #include -namespace dr::sp { +namespace dr { namespace { @@ -86,7 +86,7 @@ auto generate_random_csr(dr::index shape, double density = 0.01, rowptr[r + 1] = nnz; } - return csr_matrix_view(values, rowptr, colind, 
shape, nnz, 0); + return dr::views::csr_matrix_view(values, rowptr, colind, shape, nnz, 0); } } // namespace dr::sp diff --git a/include/dr/sp/containers/matrix_entry.hpp b/include/dr/detail/matrix_entry.hpp similarity index 88% rename from include/dr/sp/containers/matrix_entry.hpp rename to include/dr/detail/matrix_entry.hpp index f9fceba8ad..e56251f87c 100644 --- a/include/dr/sp/containers/matrix_entry.hpp +++ b/include/dr/detail/matrix_entry.hpp @@ -10,7 +10,7 @@ #include -namespace dr::sp { +namespace dr { template concept getable = requires(T x) { @@ -91,28 +91,28 @@ template class matrix_entry { map_type value_; }; -} // namespace dr::sp +} // namespace dr namespace std { template requires(!std::is_const_v) -void swap(dr::sp::matrix_entry a, dr::sp::matrix_entry b) { - dr::sp::matrix_entry other = a; +void swap(dr::matrix_entry a, dr::matrix_entry b) { + dr::matrix_entry other = a; a = b; b = other; } template -struct tuple_element> +struct tuple_element> : tuple_element, T>> {}; template -struct tuple_size> : integral_constant {}; +struct tuple_size> : integral_constant {}; } // namespace std -namespace dr::sp { +namespace dr { template class matrix_ref { @@ -125,7 +125,7 @@ class matrix_ref { using scalar_reference = TRef; - using value_type = dr::sp::matrix_entry; + using value_type = dr::matrix_entry; matrix_ref(dr::index index, scalar_reference value) : index_(index), value_(value) {} @@ -189,28 +189,28 @@ class matrix_ref { scalar_reference value_; }; -} // namespace dr::sp +} // namespace dr namespace std { template requires(!std::is_const_v) -void swap(dr::sp::matrix_ref a, dr::sp::matrix_ref b) { - dr::sp::matrix_entry other = a; +void swap(dr::matrix_ref a, dr::matrix_ref b) { + dr::matrix_entry other = a; a = b; b = other; } template -struct tuple_element> +struct tuple_element> : tuple_element, TRef>> {}; template -struct tuple_size> +struct tuple_size> : integral_constant {}; template -inline decltype(auto) get(dr::sp::matrix_ref ref) +inline 
decltype(auto) get(dr::matrix_ref ref) requires(Index <= 1) { if constexpr (Index == 0) { @@ -222,7 +222,7 @@ inline decltype(auto) get(dr::sp::matrix_ref ref) } template -inline decltype(auto) get(dr::sp::matrix_entry entry) +inline decltype(auto) get(dr::matrix_entry entry) requires(Index <= 1) { if constexpr (Index == 0) { diff --git a/include/dr/detail/matrix_io.hpp b/include/dr/detail/matrix_io.hpp new file mode 100644 index 0000000000..ff429c2b2b --- /dev/null +++ b/include/dr/detail/matrix_io.hpp @@ -0,0 +1,231 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace dr { + +namespace __detail { + +// Preconditions: +// 1) `tuples` sorted by row, column +// 2) `tuples` has shape `shape` +// 3) `tuples` has `nnz` elements +template +auto convert_to_csr(Tuples &&tuples, dr::index<> shape, std::size_t nnz, + Allocator &&allocator) { + auto &&[index, v] = *tuples.begin(); + auto &&[i, j] = index; + + using T = std::remove_reference_t; + using I = std::remove_reference_t; + + typename std::allocator_traits::template rebind_alloc + i_allocator(allocator); + + T *values = allocator.allocate(nnz); + I *rowptr = i_allocator.allocate(shape[0] + 1); + I *colind = i_allocator.allocate(nnz); + + rowptr[0] = 0; + + std::size_t r = 0; + std::size_t c = 0; + for (auto iter = tuples.begin(); iter != tuples.end(); ++iter) { + auto &&[index, value] = *iter; + auto &&[i, j] = index; + + values[c] = value; + colind[c] = j; + + while (r < i) { + assert(r + 1 <= shape[0]); + // throw std::runtime_error("csr_matrix_impl_: given invalid matrix"); + rowptr[r + 1] = c; + r++; + } + c++; + + assert(c <= nnz); + // throw std::runtime_error("csr_matrix_impl_: given invalid matrix"); + } + + for (; r < shape[0]; r++) { + rowptr[r + 1] = nnz; + } + + return dr::views::csr_matrix_view(values, rowptr, colind, + dr::index(shape[0], 
shape[1]), nnz, 0); +} + +/// Read in the Matrix Market file at location `file_path` and a return +/// a coo_matrix data structure with its contents. +template +inline coo_matrix read_coo_matrix(std::string file_path, bool one_indexed = true) { + using size_type = std::size_t; + + std::ifstream f; + + f.open(file_path.c_str()); + + if (!f.is_open()) { + // TODO better choice of exception. + throw std::runtime_error("mmread: cannot open " + file_path); + } + + std::string buf; + + // Make sure the file is matrix market matrix, coordinate, and check whether + // it is symmetric. If the matrix is symmetric, non-diagonal elements will + // be inserted in both (i, j) and (j, i). Error out if skew-symmetric or + // Hermitian. + std::getline(f, buf); + std::istringstream ss(buf); + std::string item; + ss >> item; + if (item != "%%MatrixMarket") { + throw std::runtime_error(file_path + + " could not be parsed as a Matrix Market file."); + } + ss >> item; + if (item != "matrix") { + throw std::runtime_error(file_path + + " could not be parsed as a Matrix Market file."); + } + ss >> item; + if (item != "coordinate") { + throw std::runtime_error(file_path + + " could not be parsed as a Matrix Market file."); + } + bool pattern; + ss >> item; + if (item == "pattern") { + pattern = true; + } else { + pattern = false; + } + // TODO: do something with real vs. integer vs. pattern? 
+ ss >> item; + bool symmetric; + if (item == "general") { + symmetric = false; + } else if (item == "symmetric") { + symmetric = true; + } else { + throw std::runtime_error(file_path + " has an unsupported matrix type"); + } + + bool outOfComments = false; + while (!outOfComments) { + std::getline(f, buf); + + if (buf[0] != '%') { + outOfComments = true; + } + } + + I m, n, nnz; + // std::istringstream ss(buf); + ss.clear(); + ss.str(buf); + ss >> m >> n >> nnz; + + // NOTE for symmetric matrices: `nnz` holds the number of stored values in + // the matrix market file, while `matrix.nnz_` will hold the total number of + // stored values (including "mirrored" symmetric values). + coo_matrix matrix({m, n}); + if (symmetric) { + matrix.reserve(2 * nnz); + } else { + matrix.reserve(nnz); + } + + size_type c = 0; + while (std::getline(f, buf)) { + I i, j; + T v; + std::istringstream ss(buf); + if (!pattern) { + ss >> i >> j >> v; + } else { + ss >> i >> j; + v = T(1); + } + if (one_indexed) { + i--; + j--; + } + + if (i >= m || j >= n) { + throw std::runtime_error( + "read_MatrixMarket: file has nonzero out of bounds."); + } + + matrix.push_back({{i, j}, v}); + + if (symmetric && i != j) { + matrix.push_back({{j, i}, v}); + } + + c++; + if (c > nnz) { + throw std::runtime_error("read_MatrixMarket: error reading Matrix Market " + "file, file has more nonzeros than reported."); + } + } + + auto sort_fn = [](const auto &a, const auto &b) { + auto &&[a_index, a_value] = a; + auto &&[b_index, b_value] = b; + auto &&[a_i, a_j] = a_index; + auto &&[b_i, b_j] = b_index; + if (a_i < b_i) { + return true; + } else if (a_i == b_i) { + if (a_j < b_j) { + return true; + } + } + return false; + }; + + std::sort(matrix.begin(), matrix.end(), sort_fn); + + f.close(); + + return matrix; +} + +template +void destroy_csr_matrix_view(dr::views::csr_matrix_view view, + Allocator &&alloc) { + alloc.deallocate(view.values_data(), view.size()); + typename std::allocator_traits::template 
rebind_alloc i_alloc( + alloc); + i_alloc.deallocate(view.colind_data(), view.size()); + i_alloc.deallocate(view.rowptr_data(), view.shape()[0] + 1); +} + +} // namespace __detail + +template +auto read_csr(std::string file_path, bool one_indexed = true) { + auto m = __detail::read_coo_matrix(file_path, one_indexed); + auto shape = m.shape(); + auto nnz = m.size(); + + return __detail::convert_to_csr(m, shape, nnz, std::allocator{}); +} +} \ No newline at end of file diff --git a/include/dr/mp.hpp b/include/dr/mp.hpp index dbd19f679d..477b6d3984 100644 --- a/include/dr/mp.hpp +++ b/include/dr/mp.hpp @@ -48,6 +48,10 @@ #include #include #include +#include +#include +#include +#include #include #include diff --git a/include/dr/mp/containers/distributed_sparse_matrix.hpp b/include/dr/mp/containers/distributed_sparse_matrix.hpp index 1db057245c..ad29fbeee8 100644 --- a/include/dr/mp/containers/distributed_sparse_matrix.hpp +++ b/include/dr/mp/containers/distributed_sparse_matrix.hpp @@ -2,9 +2,9 @@ // // SPDX-License-Identifier: BSD-3-Clause #pragma once -#include -#include -#include +#include +#include +#include namespace dr::mp { @@ -18,15 +18,15 @@ concept matrix_distibution = {t.nnz()} -> std::same_as; {t.get_segment_from_offset(int())} -> std::same_as; {t.get_id_in_segment(int())} -> std::same_as; - T(dr::sp::csr_matrix_view(), distribution()); + T(dr::views::csr_matrix_view(), distribution()); }; -template > +template > requires(matrix_distibution) class distributed_sparse_matrix { public: - using value_type = dr::sp::matrix_entry; + using value_type = dr::matrix_entry; using elem_type = T; using index_type = I; using key_type = dr::index; @@ -121,7 +121,7 @@ class distributed_sparse_matrix { distributed_sparse_matrix(distributed_sparse_matrix &&) { assert(false); } /// Constructor - distributed_sparse_matrix(dr::sp::csr_matrix_view csr_view, distribution dist = distribution()): distribution_(csr_view, dist) {} + 
distributed_sparse_matrix(dr::views::csr_matrix_view csr_view, distribution dist = distribution()): distribution_(csr_view, dist) {} /// Returns iterator to beginning auto begin() const { return iterator(this, 0); } diff --git a/include/dr/mp/containers/matrix_formats/csr_matrix_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp similarity index 87% rename from include/dr/mp/containers/matrix_formats/csr_matrix_distribution.hpp rename to include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp index da12c07886..fe0269f9cf 100644 --- a/include/dr/mp/containers/matrix_formats/csr_matrix_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp @@ -2,30 +2,30 @@ // // SPDX-License-Identifier: BSD-3-Clause #pragma once -#include -#include -#include +#include +#include +#include namespace dr::mp { template -class csr_matrix_distribution { +class csr_eq_distribution { public: - using value_type = dr::sp::matrix_entry; + using value_type = dr::matrix_entry; using elem_type = T; using index_type = I; using difference_type = std::ptrdiff_t; - csr_matrix_distribution(const csr_matrix_distribution &) = delete; - csr_matrix_distribution &operator=(const csr_matrix_distribution &) = delete; - csr_matrix_distribution(csr_matrix_distribution &&) { assert(false); } + csr_eq_distribution(const csr_eq_distribution &) = delete; + csr_eq_distribution &operator=(const csr_eq_distribution &) = delete; + csr_eq_distribution(csr_eq_distribution &&) { assert(false); } /// Constructor - csr_matrix_distribution(dr::sp::csr_matrix_view csr_view, distribution dist = distribution()) { + csr_eq_distribution(dr::views::csr_matrix_view csr_view, distribution dist = distribution()) { init(csr_view, dist); } - ~csr_matrix_distribution() { + ~csr_eq_distribution() { if (!finalized()) { fence(); if (rows_data_ != nullptr) { @@ -80,12 +80,12 @@ class csr_matrix_distribution { return std::pair(row_offsets_[rank], row_offsets_[rank] + 
row_sizes_[rank]); } private: - friend csr_segment_iterator; + friend csr_eq_segment_iterator; std::size_t get_row_size(std::size_t rank) { return row_sizes_[rank]; } - void init(dr::sp::csr_matrix_view csr_view, auto dist) { + void init(dr::views::csr_matrix_view csr_view, auto dist) { nnz_ = csr_view.size(); distribution_ = dist; shape_ = csr_view.shape(); @@ -155,7 +155,7 @@ class csr_matrix_distribution { distribution distribution_; dr::index shape_; std::size_t nnz_; - std::vector> segments_; + std::vector> segments_; std::shared_ptr> vals_data_; std::shared_ptr> cols_data_; }; diff --git a/include/dr/mp/containers/matrix_formats/csr_matrix_segment.hpp b/include/dr/mp/containers/matrix_formats/csr_eq_segment.hpp similarity index 83% rename from include/dr/mp/containers/matrix_formats/csr_matrix_segment.hpp rename to include/dr/mp/containers/matrix_formats/csr_eq_segment.hpp index 947d65da57..af62dee919 100644 --- a/include/dr/mp/containers/matrix_formats/csr_matrix_segment.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_eq_segment.hpp @@ -6,17 +6,17 @@ namespace dr::mp { -template class csr_segment_iterator; +template class csr_eq_segment_iterator; -template class csr_segment_reference { - using iterator = csr_segment_iterator; +template class csr_eq_segment_reference { + using iterator = csr_eq_segment_iterator; public: using value_type = typename DSM::value_type; using index_type = typename DSM::index_type; using elem_type = typename DSM::elem_type; - csr_segment_reference(const iterator it) : iterator_(it) {} + csr_eq_segment_reference(const iterator it) : iterator_(it) {} operator value_type() const { return iterator_.get(); } operator std::pair, elem_type>() const { @@ -36,7 +36,7 @@ template class csr_segment_reference { iterator_.put(value); return *this; } - auto operator=(const csr_segment_reference &other) const { + auto operator=(const csr_eq_segment_reference &other) const { *this = value_type(other); return *this; } @@ -44,23 +44,23 @@ 
template class csr_segment_reference { private: const iterator iterator_; -}; // csr_segment_reference +}; // csr_eq_segment_reference -template class csr_segment_iterator { +template class csr_eq_segment_iterator { public: using value_type = typename DSM::value_type; using index_type = typename DSM::index_type; using elem_type = typename DSM::elem_type; using difference_type = typename DSM::difference_type; - csr_segment_iterator() = default; - csr_segment_iterator(DSM *dsm, std::size_t segment_index, std::size_t index) { + csr_eq_segment_iterator() = default; + csr_eq_segment_iterator(DSM *dsm, std::size_t segment_index, std::size_t index) { dsm_ = dsm; segment_index_ = segment_index; index_ = index; } - auto operator<=>(const csr_segment_iterator &other) const noexcept { + auto operator<=>(const csr_eq_segment_iterator &other) const noexcept { // assertion below checks against compare dereferenceable iterator to a // singular iterator and against attempt to compare iterators from different // sequences like _Safe_iterator does @@ -71,7 +71,7 @@ template class csr_segment_iterator { } // Comparison - bool operator==(const csr_segment_iterator &other) const noexcept { + bool operator==(const csr_eq_segment_iterator &other) const noexcept { return (*this <=> other) == 0; } @@ -85,7 +85,7 @@ template class csr_segment_iterator { auto &operator-=(difference_type n) { return *this += (-n); } - difference_type operator-(const csr_segment_iterator &other) const noexcept { + difference_type operator-(const csr_eq_segment_iterator &other) const noexcept { assert(dsm_ != nullptr && dsm_ == other.dsm_); assert(index_ >= other.index_); return index_ - other.index_; @@ -125,14 +125,14 @@ template class csr_segment_iterator { } // When *this is not first in the expression - friend auto operator+(difference_type n, const csr_segment_iterator &other) { + friend auto operator+(difference_type n, const csr_eq_segment_iterator &other) { return other + n; } // dereference auto 
operator*() const { assert(dsm_ != nullptr); - return csr_segment_reference{*this}; + return csr_eq_segment_reference{*this}; } auto operator[](difference_type n) const { assert(dsm_ != nullptr); @@ -235,16 +235,16 @@ template class csr_segment_iterator { DSM *dsm_ = nullptr; std::size_t segment_index_ = 0; std::size_t index_ = 0; -}; // csr_segment_iterator +}; // csr_eq_segment_iterator -template class csr_segment { +template class csr_eq_segment { private: - using iterator = csr_segment_iterator; + using iterator = csr_eq_segment_iterator; public: using difference_type = std::ptrdiff_t; - csr_segment() = default; - csr_segment(DSM *dsm, std::size_t segment_index, std::size_t size, + csr_eq_segment() = default; + csr_eq_segment(DSM *dsm, std::size_t segment_index, std::size_t size, std::size_t reserved) { dsm_ = dsm; segment_index_ = segment_index; @@ -271,16 +271,16 @@ template class csr_segment { std::size_t segment_index_; std::size_t size_; std::size_t reserved_; -}; // csr_segment +}; // csr_eq_segment } // namespace dr::mp namespace std { template - struct tuple_size> : std::integral_constant {}; + struct tuple_size> : std::integral_constant {}; template - struct tuple_element> + struct tuple_element> : tuple_element, typename DSM::elem_type>> {}; } // namespace std diff --git a/include/dr/sp/algorithms/matrix/local_gemv.hpp b/include/dr/sp/algorithms/matrix/local_gemv.hpp index 302b14c789..f8e272bbbd 100644 --- a/include/dr/sp/algorithms/matrix/local_gemv.hpp +++ b/include/dr/sp/algorithms/matrix/local_gemv.hpp @@ -19,7 +19,7 @@ namespace __detail { template requires(std::is_same_v, T>) -auto custom_gemv(sycl::queue &q, csr_matrix_view a, Iter b, +auto custom_gemv(sycl::queue &q, dr::views::csr_matrix_view a, Iter b, Iter c, const std::vector &dependencies = {}) { std::size_t wg = 32; @@ -55,7 +55,7 @@ auto custom_gemv(sycl::queue &q, csr_matrix_view a, Iter b, template requires(std::is_same_v, T>) -auto mkl_gemv(sycl::queue &q, csr_matrix_view a, Iter b, 
Iter c, +auto mkl_gemv(sycl::queue &q, dr::views::csr_matrix_view a, Iter b, Iter c, const std::vector &dependencies = {}) { oneapi::mkl::sparse::matrix_handle_t a_handle; @@ -78,7 +78,7 @@ auto mkl_gemv(sycl::queue &q, csr_matrix_view a, Iter b, Iter c, template requires(std::is_same_v, T>) -auto local_gemv(sycl::queue &q, csr_matrix_view a, Iter b, +auto local_gemv(sycl::queue &q, dr::views::csr_matrix_view a, Iter b, Iter c, const std::vector &dependencies = {}) { return mkl_gemv(q, a, b, c, dependencies); } @@ -88,7 +88,7 @@ auto local_gemv(sycl::queue &q, csr_matrix_view a, Iter b, template requires(std::is_same_v, T>) -auto local_gemv(sycl::queue &q, csr_matrix_view a, Iter b, +auto local_gemv(sycl::queue &q, dr::views::csr_matrix_view a, Iter b, Iter c, const std::vector &dependencies = {}) { return custom_gemv(q, a, b, c, dependencies); } diff --git a/include/dr/sp/containers/distributed_dense_matrix.hpp b/include/dr/sp/containers/distributed_dense_matrix.hpp index de6b1fa332..e316f2dad0 100644 --- a/include/dr/sp/containers/distributed_dense_matrix.hpp +++ b/include/dr/sp/containers/distributed_dense_matrix.hpp @@ -8,7 +8,7 @@ #include #include -#include +#include #include #include #include @@ -25,9 +25,9 @@ template class distributed_dense_matrix_accessor { using scalar_value_type = rng::range_value_t; using scalar_reference = rng::range_reference_t; - using value_type = dr::sp::matrix_entry; + using value_type = dr::matrix_entry; - using reference = dr::sp::matrix_ref; + using reference = dr::matrix_ref; using iterator_category = std::random_access_iterator_tag; @@ -138,15 +138,15 @@ template class distributed_dense_matrix { using size_type = std::size_t; using difference_type = std::ptrdiff_t; - using value_type = dr::sp::matrix_entry; + using value_type = dr::matrix_entry; using scalar_reference = rng::range_reference_t< dr::sp::device_vector>>; using const_scalar_reference = rng::range_reference_t< const dr::sp::device_vector>>; - using reference = 
dr::sp::matrix_ref; - using const_reference = dr::sp::matrix_ref; + using reference = dr::matrix_ref; + using const_reference = dr::matrix_ref; using key_type = dr::index<>; diff --git a/include/dr/sp/containers/sequential/dense_matrix.hpp b/include/dr/sp/containers/sequential/dense_matrix.hpp index 5e4ccb8723..58d77c7ecd 100644 --- a/include/dr/sp/containers/sequential/dense_matrix.hpp +++ b/include/dr/sp/containers/sequential/dense_matrix.hpp @@ -8,7 +8,7 @@ #include #include -#include +#include #include #include #include @@ -26,7 +26,7 @@ class dense_matrix { using scalar_pointer = typename std::allocator_traits::pointer; using scalar_reference = std::iter_reference_t; - using reference = dr::sp::matrix_ref; + using reference = dr::matrix_ref; using key_type = dr::index<>; using map_type = T; diff --git a/include/dr/sp/containers/sparse_matrix.hpp b/include/dr/sp/containers/sparse_matrix.hpp index 96746d8398..c9bfe90ee9 100644 --- a/include/dr/sp/containers/sparse_matrix.hpp +++ b/include/dr/sp/containers/sparse_matrix.hpp @@ -6,13 +6,13 @@ #include #include -#include +#include #include #include #include #include -#include -#include +#include +#include #include namespace dr::sp { @@ -128,19 +128,19 @@ template class sparse_matrix { using size_type = std::size_t; using difference_type = std::ptrdiff_t; - using value_type = dr::sp::matrix_entry; + using value_type = dr::matrix_entry; using scalar_reference = rng::range_reference_t< dr::sp::device_vector>>; using const_scalar_reference = rng::range_reference_t< const dr::sp::device_vector>>; - using reference = dr::sp::matrix_ref; - using const_reference = dr::sp::matrix_ref; + using reference = dr::matrix_ref; + using const_reference = dr::matrix_ref; using key_type = dr::index; - using segment_type = dr::sp::csr_matrix_view< + using segment_type = dr::views::csr_matrix_view< T, I, rng::iterator_t>>, rng::iterator_t>>>; @@ -201,7 +201,7 @@ template class sparse_matrix { // in `gemv_benchmark`. 
I believe this is a SYCL bug. template auto copy_tile_async(key_type tile_index, - csr_matrix_view tile_view) { + dr::views::csr_matrix_view tile_view) { std::size_t tile_idx = tile_index[0] * grid_shape_[1] + tile_index[1]; auto &&values = values_[tile_idx]; auto &&colind = colind_[tile_idx]; @@ -241,7 +241,7 @@ template class sparse_matrix { template void copy_tile(key_type tile_index, - csr_matrix_view tile_view) { + dr::views::csr_matrix_view tile_view) { copy_tile_async(tile_index, tile_view).wait(); } diff --git a/include/dr/sp/util/matrix_io.hpp b/include/dr/sp/util/matrix_io.hpp index cf26fcf2d3..bc6f51176b 100644 --- a/include/dr/sp/util/matrix_io.hpp +++ b/include/dr/sp/util/matrix_io.hpp @@ -12,220 +12,17 @@ #include #include -#include -#include +#include +#include namespace dr::sp { -namespace __detail { - -// Preconditions: -// 1) `tuples` sorted by row, column -// 2) `tuples` has shape `shape` -// 3) `tuples` has `nnz` elements -template -auto convert_to_csr(Tuples &&tuples, dr::index<> shape, std::size_t nnz, - Allocator &&allocator) { - auto &&[index, v] = *tuples.begin(); - auto &&[i, j] = index; - - using T = std::remove_reference_t; - using I = std::remove_reference_t; - - typename std::allocator_traits::template rebind_alloc - i_allocator(allocator); - - T *values = allocator.allocate(nnz); - I *rowptr = i_allocator.allocate(shape[0] + 1); - I *colind = i_allocator.allocate(nnz); - - rowptr[0] = 0; - - std::size_t r = 0; - std::size_t c = 0; - for (auto iter = tuples.begin(); iter != tuples.end(); ++iter) { - auto &&[index, value] = *iter; - auto &&[i, j] = index; - - values[c] = value; - colind[c] = j; - - while (r < i) { - assert(r + 1 <= shape[0]); - // throw std::runtime_error("csr_matrix_impl_: given invalid matrix"); - rowptr[r + 1] = c; - r++; - } - c++; - - assert(c <= nnz); - // throw std::runtime_error("csr_matrix_impl_: given invalid matrix"); - } - - for (; r < shape[0]; r++) { - rowptr[r + 1] = nnz; - } - - return 
csr_matrix_view(values, rowptr, colind, - dr::index(shape[0], shape[1]), nnz, 0); -} - -/// Read in the Matrix Market file at location `file_path` and a return -/// a coo_matrix data structure with its contents. -template -inline coo_matrix mmread(std::string file_path, bool one_indexed = true) { - using size_type = std::size_t; - - std::ifstream f; - - f.open(file_path.c_str()); - - if (!f.is_open()) { - // TODO better choice of exception. - throw std::runtime_error("mmread: cannot open " + file_path); - } - - std::string buf; - - // Make sure the file is matrix market matrix, coordinate, and check whether - // it is symmetric. If the matrix is symmetric, non-diagonal elements will - // be inserted in both (i, j) and (j, i). Error out if skew-symmetric or - // Hermitian. - std::getline(f, buf); - std::istringstream ss(buf); - std::string item; - ss >> item; - if (item != "%%MatrixMarket") { - throw std::runtime_error(file_path + - " could not be parsed as a Matrix Market file."); - } - ss >> item; - if (item != "matrix") { - throw std::runtime_error(file_path + - " could not be parsed as a Matrix Market file."); - } - ss >> item; - if (item != "coordinate") { - throw std::runtime_error(file_path + - " could not be parsed as a Matrix Market file."); - } - bool pattern; - ss >> item; - if (item == "pattern") { - pattern = true; - } else { - pattern = false; - } - // TODO: do something with real vs. integer vs. pattern? 
- ss >> item; - bool symmetric; - if (item == "general") { - symmetric = false; - } else if (item == "symmetric") { - symmetric = true; - } else { - throw std::runtime_error(file_path + " has an unsupported matrix type"); - } - - bool outOfComments = false; - while (!outOfComments) { - std::getline(f, buf); - - if (buf[0] != '%') { - outOfComments = true; - } - } - - I m, n, nnz; - // std::istringstream ss(buf); - ss.clear(); - ss.str(buf); - ss >> m >> n >> nnz; - - // NOTE for symmetric matrices: `nnz` holds the number of stored values in - // the matrix market file, while `matrix.nnz_` will hold the total number of - // stored values (including "mirrored" symmetric values). - coo_matrix matrix({m, n}); - if (symmetric) { - matrix.reserve(2 * nnz); - } else { - matrix.reserve(nnz); - } - - size_type c = 0; - while (std::getline(f, buf)) { - I i, j; - T v; - std::istringstream ss(buf); - if (!pattern) { - ss >> i >> j >> v; - } else { - ss >> i >> j; - v = T(1); - } - if (one_indexed) { - i--; - j--; - } - - if (i >= m || j >= n) { - throw std::runtime_error( - "read_MatrixMarket: file has nonzero out of bounds."); - } - - matrix.push_back({{i, j}, v}); - - if (symmetric && i != j) { - matrix.push_back({{j, i}, v}); - } - - c++; - if (c > nnz) { - throw std::runtime_error("read_MatrixMarket: error reading Matrix Market " - "file, file has more nonzeros than reported."); - } - } - - auto sort_fn = [](const auto &a, const auto &b) { - auto &&[a_index, a_value] = a; - auto &&[b_index, b_value] = b; - auto &&[a_i, a_j] = a_index; - auto &&[b_i, b_j] = b_index; - if (a_i < b_i) { - return true; - } else if (a_i == b_i) { - if (a_j < b_j) { - return true; - } - } - return false; - }; - - std::sort(matrix.begin(), matrix.end(), sort_fn); - - f.close(); - - return matrix; -} - -template -void destroy_csr_matrix_view(dr::sp::csr_matrix_view view, - Allocator &&alloc) { - alloc.deallocate(view.values_data(), view.size()); - typename std::allocator_traits::template 
rebind_alloc i_alloc( - alloc); - i_alloc.deallocate(view.colind_data(), view.size()); - i_alloc.deallocate(view.rowptr_data(), view.shape()[0] + 1); -} - -} // namespace __detail - template -auto create_distributed(dr::sp::csr_matrix_view local_mat, +auto create_distributed(dr::views::csr_matrix_view local_mat, const matrix_partition &partition) { dr::sp::sparse_matrix a(local_mat.shape(), partition); - std::vector> views; + std::vector> views; std::vector events; views.reserve(a.grid_shape()[0] * a.grid_shape()[1]); @@ -242,7 +39,7 @@ auto create_distributed(dr::sp::csr_matrix_view local_mat, auto submatrix_shape = dr::index(row_bounds[1] - row_bounds[0], column_bounds[1] - column_bounds[0]); - auto copied_submat = __detail::convert_to_csr( + auto copied_submat = dr::__detail::convert_to_csr( local_submat, submatrix_shape, rng::distance(local_submat), std::allocator{}); @@ -255,20 +52,12 @@ auto create_distributed(dr::sp::csr_matrix_view local_mat, __detail::wait(events); for (auto &&view : views) { - __detail::destroy_csr_matrix_view(view, std::allocator{}); + dr::__detail::destroy_csr_matrix_view(view, std::allocator{}); } return a; } -template -auto read_csr(std::string file_path, bool one_indexed = true) { - auto m = __detail::mmread(file_path, one_indexed); - auto shape = m.shape(); - auto nnz = m.size(); - - return __detail::convert_to_csr(m, shape, nnz, std::allocator{}); -} template auto mmread(std::string file_path, const matrix_partition &partition, @@ -277,7 +66,7 @@ auto mmread(std::string file_path, const matrix_partition &partition, auto a = create_distributed(local_mat, partition); - __detail::destroy_csr_matrix_view(local_mat, std::allocator{}); + dr::__detail::destroy_csr_matrix_view(local_mat, std::allocator{}); return a; } diff --git a/include/dr/sp/views/dense_column_view.hpp b/include/dr/sp/views/dense_column_view.hpp index 627c4faebf..25c2b607ea 100644 --- a/include/dr/sp/views/dense_column_view.hpp +++ 
b/include/dr/sp/views/dense_column_view.hpp @@ -5,7 +5,7 @@ #pragma once #include -#include +#include #include namespace dr::sp { @@ -17,9 +17,9 @@ template class dense_matrix_column_accessor { using scalar_value_type = std::iter_value_t; using scalar_reference = std::iter_reference_t; - using value_type = dr::sp::matrix_entry; + using value_type = dr::matrix_entry; - using reference = dr::sp::matrix_ref; + using reference = dr::matrix_ref; using iterator_category = std::random_access_iterator_tag; diff --git a/include/dr/sp/views/dense_matrix_iterator.hpp b/include/dr/sp/views/dense_matrix_iterator.hpp index 85c5274357..8c5e9e929a 100644 --- a/include/dr/sp/views/dense_matrix_iterator.hpp +++ b/include/dr/sp/views/dense_matrix_iterator.hpp @@ -8,7 +8,7 @@ #include #include -#include +#include #include #include @@ -22,9 +22,9 @@ template class dense_matrix_accessor { using scalar_type = std::iter_value_t; using scalar_reference = std::iter_reference_t; - using value_type = dr::sp::matrix_entry; + using value_type = dr::matrix_entry; - using reference = dr::sp::matrix_ref; + using reference = dr::matrix_ref; using iterator_category = std::random_access_iterator_tag; diff --git a/include/dr/sp/views/dense_matrix_view.hpp b/include/dr/sp/views/dense_matrix_view.hpp index bc2bd86d30..f7f2930f2d 100644 --- a/include/dr/sp/views/dense_matrix_view.hpp +++ b/include/dr/sp/views/dense_matrix_view.hpp @@ -8,7 +8,7 @@ #include #include -#include +#include #include #include #include @@ -24,7 +24,7 @@ class dense_matrix_view using difference_type = std::ptrdiff_t; using scalar_reference = std::iter_reference_t; - using reference = dr::sp::matrix_ref; + using reference = dr::matrix_ref; using key_type = dr::index<>; using map_type = T; diff --git a/include/dr/sp/views/dense_row_view.hpp b/include/dr/sp/views/dense_row_view.hpp index d3ccee2c1a..09b008aeee 100644 --- a/include/dr/sp/views/dense_row_view.hpp +++ b/include/dr/sp/views/dense_row_view.hpp @@ -6,7 +6,7 @@ #include 
#include -#include +#include #include namespace dr::sp { @@ -18,9 +18,9 @@ template class dense_matrix_row_accessor { using scalar_value_type = std::iter_value_t; using scalar_reference = std::iter_reference_t; - using value_type = dr::sp::matrix_entry; + using value_type = dr::matrix_entry; - using reference = dr::sp::matrix_ref; + using reference = dr::matrix_ref; using iterator_category = std::random_access_iterator_tag; diff --git a/include/dr/sp/views/csr_matrix_view.hpp b/include/dr/views/csr_matrix_view.hpp similarity index 96% rename from include/dr/sp/views/csr_matrix_view.hpp rename to include/dr/views/csr_matrix_view.hpp index ecc6350be5..65259bdf76 100644 --- a/include/dr/sp/views/csr_matrix_view.hpp +++ b/include/dr/views/csr_matrix_view.hpp @@ -5,10 +5,10 @@ #pragma once #include -#include +#include #include -namespace dr::sp { +namespace dr::views { template class csr_matrix_view_accessor { @@ -21,9 +21,9 @@ class csr_matrix_view_accessor { using index_type = I; - using value_type = dr::sp::matrix_entry; + using value_type = dr::matrix_entry; - using reference = dr::sp::matrix_ref; + using reference = dr::matrix_ref; using iterator_category = std::random_access_iterator_tag; @@ -129,7 +129,7 @@ class csr_matrix_view using difference_type = std::ptrdiff_t; using scalar_reference = std::iter_reference_t; - using reference = dr::sp::matrix_ref; + using reference = dr::matrix_ref; using scalar_type = T; using index_type = I; @@ -222,4 +222,4 @@ csr_matrix_view(TIter, IIter, IIter, Args &&...) 
-> csr_matrix_view, std::iter_value_t, TIter, IIter>; -} // namespace dr::sp +} // namespace dr::views From 2456379c4a7b4b8f4cecac9349ae5b2c09e19f6a Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Tue, 27 Aug 2024 10:32:37 +0200 Subject: [PATCH 06/68] Separated matrix format from mp sparse matrix implementation and added second format --- examples/mp/sparse_matrix.cpp | 40 +-- include/dr/detail/matrix_io.hpp | 13 +- include/dr/mp/algorithms/matrix/gemv.hpp | 22 +- .../containers/distributed_sparse_matrix.hpp | 10 +- .../matrix_formats/csr_eq_distribution.hpp | 49 +++- .../matrix_formats/csr_row_distribution.hpp | 201 +++++++++++++ .../matrix_formats/csr_row_segment.hpp | 276 ++++++++++++++++++ 7 files changed, 550 insertions(+), 61 deletions(-) create mode 100644 include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp create mode 100644 include/dr/mp/containers/matrix_formats/csr_row_segment.hpp diff --git a/examples/mp/sparse_matrix.cpp b/examples/mp/sparse_matrix.cpp index 8c1cc765e1..43d0649257 100644 --- a/examples/mp/sparse_matrix.cpp +++ b/examples/mp/sparse_matrix.cpp @@ -26,27 +26,27 @@ int main(int argc, char **argv) { { mp::distributed_sparse_matrix m(local_data); fmt::print("{}\n", m.size()); - // for (int i = 0; i < dr::mp::default_comm().size(); i++) { - // if (dr::mp::default_comm().rank() == i) { - // auto csr_iter = local_data.begin(); - // int j = 0; - // // fmt::print("{}\n", i); - // for (auto [index, val]: m) { - // auto [m, n] = index; + for (int i = 0; i < dr::mp::default_comm().size(); i++) { + if (dr::mp::default_comm().rank() == i) { + auto csr_iter = local_data.begin(); + int j = 0; + // fmt::print("{}\n", i); + for (auto [index, val]: m) { + auto [m, n] = index; - // auto [index_csr, val_csr] = *csr_iter; - // auto [m_csr, n_csr] = index_csr; - // auto check = m == m_csr && n_csr == n && val == val_csr; - // if (!check) { - // fmt::print("{} {} {} {} {} {} {}\n", j, m, m_csr, n, n_csr, val, val_csr); - // } - // assert(check); 
- // csr_iter++; - // j++; - // } - // } - // m.fence(); - // } + auto [index_csr, val_csr] = *csr_iter; + auto [m_csr, n_csr] = index_csr; + auto check = m == m_csr && n_csr == n && val == val_csr; + if (!check) { + fmt::print("{} {} {} {} {} {} {}\n", j, m, m_csr, n, n_csr, val, val_csr); + } + // assert(check); + csr_iter++; + j++; + } + } + m.fence(); + } std::vector res(m.shape().first); std::vector a(m.shape().second); diff --git a/include/dr/detail/matrix_io.hpp b/include/dr/detail/matrix_io.hpp index ff429c2b2b..a64aabe6ee 100644 --- a/include/dr/detail/matrix_io.hpp +++ b/include/dr/detail/matrix_io.hpp @@ -185,16 +185,13 @@ inline coo_matrix read_coo_matrix(std::string file_path, bool one_indexed "file, file has more nonzeros than reported."); } } - auto sort_fn = [](const auto &a, const auto &b) { - auto &&[a_index, a_value] = a; - auto &&[b_index, b_value] = b; - auto &&[a_i, a_j] = a_index; - auto &&[b_i, b_j] = b_index; - if (a_i < b_i) { + auto a_index = a.index(); + auto b_index = b.index(); + if (a_index.first < b_index.first) { return true; - } else if (a_i == b_i) { - if (a_j < b_j) { + } else if (a_index.first == b_index.first) { + if (a_index.second < b_index.second) { return true; } } diff --git a/include/dr/mp/algorithms/matrix/gemv.hpp b/include/dr/mp/algorithms/matrix/gemv.hpp index cf1d0c54a9..2e6357771d 100644 --- a/include/dr/mp/algorithms/matrix/gemv.hpp +++ b/include/dr/mp/algorithms/matrix/gemv.hpp @@ -26,28 +26,8 @@ void gemv(int root, C &res, distributed_sparse_matrix &a, B &b) { } communicator.bcast(broadcasted_b, a.shape().second * sizeof(T), root); - // multiply b by local segment - auto res_alloc = alloc.allocate(a.shape().first); - a.local_gemv(res_alloc, broadcasted_b); - - // reduce result by adding partial results - if (default_comm().rank() == root) { - auto gathered_res = alloc.allocate(a.shape().first * communicator.size()); - communicator.gather(res_alloc, gathered_res, a.shape().first, root); - rng::fill(res, 0); - for 
(int i = 0; i < communicator.size(); i++) { - auto row_bounds = a.local_row_bounds(i); - for (int j = row_bounds.first; j < row_bounds.second; j++) { - res[j] += gathered_res[a.shape().first * i + j - row_bounds.first]; - } - } - alloc.deallocate(gathered_res, a.shape().first * communicator.size()); - } - else { - communicator.gather(res_alloc, static_cast(nullptr), a.shape().first, root); - } + a.local_gemv_and_collect(root, res, broadcasted_b); alloc.deallocate(broadcasted_b, a.shape().second); - alloc.deallocate(res_alloc, a.shape().first); // a.fence(); // if (default_comm().rank() == root) { // for (int i = 0; i < a.shape().first; i++) { diff --git a/include/dr/mp/containers/distributed_sparse_matrix.hpp b/include/dr/mp/containers/distributed_sparse_matrix.hpp index ad29fbeee8..54610a05e1 100644 --- a/include/dr/mp/containers/distributed_sparse_matrix.hpp +++ b/include/dr/mp/containers/distributed_sparse_matrix.hpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: BSD-3-Clause #pragma once #include +#include #include #include @@ -21,7 +22,7 @@ concept matrix_distibution = T(dr::views::csr_matrix_view(), distribution()); }; -template > +template > requires(matrix_distibution) class distributed_sparse_matrix { @@ -143,13 +144,10 @@ class distributed_sparse_matrix { } template - auto local_gemv(C &res, A &vals) const { - distribution_.local_gemv(res, vals); + auto local_gemv_and_collect(std::size_t root, C &res, A &vals) const { + distribution_.local_gemv_and_collect(root, res, vals); } - auto local_row_bounds(std::size_t rank) const { - return distribution_.local_row_bounds(rank); - } private: MatrixDistrT distribution_; diff --git a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp index fe0269f9cf..5d384b1adf 100644 --- a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp @@ -50,19 +50,23 @@ class 
csr_eq_distribution { template auto local_gemv(C &res, A &vals) const { + auto rank = rows_backend_.getrank(); + if (nnz_ <= segment_size_ * rank) { + return; + } // if (dr::mp::use_sycl()) { // } // else { - auto rank = rows_backend_.getrank(); auto size = row_sizes_[rank]; auto row_i = -1; auto position = segment_size_ * rank; + auto elem_count = std::min(segment_size_, nnz_ - segment_size_ * rank); auto current_row_position = rows_data_[0]; auto local_vals = dr::mp::local_segment(*vals_data_); auto local_cols = dr::mp::local_segment(*cols_data_); - for (int i = 0; i < segment_size_; i++) { + for (int i = 0; i < elem_count; i++) { while (row_i + 1 < size && position + i >= current_row_position) { row_i++; current_row_position = rows_data_[row_i + 1]; @@ -76,11 +80,42 @@ class csr_eq_distribution { // } // } } - auto local_row_bounds(std::size_t rank) const { - return std::pair(row_offsets_[rank], row_offsets_[rank] + row_sizes_[rank]); + + template + auto local_gemv_and_collect(std::size_t root, C &res, A &vals) const { + assert(res.size() == shape_.first); + __detail::allocator alloc; + auto res_alloc = alloc.allocate(shape_.first); + local_gemv(res_alloc, vals); + + gather_gemv_vector(root, res, res_alloc); + alloc.deallocate(res_alloc, shape_.first); } private: friend csr_eq_segment_iterator; + + template + void gather_gemv_vector(std::size_t root, C &res, A &partial_res) const { + auto communicator = default_comm(); + __detail::allocator alloc; + if (communicator.rank() == root) { + auto gathered_res = alloc.allocate(shape_.first * communicator.size()); + communicator.gather(partial_res, gathered_res, shape_.first, root); + rng::fill(res, 0); + for (auto i = 0; i < communicator.size(); i++) { + auto first_row = row_offsets_[i]; + auto last_row = row_offsets_[i] + row_sizes_[i]; + for (auto j = first_row; j < last_row; j++) { + res[j] += gathered_res[shape_.first * i + j - first_row]; + } + } + alloc.deallocate(gathered_res, shape_.first * 
communicator.size()); + } + else { + communicator.gather(partial_res, static_cast(nullptr), shape_.first, root); + } + } + std::size_t get_row_size(std::size_t rank) { return row_sizes_[rank]; } @@ -121,8 +156,10 @@ class csr_eq_distribution { fmt::print("hmmmm? {} {} {} {}\n", rank, lower_limit, row_size_, get_row_size(rank)); } - rows_data_ = static_cast(rows_backend_.allocate(row_size_ * sizeof(I))); - std::copy(csr_view.rowptr_data() + lower_limit, csr_view.rowptr_data() + lower_limit + row_size_, rows_data_); + if (row_size_ > 0) { + rows_data_ = static_cast(rows_backend_.allocate(row_size_ * sizeof(I))); + std::copy(csr_view.rowptr_data() + lower_limit, csr_view.rowptr_data() + lower_limit + row_size_, rows_data_); + } std::size_t segment_index = 0; segment_size_ = vals_data_->segment_size(); assert(segment_size_ == cols_data_->segment_size()); diff --git a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp new file mode 100644 index 0000000000..1c55e06628 --- /dev/null +++ b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp @@ -0,0 +1,201 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause +#pragma once +#include +#include +#include +#include + +namespace dr::mp { + +template +class csr_row_distribution { +public: + using value_type = dr::matrix_entry; + using elem_type = T; + using index_type = I; + using difference_type = std::ptrdiff_t; + + csr_row_distribution(const csr_row_distribution &) = delete; + csr_row_distribution &operator=(const csr_row_distribution &) = delete; + csr_row_distribution(csr_row_distribution &&) { assert(false); } + + /// Constructor + csr_row_distribution(dr::views::csr_matrix_view csr_view, distribution dist = distribution()) { + init(csr_view, dist); + } + + ~csr_row_distribution() { + if (!finalized()) { + fence(); + if (vals_data_ != nullptr) { + vals_backend_.deallocate(vals_data_, 
vals_size_ * sizeof(elem_type)); // vals_data_ was allocated as vals_size_ * sizeof(T), so release it with the element size, not the index size + cols_backend_.deallocate(cols_data_, vals_size_ * sizeof(index_type)); + } + + // delete halo_; TODO + } + } + std::size_t get_id_in_segment(std::size_t offset) const { + assert(offset < nnz_); + auto pos_iter = std::upper_bound(val_offsets_.begin(), val_offsets_.end(), offset) - 1; + return offset - *pos_iter; + } + std::size_t get_segment_from_offset(std::size_t offset) const { + assert(offset < nnz_); + auto pos_iter = std::upper_bound(val_offsets_.begin(), val_offsets_.end(), offset); + return rng::distance(val_offsets_.begin(), pos_iter) - 1; + } + auto segments() const { return rng::views::all(segments_); } + auto nnz() const {return nnz_;} + auto shape() const {return shape_;} + void fence() { + vals_backend_.fence(); + cols_backend_.fence(); + } +template + auto local_gemv(C &res, A &vals) const { + auto rank = cols_backend_.getrank(); + if (shape_[0] <= segment_size_ * rank) return; + // if (dr::mp::use_sycl()) { + + // } + // else { + auto local_rows = dr::mp::local_segment(*rows_data_); + auto size = std::min(segment_size_, shape_[0] - segment_size_ * rank); + auto val_count = val_sizes_[rank]; + auto row_i = 0; + auto position = val_offsets_[rank]; + auto current_row_position = local_rows[1]; + + for (int i = 0; i < val_count; i++) { + while (row_i + 1 < size && position + i >= current_row_position) { + row_i++; + current_row_position = local_rows[row_i + 1]; + } + res[row_i] += vals_data_[i] * vals[cols_data_[i]]; + } + // } + } + + template + auto local_gemv_and_collect(std::size_t root, C &res, A &vals) const { + assert(res.size() == shape_.first); + __detail::allocator alloc; + auto res_alloc = alloc.allocate(segment_size_); + local_gemv(res_alloc, vals); + + gather_gemv_vector(root, res, res_alloc); + alloc.deallocate(res_alloc, segment_size_); + } +private: + friend csr_row_segment_iterator; + + template + void gather_gemv_vector(std::size_t root, C &res, A &partial_res) const { + auto communicator = 
default_comm(); + __detail::allocator alloc; + if (communicator.rank() == root) { + auto scratch = alloc.allocate(segment_size_ * default_comm().size()); + communicator.gather(partial_res, scratch, segment_size_, root); + std::copy(scratch, scratch + shape_.first, res.begin()); + alloc.deallocate(scratch, segment_size_ * communicator.size()); + } + else { + communicator.gather(partial_res, static_cast(nullptr), segment_size_, root); + } + } + void init(dr::views::csr_matrix_view csr_view, auto dist) { + nnz_ = csr_view.size(); + distribution_ = dist; + shape_ = csr_view.shape(); + // determine the distribution of data + // auto hb = dist.halo(); + std::size_t gran = dist.granularity(); + // TODO: make this an error that is reported back to user + assert(nnz_ % gran == 0 && "size must be a multiple of the granularity"); + // assert(hb.prev % gran == 0 && "size must be a multiple of the granularity"); + // assert(hb.next % gran == 0 && "size must be a multiple of the granularity"); + + + auto rank = vals_backend_.getrank(); + rows_data_ = std::make_shared>(shape_.first); + + dr::mp::copy(std::ranges::subrange(csr_view.rowptr_data(), csr_view.rowptr_data() + shape_.first), rows_data_->begin()); + + assert(*csr_view.rowptr_data() == 0); + for (int i = 0; i < default_comm().size(); i++) { + auto first_index = rows_data_->get_segment_offset(i); + if (first_index > shape_.first) { + val_offsets_.push_back(nnz_); + val_sizes_.push_back(0); + continue; + } + std::size_t lower_limit = csr_view.rowptr_data()[first_index]; + std::size_t higher_limit = nnz_; + if (rows_data_->get_segment_offset(i + 1) < shape_.first) { + auto last_index = rows_data_->get_segment_offset(i + 1); + higher_limit = csr_view.rowptr_data()[last_index]; + } + val_offsets_.push_back(lower_limit); + val_sizes_.push_back(higher_limit - lower_limit); + } + + auto lower_limit = val_offsets_[rank]; + vals_size_ = std::max(val_sizes_[rank], static_cast(1)); + // fmt::print("dfsa {} {} {} {}\n", vals_size_, 
val_sizes_[rank],lower_limit, rank); + + cols_data_ = static_cast(cols_backend_.allocate(vals_size_ * sizeof(I))); + vals_data_ = static_cast(vals_backend_.allocate(vals_size_ * sizeof(T))); + std::copy(csr_view.values_data() + lower_limit, csr_view.values_data() + lower_limit + vals_size_, vals_data_); + std::copy(csr_view.colind_data() + lower_limit, csr_view.colind_data() + lower_limit + vals_size_, cols_data_); + + std::size_t segment_index = 0; + segment_size_ = rows_data_->segment_size(); + for (std::size_t i = 0; i < default_comm().size(); i++) { + //TODO fix segment creation, to include proper sizes, basing on val_offsets; + segments_.emplace_back(this, segment_index++, val_sizes_[i], std::max(val_sizes_[i], static_cast(1))); + } + // if (rank == 0) { + // int ax = 0; + // for (auto x: val_offsets_) { + // fmt::print("{} {}\n", ax++, x); + // } + // for (int i = 0; i < 49; i++) { + // fmt::print("{} {}\n", i, get_segment_from_offset(i)); + // } + // } + // fmt::print(" {} {} {} {}\n",get_segment_from_offset(47), get_segment_from_offset(48), get_segment_from_offset(49), get_segment_from_offset(50)); + // for (int i = 0; i < vals_size_; i++) { + // fmt::print("col, val, i, rank {} {} {} {}\n", cols_data_[i], vals_data_[i], i, rank); + // } + // fence(); + // if (rank < rows_data_->segments().size()) { + // for (int i = 0; i < rows_data_->segments()[rank].size(); i++) { + // fmt::print("row, i, rank {} {} {}\n", rows_data_->segments()[rank][i], i, rank); + // } + // } + fence(); + } + + + std::size_t segment_size_ = 0; + std::size_t vals_size_ = 0; + std::vector val_offsets_; + std::vector val_sizes_; + + + index_type *cols_data_ = nullptr; + BackendT cols_backend_; + + elem_type *vals_data_ = nullptr; + BackendT vals_backend_; + + distribution distribution_; + dr::index shape_; + std::size_t nnz_; + std::vector> segments_; + std::shared_ptr> rows_data_; +}; +} \ No newline at end of file diff --git a/include/dr/mp/containers/matrix_formats/csr_row_segment.hpp 
b/include/dr/mp/containers/matrix_formats/csr_row_segment.hpp new file mode 100644 index 0000000000..f28abd23d3 --- /dev/null +++ b/include/dr/mp/containers/matrix_formats/csr_row_segment.hpp @@ -0,0 +1,276 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +namespace dr::mp { + +template class csr_row_segment_iterator; + +template class csr_row_segment_reference { + using iterator = csr_row_segment_iterator; + +public: + using value_type = typename DSM::value_type; + using index_type = typename DSM::index_type; + using elem_type = typename DSM::elem_type; + + csr_row_segment_reference(const iterator it) : iterator_(it) {} + + operator value_type() const { return iterator_.get(); } + operator std::pair, elem_type>() const { + return iterator_.get(); + } + + template auto get() const noexcept { + if constexpr (Index == 0) { + return iterator_.get_index(); + } + if constexpr (Index == 1) { + return iterator_.get_value(); + } + } + + auto operator=(const csr_row_segment_reference &other) const { + *this = value_type(other); + return *this; + } + auto operator&() const { return iterator_; } + +private: + const iterator iterator_; +}; // csr_row_segment_reference + +template class csr_row_segment_iterator { +public: + using value_type = typename DSM::value_type; + using index_type = typename DSM::index_type; + using elem_type = typename DSM::elem_type; + using difference_type = typename DSM::difference_type; + + csr_row_segment_iterator() = default; + csr_row_segment_iterator(DSM *dsm, std::size_t segment_index, std::size_t index) { + dsm_ = dsm; + segment_index_ = segment_index; + index_ = index; + } + + auto operator<=>(const csr_row_segment_iterator &other) const noexcept { + // assertion below checks against compare dereferenceable iterator to a + // singular iterator and against attempt to compare iterators from different + // sequences like _Safe_iterator does + assert(dsm_ == other.dsm_); + return 
segment_index_ == other.segment_index_ + ? index_ <=> other.index_ + : segment_index_ <=> other.segment_index_; + } + + // Comparison + bool operator==(const csr_row_segment_iterator &other) const noexcept { + return (*this <=> other) == 0; + } + + // Only this arithmetic manipulate internal state + auto &operator+=(difference_type n) { + assert(dsm_ != nullptr); + assert(n >= 0 || static_cast(index_) >= -n); + index_ += n; + return *this; + } + + auto &operator-=(difference_type n) { return *this += (-n); } + + difference_type operator-(const csr_row_segment_iterator &other) const noexcept { + assert(dsm_ != nullptr && dsm_ == other.dsm_); + assert(index_ >= other.index_); + return index_ - other.index_; + } + + // prefix + auto &operator++() { + *this += 1; + return *this; + } + auto &operator--() { + *this -= 1; + return *this; + } + + // postfix + auto operator++(int) { + auto prev = *this; + *this += 1; + return prev; + } + auto operator--(int) { + auto prev = *this; + *this -= 1; + return prev; + } + + auto operator+(difference_type n) const { + auto p = *this; + p += n; + return p; + } + auto operator-(difference_type n) const { + auto p = *this; + p -= n; + return p; + } + + // When *this is not first in the expression + friend auto operator+(difference_type n, const csr_row_segment_iterator &other) { + return other + n; + } + + // dereference + auto operator*() const { + assert(dsm_ != nullptr); + return csr_row_segment_reference{*this}; + } + auto operator[](difference_type n) const { + assert(dsm_ != nullptr); + return *(*this + n); + } + + void get(value_type *dst, std::size_t size) const { + auto elems = new elem_type[size]; + auto indexes = new dr::index[size]; + get_value(elems, size); + get_index(indexes, size); + for (std::size_t i = 0; i < size; i++) { + *(dst + i) = {indexes[i], elems[i]}; + } + } + + value_type get() const { + value_type val; + get(&val, 1); + return val; + } + + void get_value(elem_type *dst, std::size_t size) const { + 
assert(dsm_ != nullptr); + assert(segment_index_ * dsm_->segment_size_ + index_ < dsm_->nnz_); + dsm_->vals_backend_.getmem(dst, index_ * sizeof(elem_type), size * sizeof(elem_type), segment_index_); + } + + elem_type get_value() const { + elem_type val; + get_value(&val, 1); + return val; + } + + void get_index(dr::index *dst, std::size_t size) const { + assert(dsm_ != nullptr); + assert(segment_index_ * dsm_->segment_size_ + index_ < dsm_->nnz_); + index_type *col_data; + if (rank() == dsm_->cols_backend_.getrank()) { + col_data = dsm_->cols_data_ + index_; + } + else { + col_data = new index_type[size]; + dsm_->cols_backend_.getmem(col_data, index_ * sizeof(index_type), size * sizeof(index_type), segment_index_); + } + index_type *rows; + std::size_t rows_length = dsm_->segment_size_; + rows = new index_type[rows_length]; + (dsm_->rows_data_->segments()[segment_index_].begin()).get(rows, rows_length); + + auto position = dsm_->val_offsets_[segment_index_] + index_; + auto rows_iter = rows + 1; + index_type *cols_iter = col_data; + auto iter = dst; + std::size_t current_row = dsm_->segment_size_ * segment_index_; + std::size_t last_row = std::min(current_row + rows_length - 1, dsm_->shape_[0] - 1); + + for (int i = 0; i < size; i++) { + while (current_row < last_row && *rows_iter <= position + i ) { + rows_iter++; + current_row++; + } + iter->first = current_row; + iter->second = *cols_iter; + cols_iter++; + iter++; + } + if (rank() != dsm_->cols_backend_.getrank()) { + delete[] col_data; + } + delete[] rows; + + } + + dr::index get_index() const { + dr::index val; + get_index(&val, 1); + return val; + } + + auto rank() const { + assert(dsm_ != nullptr); + return segment_index_; + } + + auto segments() const { + assert(dsm_ != nullptr); + return dr::__detail::drop_segments(dsm_->segments(), segment_index_, index_); + } + +private: + // all fields need to be initialized by default ctor so every default + // constructed iter is equal to any other default 
constructed iter + DSM *dsm_ = nullptr; + std::size_t segment_index_ = 0; + std::size_t index_ = 0; +}; // csr_row_segment_iterator + +template class csr_row_segment { +private: + using iterator = csr_row_segment_iterator; + +public: + using difference_type = std::ptrdiff_t; + csr_row_segment() = default; + csr_row_segment(DSM *dsm, std::size_t segment_index, std::size_t size, + std::size_t reserved) { + dsm_ = dsm; + segment_index_ = segment_index; + size_ = size; + reserved_ = reserved; + assert(dsm_ != nullptr); + } + + auto size() const { + assert(dsm_ != nullptr); + return size_; + } + + auto begin() const { return iterator(dsm_, segment_index_, 0); } + auto end() const { return begin() + size(); } + auto reserved() const { return reserved_; } + + auto operator[](difference_type n) const { return *(begin() + n); } + + bool is_local() const { return segment_index_ == default_comm().rank(); } + +private: + DSM *dsm_ = nullptr; + std::size_t segment_index_; + std::size_t size_; + std::size_t reserved_; +}; // csr_row_segment + +} // namespace dr::mp + +namespace std { + template + struct tuple_size> : std::integral_constant {}; + + template + struct tuple_element> + : tuple_element, typename DSM::elem_type>> {}; + +} // namespace std From bd63c2ccb09976a12a35f1dc1a906df7dc9473a0 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Wed, 28 Aug 2024 09:10:09 +0200 Subject: [PATCH 07/68] Improve matrix loading performance --- include/dr/detail/matrix_entry.hpp | 11 ++++------- include/dr/detail/matrix_io.hpp | 17 ++--------------- 2 files changed, 6 insertions(+), 22 deletions(-) diff --git a/include/dr/detail/matrix_entry.hpp b/include/dr/detail/matrix_entry.hpp index e56251f87c..6b35e12a01 100644 --- a/include/dr/detail/matrix_entry.hpp +++ b/include/dr/detail/matrix_entry.hpp @@ -68,14 +68,11 @@ template class matrix_entry { return matrix_entry, U>(index_, value_); } - bool operator<(const matrix_entry &other) const noexcept { - if (index()[0] < other.index()[0]) { 
- return true; - } else if (index()[0] == other.index()[0] && - index()[1] < other.index()[1]) { - return true; + inline bool operator<(const matrix_entry &other) const noexcept { + if (index_.first != other.index_.first) { + return index_.first < other.index_.first; } - return false; + return index_.second < other.index_.second; } matrix_entry() = default; diff --git a/include/dr/detail/matrix_io.hpp b/include/dr/detail/matrix_io.hpp index a64aabe6ee..83c8aa49cb 100644 --- a/include/dr/detail/matrix_io.hpp +++ b/include/dr/detail/matrix_io.hpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -185,21 +186,7 @@ inline coo_matrix read_coo_matrix(std::string file_path, bool one_indexed "file, file has more nonzeros than reported."); } } - auto sort_fn = [](const auto &a, const auto &b) { - auto a_index = a.index(); - auto b_index = b.index(); - if (a_index.first < b_index.first) { - return true; - } else if (a_index.first == b_index.first) { - if (a_index.second < b_index.second) { - return true; - } - } - return false; - }; - - std::sort(matrix.begin(), matrix.end(), sort_fn); - + std::sort(matrix.begin(), matrix.end()); f.close(); return matrix; From b75b7ed2d4f10a0239b710417c85c68b0d885cd6 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Tue, 3 Sep 2024 11:59:26 +0200 Subject: [PATCH 08/68] Add sycl support to mp sparse matrixes --- examples/mp/sparse_matrix.cpp | 53 +++++++----- include/dr/mp/algorithms/matrix/gemv.hpp | 5 +- .../containers/distributed_sparse_matrix.hpp | 5 +- .../matrix_formats/csr_eq_distribution.hpp | 86 ++++++++++++++++--- .../matrix_formats/csr_row_distribution.hpp | 43 ++++++++-- include/dr/sp/util/matrix_io.hpp | 7 ++ 6 files changed, 152 insertions(+), 47 deletions(-) diff --git a/examples/mp/sparse_matrix.cpp b/examples/mp/sparse_matrix.cpp index 43d0649257..55cf0e7cc9 100644 --- a/examples/mp/sparse_matrix.cpp +++ b/examples/mp/sparse_matrix.cpp @@ -24,40 +24,42 @@ int main(int argc, char **argv) { #endif { - 
mp::distributed_sparse_matrix m(local_data); + mp::distributed_sparse_matrix> m(local_data); + mp::distributed_sparse_matrix> m_row(local_data); fmt::print("{}\n", m.size()); - for (int i = 0; i < dr::mp::default_comm().size(); i++) { - if (dr::mp::default_comm().rank() == i) { - auto csr_iter = local_data.begin(); - int j = 0; - // fmt::print("{}\n", i); - for (auto [index, val]: m) { - auto [m, n] = index; + // for (int i = 0; i < dr::mp::default_comm().size(); i++) { + // if (dr::mp::default_comm().rank() == i) { + // auto csr_iter = local_data.begin(); + // int j = 0; + // // fmt::print("{}\n", i); + // for (auto [index, val]: m) { + // auto [m, n] = index; - auto [index_csr, val_csr] = *csr_iter; - auto [m_csr, n_csr] = index_csr; - auto check = m == m_csr && n_csr == n && val == val_csr; - if (!check) { - fmt::print("{} {} {} {} {} {} {}\n", j, m, m_csr, n, n_csr, val, val_csr); - } - // assert(check); - csr_iter++; - j++; - } - } - m.fence(); - } + // auto [index_csr, val_csr] = *csr_iter; + // auto [m_csr, n_csr] = index_csr; + // auto check = m == m_csr && n_csr == n && val == val_csr; + // if (!check) { + // fmt::print("{} {} {} {} {} {} {}\n", j, m, m_csr, n, n_csr, val, val_csr); + // } + // // assert(check); + // csr_iter++; + // j++; + // } + // } + // m.fence(); + // } std::vector res(m.shape().first); + std::vector res_row(m.shape().first); std::vector a(m.shape().second); for (int i = 0; i < a.size(); i++) { a[i] = i; } m.fence(); - fmt::print("gemv started\n"); gemv(0, res, m, a); m.fence(); - fmt::print("gemv finished\n"); + gemv(0, res_row, m_row, a); + m_row.fence(); std::vector ref(m.shape().first); if (dr::mp::default_comm().rank() == 0) { @@ -71,6 +73,11 @@ int main(int argc, char **argv) { fmt::print("mismatching outcome {} {}\n", res[i], ref[i]); } } + for (int i = 0; i < m.shape().first; i++) { + if (res_row[i] != ref[i]) { + fmt::print("mismatching outcome row {} {}\n", res_row[i], ref[i]); + } + } } } 
dr::__detail::destroy_csr_matrix_view(local_data, std::allocator{}); diff --git a/include/dr/mp/algorithms/matrix/gemv.hpp b/include/dr/mp/algorithms/matrix/gemv.hpp index 2e6357771d..bd3d54d58b 100644 --- a/include/dr/mp/algorithms/matrix/gemv.hpp +++ b/include/dr/mp/algorithms/matrix/gemv.hpp @@ -11,8 +11,8 @@ namespace dr::mp { -template C, rng::input_range B, typename Backend> //TODO?:, typename MatDistr> -void gemv(int root, C &res, distributed_sparse_matrix &a, B &b) { +template C, rng::input_range B, typename Backend, typename MatDistr> +void gemv(int root, C &res, distributed_sparse_matrix &a, B &b) { if (default_comm().rank() == root) { assert(a.shape().first == res.size()); assert(a.shape().second == b.size()); @@ -25,7 +25,6 @@ void gemv(int root, C &res, distributed_sparse_matrix &a, B &b) { rng::copy(b.begin(), b.end(), broadcasted_b); } communicator.bcast(broadcasted_b, a.shape().second * sizeof(T), root); - a.local_gemv_and_collect(root, res, broadcasted_b); alloc.deallocate(broadcasted_b, a.shape().second); // a.fence(); diff --git a/include/dr/mp/containers/distributed_sparse_matrix.hpp b/include/dr/mp/containers/distributed_sparse_matrix.hpp index 54610a05e1..0b8412331b 100644 --- a/include/dr/mp/containers/distributed_sparse_matrix.hpp +++ b/include/dr/mp/containers/distributed_sparse_matrix.hpp @@ -11,7 +11,7 @@ namespace dr::mp { template concept matrix_distibution = - requires(T t) { + requires(T t, std::vector res, int* input) { {t.fence()} -> std::same_as; { t.segments() } -> rng::random_access_range; {t.shape().first} -> std::convertible_to; @@ -20,9 +20,10 @@ concept matrix_distibution = {t.get_segment_from_offset(int())} -> std::same_as; {t.get_id_in_segment(int())} -> std::same_as; T(dr::views::csr_matrix_view(), distribution()); + t.local_gemv_and_collect(int(), res, input); }; -template > +template > requires(matrix_distibution) class distributed_sparse_matrix { diff --git 
a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp index 5d384b1adf..02c7229d0a 100644 --- a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp @@ -54,11 +54,57 @@ class csr_eq_distribution { if (nnz_ <= segment_size_ * rank) { return; } - // if (dr::mp::use_sycl()) { - - // } - // else { - auto size = row_sizes_[rank]; + auto size = row_sizes_[rank]; + if (dr::mp::use_sycl()) { + auto localVals = dr::__detail::direct_iterator(dr::mp::local_segment(*vals_data_).begin()); + auto localCols = dr::__detail::direct_iterator(dr::mp::local_segment(*cols_data_).begin()); + auto offset = rank * segment_size_; + auto real_segment_size = std::min(nnz_ - rank * segment_size_, segment_size_); + auto local_data = rows_data_; + // dr::mp::sycl_queue().submit([&](auto& cgh) { + // cgh.parallel_for(sycl::range<1> { real_segment_size }, + // [=](auto idx) { + // auto colNum = localCols[idx]; + // auto matrixVal = vals[colNum]; + // auto vectorVal = localVals[idx]; + // auto row = std::distance(std::upper_bound(local_data, local_data + row_size, offset + idx), local_data) - 1; + // *(res + row) += matrixVal * vectorVal; + // }); + // }).wait(); + auto one_computation_size = (real_segment_size + max_row_size_ - 1) / max_row_size_; + auto row_size = row_size_; + dr::mp::sycl_queue().submit([&](auto& cgh) { + cgh.parallel_for(sycl::range<1> { max_row_size_ }, + [=](auto idx) { + std::size_t lower_bound = one_computation_size * idx; + std::size_t upper_bound = std::min(one_computation_size * (idx + 1), real_segment_size); + std::size_t position = lower_bound + offset; + std::size_t first_row = std::distance(local_data, std::upper_bound(local_data, local_data + row_size, position) - 1); + auto row = first_row; + T sum = 0; + for (auto i = lower_bound; i < upper_bound; i++) { + while (row + 1 < row_size && local_data[row + 1] <= 
offset + i) { + sycl::atomic_ref + c_ref(res[row]); + c_ref += sum; + row++; + sum = 0; + } + auto colNum = localCols[i]; + auto matrixVal = vals[colNum]; + auto vectorVal = localVals[i]; + + sum += matrixVal * vectorVal; + } + sycl::atomic_ref + c_ref(res[row]); + c_ref += sum; + }); + }).wait(); + } + else { auto row_i = -1; auto position = segment_size_ * rank; auto elem_count = std::min(segment_size_, nnz_ - segment_size_ * rank); @@ -78,18 +124,25 @@ class csr_eq_distribution { // for (int i = 0; i < size; i++) { // fmt::print("ledata, rank, i {} {} {}\n", res[i], rows_backend_.getrank(), i); // } - // } + } } template auto local_gemv_and_collect(std::size_t root, C &res, A &vals) const { assert(res.size() == shape_.first); __detail::allocator alloc; - auto res_alloc = alloc.allocate(shape_.first); + auto res_alloc = alloc.allocate(max_row_size_); + for (auto i = 0; i < max_row_size_; i++) { + res_alloc[i] = 0; + } + auto begin = std::chrono::high_resolution_clock::now(); local_gemv(res_alloc, vals); + auto end = std::chrono::high_resolution_clock::now(); + double duration = std::chrono::duration(end - begin).count(); + fmt::print("eq gemv time {}\n", duration * 1000); gather_gemv_vector(root, res, res_alloc); - alloc.deallocate(res_alloc, shape_.first); + alloc.deallocate(res_alloc, max_row_size_); } private: friend csr_eq_segment_iterator; @@ -99,20 +152,25 @@ class csr_eq_distribution { auto communicator = default_comm(); __detail::allocator alloc; if (communicator.rank() == root) { - auto gathered_res = alloc.allocate(shape_.first * communicator.size()); - communicator.gather(partial_res, gathered_res, shape_.first, root); + auto gathered_res = alloc.allocate(max_row_size_ * communicator.size()); + communicator.gather(partial_res, gathered_res, max_row_size_, root); rng::fill(res, 0); + + // auto begin = std::chrono::high_resolution_clock::now(); for (auto i = 0; i < communicator.size(); i++) { auto first_row = row_offsets_[i]; auto last_row = 
row_offsets_[i] + row_sizes_[i]; for (auto j = first_row; j < last_row; j++) { - res[j] += gathered_res[shape_.first * i + j - first_row]; + res[j] += gathered_res[max_row_size_ * i + j - first_row]; } } - alloc.deallocate(gathered_res, shape_.first * communicator.size()); + // auto end = std::chrono::high_resolution_clock::now(); + // double duration = std::chrono::duration(end - begin).count(); + // fmt::print("gather time {}\n", duration); + alloc.deallocate(gathered_res, max_row_size_ * communicator.size()); } else { - communicator.gather(partial_res, static_cast(nullptr), shape_.first, root); + communicator.gather(partial_res, static_cast(nullptr), max_row_size_, root); } } @@ -148,6 +206,7 @@ class csr_eq_distribution { auto higher_limit = std::distance(csr_view.rowptr_data(), std::upper_bound(csr_view.rowptr_data(), csr_view.rowptr_data() + shape_[0], last_index)); row_offsets_.push_back(lower_limit); row_sizes_.push_back(higher_limit - lower_limit); + max_row_size_ = std::max(max_row_size_, row_sizes_.back()); } auto lower_limit = row_offsets_[rank]; @@ -182,6 +241,7 @@ class csr_eq_distribution { std::size_t segment_size_ = 0; std::size_t row_size_ = 0; + std::size_t max_row_size_ = 0; std::vector row_offsets_; std::vector row_sizes_; diff --git a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp index 1c55e06628..8ac447ae3f 100644 --- a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp @@ -58,12 +58,35 @@ template auto local_gemv(C &res, A &vals) const { auto rank = cols_backend_.getrank(); if (shape_[0] <= segment_size_ * rank) return; - // if (dr::mp::use_sycl()) { - - // } - // else { + auto size = std::min(segment_size_, shape_[0] - segment_size_ * rank); + if (dr::mp::use_sycl()) { + auto local_vals = vals_data_; + auto local_cols = cols_data_; + auto offset = val_offsets_[rank]; + 
auto real_segment_size = std::min(nnz_ - offset, val_sizes_[rank]); + auto rows_data = dr::__detail::direct_iterator(dr::mp::local_segment(*rows_data_).begin()); + dr::mp::sycl_queue().submit([&](auto& cgh) { + cgh.parallel_for(sycl::range<1> { size }, + [=](auto idx) { + std::size_t lower_bound = 0; + if (rows_data[idx] > offset) { + lower_bound = rows_data[idx] - offset; + } + std::size_t upper_bound = real_segment_size; + if (idx < size - 1) { + upper_bound = rows_data[idx + 1] - offset; + } + for (auto i = lower_bound; i < upper_bound; i++) { + auto colNum = local_cols[i]; + auto matrixVal = vals[colNum]; + auto vectorVal = local_vals[i]; + *(res + idx) += matrixVal * vectorVal; + } + }); + }).wait(); + } + else { auto local_rows = dr::mp::local_segment(*rows_data_); - auto size = std::min(segment_size_, shape_[0] - segment_size_ * rank); auto val_count = val_sizes_[rank]; auto row_i = 0; auto position = val_offsets_[rank]; @@ -76,7 +99,7 @@ template } res[row_i] += vals_data_[i] * vals[cols_data_[i]]; } - // } + } } template @@ -84,7 +107,15 @@ template assert(res.size() == shape_.first); __detail::allocator alloc; auto res_alloc = alloc.allocate(segment_size_); + for (auto i = 0; i < segment_size_; i++) { + res_alloc[i] = 0; + } + + auto begin = std::chrono::high_resolution_clock::now(); local_gemv(res_alloc, vals); + auto end = std::chrono::high_resolution_clock::now(); + double duration = std::chrono::duration(end - begin).count(); + fmt::print("rows gemv time {}\n", duration * 1000); gather_gemv_vector(root, res, res_alloc); alloc.deallocate(res_alloc, segment_size_); diff --git a/include/dr/sp/util/matrix_io.hpp b/include/dr/sp/util/matrix_io.hpp index bc6f51176b..6f71923b59 100644 --- a/include/dr/sp/util/matrix_io.hpp +++ b/include/dr/sp/util/matrix_io.hpp @@ -59,6 +59,13 @@ auto create_distributed(dr::views::csr_matrix_view local_mat, } +template +auto create_distributed(dr::views::csr_matrix_view local_mat) { + return create_distributed(local_mat, + 
dr::sp::block_cyclic({dr::sp::tile::div, dr::sp::tile::div}, + {dr::sp::nprocs(), 1})); +} + template auto mmread(std::string file_path, const matrix_partition &partition, bool one_indexed = true) { From 756fab1ba54305a22d1ae007e809a37da193e1d9 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Wed, 4 Sep 2024 13:04:52 +0200 Subject: [PATCH 09/68] Added initialization from one node in mp sparse matrix --- examples/mp/sparse_matrix.cpp | 44 ++++++++- .../containers/distributed_sparse_matrix.hpp | 2 +- .../matrix_formats/csr_eq_distribution.hpp | 95 +++++++++++------- .../matrix_formats/csr_row_distribution.hpp | 96 ++++++++++++------- include/dr/sp/util/matrix_io.hpp | 2 +- 5 files changed, 163 insertions(+), 76 deletions(-) diff --git a/examples/mp/sparse_matrix.cpp b/examples/mp/sparse_matrix.cpp index 55cf0e7cc9..cb42afc823 100644 --- a/examples/mp/sparse_matrix.cpp +++ b/examples/mp/sparse_matrix.cpp @@ -16,16 +16,22 @@ int main(int argc, char **argv) { } std::string fname(argv[1]); - auto local_data = dr::read_csr(fname); #ifdef SYCL_LANGUAGE_VERSION mp::init(sycl::default_selector_v); #else mp::init(); #endif + dr::views::csr_matrix_view local_data; + auto root = 0; + if (root == dr::mp::default_comm().rank()) { + local_data = dr::read_csr(fname); + } { - mp::distributed_sparse_matrix> m(local_data); - mp::distributed_sparse_matrix> m_row(local_data); + fmt::print("started\n"); + mp::distributed_sparse_matrix> m(local_data, root); + fmt::print("hihihih\n"); + mp::distributed_sparse_matrix> m_row(local_data, root); fmt::print("{}\n", m.size()); // for (int i = 0; i < dr::mp::default_comm().size(); i++) { // if (dr::mp::default_comm().rank() == i) { @@ -56,9 +62,34 @@ int main(int argc, char **argv) { a[i] = i; } m.fence(); - gemv(0, res, m, a); + double total_time = 0; + auto N = 10; + gemv(0, res, m, a); // it is here to prepare sycl for work + for (int i = 0; i < N; i++) { + auto begin = std::chrono::high_resolution_clock::now(); + gemv(0, res, m, a); + auto 
end = std::chrono::high_resolution_clock::now(); + double duration = std::chrono::duration(end - begin).count(); + total_time += duration; + if (i % 10 == 0 && dr::mp::default_comm().rank() == 0) { + fmt::print("eq canary {}\n", duration); + } + } + fmt::print("eq gemv time {}\n", total_time * 1000 / N); m.fence(); + total_time = 0; gemv(0, res_row, m_row, a); + for (int i = 0; i < N; i++) { + auto begin = std::chrono::high_resolution_clock::now(); + gemv(0, res_row, m_row, a); + auto end = std::chrono::high_resolution_clock::now(); + double duration = std::chrono::duration(end - begin).count(); + total_time += duration; + if (i % 10 == 0 && dr::mp::default_comm().rank() == 0) { + fmt::print("row canary {}\n", duration); + } + } + fmt::print("row gemv time {}\n", total_time * 1000 / N); m_row.fence(); std::vector ref(m.shape().first); @@ -80,7 +111,10 @@ int main(int argc, char **argv) { } } } - dr::__detail::destroy_csr_matrix_view(local_data, std::allocator{}); + + if (root == dr::mp::default_comm().rank()) { + dr::__detail::destroy_csr_matrix_view(local_data, std::allocator{}); + } mp::finalize(); return 0; diff --git a/include/dr/mp/containers/distributed_sparse_matrix.hpp b/include/dr/mp/containers/distributed_sparse_matrix.hpp index 0b8412331b..fcbd6d7650 100644 --- a/include/dr/mp/containers/distributed_sparse_matrix.hpp +++ b/include/dr/mp/containers/distributed_sparse_matrix.hpp @@ -123,7 +123,7 @@ class distributed_sparse_matrix { distributed_sparse_matrix(distributed_sparse_matrix &&) { assert(false); } /// Constructor - distributed_sparse_matrix(dr::views::csr_matrix_view csr_view, distribution dist = distribution()): distribution_(csr_view, dist) {} + distributed_sparse_matrix(dr::views::csr_matrix_view csr_view, std::size_t root = 0, distribution dist = distribution()): distribution_(csr_view, dist, root) {} /// Returns iterator to beginning auto begin() const { return iterator(this, 0); } diff --git 
a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp index 02c7229d0a..5c8eb42bbb 100644 --- a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp @@ -21,8 +21,8 @@ class csr_eq_distribution { csr_eq_distribution(csr_eq_distribution &&) { assert(false); } /// Constructor - csr_eq_distribution(dr::views::csr_matrix_view csr_view, distribution dist = distribution()) { - init(csr_view, dist); + csr_eq_distribution(dr::views::csr_matrix_view csr_view, distribution dist = distribution(), std::size_t root = 0) { + init(csr_view, dist, root); } ~csr_eq_distribution() { @@ -178,47 +178,71 @@ class csr_eq_distribution { return row_sizes_[rank]; } - void init(dr::views::csr_matrix_view csr_view, auto dist) { - nnz_ = csr_view.size(); + void init(dr::views::csr_matrix_view csr_view, auto dist, std::size_t root) { distribution_ = dist; - shape_ = csr_view.shape(); - // determine the distribution of data - // auto hb = dist.halo(); - std::size_t gran = dist.granularity(); - // TODO: make this an error that is reported back to user - assert(nnz_ % gran == 0 && "size must be a multiple of the granularity"); - // assert(hb.prev % gran == 0 && "size must be a multiple of the granularity"); - // assert(hb.next % gran == 0 && "size must be a multiple of the granularity"); + auto rank = rows_backend_.getrank(); + std::size_t initial_data[3]; + if (root == rank) { + initial_data[0] = csr_view.size(); + initial_data[1] = csr_view.shape().first; + initial_data[2] = csr_view.shape().second; + default_comm().bcast(initial_data, sizeof(std::size_t) * 3, root); + } + else { + default_comm().bcast(initial_data, sizeof(std::size_t) * 3, root); + } - auto rank = rows_backend_.getrank(); + nnz_ = initial_data[0]; + shape_ = {initial_data[1], initial_data[2]}; vals_data_ = std::make_shared>(nnz_); cols_data_ = std::make_shared>(nnz_); + 
dr::mp::copy(root, std::ranges::subrange(csr_view.values_data(), csr_view.values_data() + nnz_), vals_data_->begin()); + dr::mp::copy(root, std::ranges::subrange(csr_view.colind_data(), csr_view.colind_data() + nnz_), cols_data_->begin()); - dr::mp::copy(std::ranges::subrange(csr_view.values_data(), csr_view.values_data() + nnz_), vals_data_->begin()); - dr::mp::copy(std::ranges::subrange(csr_view.colind_data(), csr_view.colind_data() + nnz_), cols_data_->begin()); - - assert(*csr_view.rowptr_data() == 0); - for (int i = 0; i < default_comm().size(); i++) { - auto first_index = vals_data_->get_segment_offset(i); - auto last_index = vals_data_->get_segment_offset(i + 1) - 1; - auto lower_limit = std::distance(csr_view.rowptr_data(), std::upper_bound(csr_view.rowptr_data(), csr_view.rowptr_data() + shape_[0], first_index)) - 1; - auto higher_limit = std::distance(csr_view.rowptr_data(), std::upper_bound(csr_view.rowptr_data(), csr_view.rowptr_data() + shape_[0], last_index)); - row_offsets_.push_back(lower_limit); - row_sizes_.push_back(higher_limit - lower_limit); - max_row_size_ = std::max(max_row_size_, row_sizes_.back()); + auto row_info_size = default_comm().size() * 2 + 1; + __detail::allocator alloc; + std::size_t* row_information = new std::size_t[row_info_size]; + row_offsets_.reserve(default_comm().size()); + row_sizes_.reserve(default_comm().size()); + if (root == default_comm().rank()) { + for (int i = 0; i < default_comm().size(); i++) { + auto first_index = vals_data_->get_segment_offset(i); + auto last_index = vals_data_->get_segment_offset(i + 1) - 1; + auto lower_limit = std::distance(csr_view.rowptr_data(), std::upper_bound(csr_view.rowptr_data(), csr_view.rowptr_data() + shape_[0], first_index)) - 1; + auto higher_limit = std::distance(csr_view.rowptr_data(), std::upper_bound(csr_view.rowptr_data(), csr_view.rowptr_data() + shape_[0], last_index)); + row_offsets_.push_back(lower_limit); + row_sizes_.push_back(higher_limit - lower_limit); + 
row_information[i] = lower_limit; + row_information[default_comm().size() + i] = higher_limit - lower_limit; + max_row_size_ = std::max(max_row_size_, row_sizes_.back()); + } + row_information[default_comm().size() * 2] = max_row_size_; + default_comm().bcast(row_information, sizeof(std::size_t) * row_info_size, root); } - - auto lower_limit = row_offsets_[rank]; - row_size_ = row_sizes_[rank]; - if (row_size_ != get_row_size(rank)) { - fmt::print("hmmmm? {} {} {} {}\n", rank, lower_limit, row_size_, get_row_size(rank)); + else { + default_comm().bcast(row_information, sizeof(std::size_t) * row_info_size, root); + for (int i = 0; i < default_comm().size(); i++) { + row_offsets_.push_back(row_information[i]); + row_sizes_.push_back(row_information[default_comm().size() + i]); + } + max_row_size_ = row_information[default_comm().size() * 2]; } - - if (row_size_ > 0) { - rows_data_ = static_cast(rows_backend_.allocate(row_size_ * sizeof(I))); - std::copy(csr_view.rowptr_data() + lower_limit, csr_view.rowptr_data() + lower_limit + row_size_, rows_data_); + delete[] row_information; + row_size_ = std::max(row_sizes_[rank], static_cast(1)); + rows_data_ = static_cast(rows_backend_.allocate(row_size_ * sizeof(I))); + + fence(); + if (rank == root) { + for (std::size_t i = 0; i < default_comm().size(); i++) { + auto lower_limit = row_offsets_[i]; + auto row_size = row_sizes_[i]; + if (row_size > 0) { + rows_backend_.putmem(csr_view.rowptr_data() + lower_limit, 0, row_size * sizeof(I), i); + } + } } + std::size_t segment_index = 0; segment_size_ = vals_data_->segment_size(); assert(segment_size_ == cols_data_->segment_size()); @@ -226,7 +250,7 @@ class csr_eq_distribution { segments_.emplace_back(this, segment_index++, std::min(segment_size_, nnz_ - i), segment_size_); } - + fence(); // for (int i = 0; i < row_size_; i++) { // fmt::print("row, i, rank {} {} {}\n", rows_data_[i], i, rank); // } @@ -235,7 +259,6 @@ class csr_eq_distribution { // fmt::print("val, col, i, rank 
{} {} {} {}\n", vals_data_->segments()[rank][i], cols_data_->segments()[rank][i],i, rank); // } - fence(); } diff --git a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp index 8ac447ae3f..789b85951c 100644 --- a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp @@ -22,8 +22,8 @@ class csr_row_distribution { csr_row_distribution(csr_row_distribution &&) { assert(false); } /// Constructor - csr_row_distribution(dr::views::csr_matrix_view csr_view, distribution dist = distribution()) { - init(csr_view, dist); + csr_row_distribution(dr::views::csr_matrix_view csr_view, distribution dist = distribution(), std::size_t root = 0) { + init(csr_view, dist, root); } ~csr_row_distribution() { @@ -69,6 +69,7 @@ template cgh.parallel_for(sycl::range<1> { size }, [=](auto idx) { std::size_t lower_bound = 0; + T sum = 0; if (rows_data[idx] > offset) { lower_bound = rows_data[idx] - offset; } @@ -80,8 +81,9 @@ template auto colNum = local_cols[i]; auto matrixVal = vals[colNum]; auto vectorVal = local_vals[i]; - *(res + idx) += matrixVal * vectorVal; + sum += matrixVal * vectorVal; } + *(res + idx) += sum; }); }).wait(); } @@ -137,50 +139,78 @@ template communicator.gather(partial_res, static_cast(nullptr), segment_size_, root); } } - void init(dr::views::csr_matrix_view csr_view, auto dist) { - nnz_ = csr_view.size(); + void init(dr::views::csr_matrix_view csr_view, auto dist, std::size_t root) { distribution_ = dist; - shape_ = csr_view.shape(); - // determine the distribution of data - // auto hb = dist.halo(); - std::size_t gran = dist.granularity(); - // TODO: make this an error that is reported back to user - assert(nnz_ % gran == 0 && "size must be a multiple of the granularity"); - // assert(hb.prev % gran == 0 && "size must be a multiple of the granularity"); - // assert(hb.next % gran == 0 && "size must be 
a multiple of the granularity"); + auto rank = vals_backend_.getrank(); + std::size_t initial_data[3]; + if (root == rank) { + initial_data[0] = csr_view.size(); + initial_data[1] = csr_view.shape().first; + initial_data[2] = csr_view.shape().second; + default_comm().bcast(initial_data, sizeof(std::size_t) * 3, root); + } + else { + default_comm().bcast(initial_data, sizeof(std::size_t) * 3, root); + } + + nnz_ = initial_data[0]; + shape_ = {initial_data[1], initial_data[2]}; - auto rank = vals_backend_.getrank(); rows_data_ = std::make_shared>(shape_.first); - dr::mp::copy(std::ranges::subrange(csr_view.rowptr_data(), csr_view.rowptr_data() + shape_.first), rows_data_->begin()); + dr::mp::copy(root, std::ranges::subrange(csr_view.rowptr_data(), csr_view.rowptr_data() + shape_.first), rows_data_->begin()); - assert(*csr_view.rowptr_data() == 0); - for (int i = 0; i < default_comm().size(); i++) { - auto first_index = rows_data_->get_segment_offset(i); - if (first_index > shape_.first) { - val_offsets_.push_back(nnz_); - val_sizes_.push_back(0); - continue; + auto row_info_size = default_comm().size() * 2; + std::size_t* val_information = new std::size_t[row_info_size]; + val_offsets_.reserve(row_info_size); + val_sizes_.reserve(row_info_size); + if (rank == root) { + for (int i = 0; i < default_comm().size(); i++) { + auto first_index = rows_data_->get_segment_offset(i); + if (first_index > shape_.first) { + val_offsets_.push_back(nnz_); + val_sizes_.push_back(0); + continue; + } + std::size_t lower_limit = csr_view.rowptr_data()[first_index]; + std::size_t higher_limit = nnz_; + if (rows_data_->get_segment_offset(i + 1) < shape_.first) { + auto last_index = rows_data_->get_segment_offset(i + 1); + higher_limit = csr_view.rowptr_data()[last_index]; + } + val_offsets_.push_back(lower_limit); + val_sizes_.push_back(higher_limit - lower_limit); + val_information[i] = lower_limit; + val_information[i + default_comm().size()] = higher_limit - lower_limit; } - 
std::size_t lower_limit = csr_view.rowptr_data()[first_index]; - std::size_t higher_limit = nnz_; - if (rows_data_->get_segment_offset(i + 1) < shape_.first) { - auto last_index = rows_data_->get_segment_offset(i + 1); - higher_limit = csr_view.rowptr_data()[last_index]; + default_comm().bcast(val_information, sizeof(std::size_t) * row_info_size, root); + } + else { + default_comm().bcast(val_information, sizeof(std::size_t) * row_info_size, root); + for (int i = 0; i < default_comm().size(); i++) { + val_offsets_.push_back(val_information[i]); + val_sizes_.push_back(val_information[default_comm().size() + i]); } - val_offsets_.push_back(lower_limit); - val_sizes_.push_back(higher_limit - lower_limit); } - - auto lower_limit = val_offsets_[rank]; + delete[] val_information; vals_size_ = std::max(val_sizes_[rank], static_cast(1)); // fmt::print("dfsa {} {} {} {}\n", vals_size_, val_sizes_[rank],lower_limit, rank); cols_data_ = static_cast(cols_backend_.allocate(vals_size_ * sizeof(I))); vals_data_ = static_cast(vals_backend_.allocate(vals_size_ * sizeof(T))); - std::copy(csr_view.values_data() + lower_limit, csr_view.values_data() + lower_limit + vals_size_, vals_data_); - std::copy(csr_view.colind_data() + lower_limit, csr_view.colind_data() + lower_limit + vals_size_, cols_data_); + + fence(); + if (rank == root) { + for (std::size_t i = 0; i < default_comm().size(); i++) { + auto lower_limit = val_offsets_[i]; + auto row_size = val_sizes_[i]; + if (row_size > 0) { + vals_backend_.putmem(csr_view.values_data() + lower_limit, 0, row_size * sizeof(T), i); + cols_backend_.putmem(csr_view.colind_data() + lower_limit, 0, row_size * sizeof(I), i); + } + } + } std::size_t segment_index = 0; segment_size_ = rows_data_->segment_size(); diff --git a/include/dr/sp/util/matrix_io.hpp b/include/dr/sp/util/matrix_io.hpp index 6f71923b59..9cbb6b91eb 100644 --- a/include/dr/sp/util/matrix_io.hpp +++ b/include/dr/sp/util/matrix_io.hpp @@ -69,7 +69,7 @@ auto 
create_distributed(dr::views::csr_matrix_view local_mat) { template auto mmread(std::string file_path, const matrix_partition &partition, bool one_indexed = true) { - auto local_mat = read_csr(file_path, one_indexed); + auto local_mat = read_csr(file_path, one_indexed); auto a = create_distributed(local_mat, partition); From 0b498ad31e06e4262cad08933e5d0a46f4b15f67 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Wed, 4 Sep 2024 13:25:09 +0200 Subject: [PATCH 10/68] Add concept requirement for gemv operation --- include/dr/mp/algorithms/matrix/gemv.hpp | 1 + include/dr/mp/containers/distributed_sparse_matrix.hpp | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/include/dr/mp/algorithms/matrix/gemv.hpp b/include/dr/mp/algorithms/matrix/gemv.hpp index bd3d54d58b..8541a1564a 100644 --- a/include/dr/mp/algorithms/matrix/gemv.hpp +++ b/include/dr/mp/algorithms/matrix/gemv.hpp @@ -12,6 +12,7 @@ namespace dr::mp { template C, rng::input_range B, typename Backend, typename MatDistr> +requires(vector_multiplicable) void gemv(int root, C &res, distributed_sparse_matrix &a, B &b) { if (default_comm().rank() == root) { assert(a.shape().first == res.size()); diff --git a/include/dr/mp/containers/distributed_sparse_matrix.hpp b/include/dr/mp/containers/distributed_sparse_matrix.hpp index fcbd6d7650..d491d1b4b7 100644 --- a/include/dr/mp/containers/distributed_sparse_matrix.hpp +++ b/include/dr/mp/containers/distributed_sparse_matrix.hpp @@ -20,6 +20,11 @@ concept matrix_distibution = {t.get_segment_from_offset(int())} -> std::same_as; {t.get_id_in_segment(int())} -> std::same_as; T(dr::views::csr_matrix_view(), distribution()); + }; + +template +concept vector_multiplicable = + requires(T t, std::vector res, int* input) { t.local_gemv_and_collect(int(), res, input); }; @@ -145,6 +150,7 @@ class distributed_sparse_matrix { } template + requires(vector_multiplicable) auto local_gemv_and_collect(std::size_t root, C &res, A &vals) const { 
distribution_.local_gemv_and_collect(root, res, vals); } From bbf2acf20d6e7c25e115f36a767df8e88bc9ebaa Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Wed, 4 Sep 2024 16:51:53 +0200 Subject: [PATCH 11/68] Initial improvement to matrix reading --- include/dr/detail/local_csr_matrix.hpp | 90 +++++++++++++++++++ include/dr/detail/matrix_io.hpp | 64 ++++++++----- .../matrix_formats/csr_eq_distribution.hpp | 8 +- .../matrix_formats/csr_row_distribution.hpp | 8 +- 4 files changed, 139 insertions(+), 31 deletions(-) create mode 100644 include/dr/detail/local_csr_matrix.hpp diff --git a/include/dr/detail/local_csr_matrix.hpp b/include/dr/detail/local_csr_matrix.hpp new file mode 100644 index 0000000000..f91236a834 --- /dev/null +++ b/include/dr/detail/local_csr_matrix.hpp @@ -0,0 +1,90 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +#include +#include +#include + +namespace dr { + +namespace __detail { + +template > +class local_csr_matrix { +public: + using value_type = dr::matrix_entry; + using scalar_type = T; + using index_type = I; + using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + + using allocator_type = Allocator; + + using key_type = dr::index; + using map_type = T; + + using backend_allocator_type = typename std::allocator_traits< + allocator_type>::template rebind_alloc; + using aggregator_allocator_type = typename std::allocator_traits< + allocator_type>::template rebind_alloc>; + using row_type = std::vector; + using backend_type = std::vector; + + using iterator = typename backend_type::iterator; + using const_iterator = typename backend_type::const_iterator; + + local_csr_matrix(dr::index shape) : shape_(shape) { + for (std::size_t i = 0; i < shape.first; i++) { + tuples_.push_back(row_type()); + } + } + + dr::index shape() const noexcept { return shape_; } + + size_type size() const noexcept { return size_; } + + iterator begin() noexcept { return tuples_.begin(); 
} + + const_iterator begin() const noexcept { return tuples_.begin(); } + + iterator end() noexcept { return tuples_.end(); } + + const_iterator end() const noexcept { return tuples_.end(); } + + template void push_back(InputIt first, InputIt last) { + for (auto iter = first; iter != last; ++iter) { + push_back(*iter); + } + } + + void push_back(const value_type &value) { + size_++; + tuples_[value.index().first].push_back(value); + } + + + void sort() { + for (auto &elem: tuples_) { + std::sort(elem.begin(), elem.end()); + } + } + + local_csr_matrix() = default; + ~local_csr_matrix() = default; + local_csr_matrix(const local_csr_matrix &) = default; + local_csr_matrix(local_csr_matrix &&) = default; + local_csr_matrix &operator=(const local_csr_matrix &) = default; + local_csr_matrix &operator=(local_csr_matrix &&) = default; + +private: + std::size_t size_ = 0; + dr::index shape_; + backend_type tuples_; +}; + +} // namespace __detail + +} // namespace dr diff --git a/include/dr/detail/matrix_io.hpp b/include/dr/detail/matrix_io.hpp index 83c8aa49cb..7a77343076 100644 --- a/include/dr/detail/matrix_io.hpp +++ b/include/dr/detail/matrix_io.hpp @@ -13,7 +13,7 @@ #include #include -#include +#include #include namespace dr { @@ -27,7 +27,7 @@ namespace __detail { template auto convert_to_csr(Tuples &&tuples, dr::index<> shape, std::size_t nnz, Allocator &&allocator) { - auto &&[index, v] = *tuples.begin(); + auto &&[index, v] = *tuples.begin()->begin(); auto &&[i, j] = index; using T = std::remove_reference_t; @@ -45,19 +45,21 @@ auto convert_to_csr(Tuples &&tuples, dr::index<> shape, std::size_t nnz, std::size_t r = 0; std::size_t c = 0; for (auto iter = tuples.begin(); iter != tuples.end(); ++iter) { - auto &&[index, value] = *iter; - auto &&[i, j] = index; - - values[c] = value; - colind[c] = j; - - while (r < i) { - assert(r + 1 <= shape[0]); - // throw std::runtime_error("csr_matrix_impl_: given invalid matrix"); - rowptr[r + 1] = c; - r++; + for (auto iter2 = 
iter->begin(); iter2 != iter->end(); ++iter2) { + auto &&[index, value] = *iter2; + auto &&[i, j] = index; + + values[c] = value; + colind[c] = j; + + while (r < i) { + assert(r + 1 <= shape[0]); + // throw std::runtime_error("csr_matrix_impl_: given invalid matrix"); + rowptr[r + 1] = c; + r++; + } + c++; } - c++; assert(c <= nnz); // throw std::runtime_error("csr_matrix_impl_: given invalid matrix"); @@ -74,9 +76,10 @@ auto convert_to_csr(Tuples &&tuples, dr::index<> shape, std::size_t nnz, /// Read in the Matrix Market file at location `file_path` and a return /// a coo_matrix data structure with its contents. template -inline coo_matrix read_coo_matrix(std::string file_path, bool one_indexed = true) { +inline local_csr_matrix read_coo_matrix(std::string file_path, bool one_indexed = true) { using size_type = std::size_t; + auto begin = std::chrono::high_resolution_clock::now(); std::ifstream f; f.open(file_path.c_str()); @@ -146,12 +149,7 @@ inline coo_matrix read_coo_matrix(std::string file_path, bool one_indexed // NOTE for symmetric matrices: `nnz` holds the number of stored values in // the matrix market file, while `matrix.nnz_` will hold the total number of // stored values (including "mirrored" symmetric values). 
- coo_matrix matrix({m, n}); - if (symmetric) { - matrix.reserve(2 * nnz); - } else { - matrix.reserve(nnz); - } + local_csr_matrix matrix({m, n}); size_type c = 0; while (std::getline(f, buf)) { @@ -186,7 +184,16 @@ inline coo_matrix read_coo_matrix(std::string file_path, bool one_indexed "file, file has more nonzeros than reported."); } } - std::sort(matrix.begin(), matrix.end()); + + auto end = std::chrono::high_resolution_clock::now(); + double duration = std::chrono::duration(end - begin).count(); + fmt::print("No sort read time {}\n", duration * 1000); + + begin = std::chrono::high_resolution_clock::now(); + matrix.sort(); + end = std::chrono::high_resolution_clock::now(); + duration = std::chrono::duration(end - begin).count(); + fmt::print("Sort time {}\n", duration * 1000); f.close(); return matrix; @@ -206,10 +213,21 @@ void destroy_csr_matrix_view(dr::views::csr_matrix_view view, template auto read_csr(std::string file_path, bool one_indexed = true) { + auto begin = std::chrono::high_resolution_clock::now(); auto m = __detail::read_coo_matrix(file_path, one_indexed); + auto end = std::chrono::high_resolution_clock::now(); + double duration = std::chrono::duration(end - begin).count(); + fmt::print("Read time {}\n", duration * 1000); + auto shape = m.shape(); auto nnz = m.size(); - return __detail::convert_to_csr(m, shape, nnz, std::allocator{}); + begin = std::chrono::high_resolution_clock::now(); + auto t = __detail::convert_to_csr(m, shape, nnz, std::allocator{}); + end = std::chrono::high_resolution_clock::now(); + duration = std::chrono::duration(end - begin).count(); + fmt::print("Conversion time {}\n", duration * 1000); + + return t; } } \ No newline at end of file diff --git a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp index 5c8eb42bbb..5fa3ac3268 100644 --- a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp +++ 
b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp @@ -135,11 +135,11 @@ class csr_eq_distribution { for (auto i = 0; i < max_row_size_; i++) { res_alloc[i] = 0; } - auto begin = std::chrono::high_resolution_clock::now(); + // auto begin = std::chrono::high_resolution_clock::now(); local_gemv(res_alloc, vals); - auto end = std::chrono::high_resolution_clock::now(); - double duration = std::chrono::duration(end - begin).count(); - fmt::print("eq gemv time {}\n", duration * 1000); + // auto end = std::chrono::high_resolution_clock::now(); + // double duration = std::chrono::duration(end - begin).count(); + // fmt::print("eq gemv time {}\n", duration * 1000); gather_gemv_vector(root, res, res_alloc); alloc.deallocate(res_alloc, max_row_size_); diff --git a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp index 789b85951c..2baa48d740 100644 --- a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp @@ -113,11 +113,11 @@ template res_alloc[i] = 0; } - auto begin = std::chrono::high_resolution_clock::now(); + // auto begin = std::chrono::high_resolution_clock::now(); local_gemv(res_alloc, vals); - auto end = std::chrono::high_resolution_clock::now(); - double duration = std::chrono::duration(end - begin).count(); - fmt::print("rows gemv time {}\n", duration * 1000); + // auto end = std::chrono::high_resolution_clock::now(); + // double duration = std::chrono::duration(end - begin).count(); + // fmt::print("rows gemv time {}\n", duration * 1000); gather_gemv_vector(root, res, res_alloc); alloc.deallocate(res_alloc, segment_size_); From 9eca244b67c83f9c835290e7294e57e3b697c718 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Mon, 9 Sep 2024 10:21:06 +0200 Subject: [PATCH 12/68] Add small improvements to matrix loading --- include/dr/detail/local_csr_matrix.hpp | 18 +++++++++++------ 
include/dr/detail/matrix_io.hpp | 28 ++++++++++---------------- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/include/dr/detail/local_csr_matrix.hpp b/include/dr/detail/local_csr_matrix.hpp index f91236a834..a7ecbf6a78 100644 --- a/include/dr/detail/local_csr_matrix.hpp +++ b/include/dr/detail/local_csr_matrix.hpp @@ -7,6 +7,7 @@ #include #include #include +#include namespace dr { @@ -15,7 +16,7 @@ namespace __detail { template > class local_csr_matrix { public: - using value_type = dr::matrix_entry; + using value_type = std::pair; using scalar_type = T; using index_type = I; using size_type = std::size_t; @@ -36,9 +37,11 @@ class local_csr_matrix { using iterator = typename backend_type::iterator; using const_iterator = typename backend_type::const_iterator; - local_csr_matrix(dr::index shape) : shape_(shape) { + local_csr_matrix(dr::index shape, std::size_t nnz) : shape_(shape) { + auto average_size = nnz / shape.first / 2; for (std::size_t i = 0; i < shape.first; i++) { tuples_.push_back(row_type()); + tuples_.back().reserve(average_size); } } @@ -60,15 +63,18 @@ class local_csr_matrix { } } - void push_back(const value_type &value) { + void push_back(index_type row, const value_type &value) { + tuples_[row].push_back(value); size_++; - tuples_[value.index().first].push_back(value); - } + } void sort() { + auto comparator = [](auto &one, auto& two) { + return one.second < two.second; + }; for (auto &elem: tuples_) { - std::sort(elem.begin(), elem.end()); + std::sort(elem.begin(), elem.end(), comparator); } } diff --git a/include/dr/detail/matrix_io.hpp b/include/dr/detail/matrix_io.hpp index 7a77343076..a2d6337999 100644 --- a/include/dr/detail/matrix_io.hpp +++ b/include/dr/detail/matrix_io.hpp @@ -25,13 +25,12 @@ namespace __detail { // 2) `tuples` has shape `shape` // 3) `tuples` has `nnz` elements template -auto convert_to_csr(Tuples &&tuples, dr::index<> shape, std::size_t nnz, +auto convert_to_csr(Tuples &&csr_matrix, dr::index<> shape, 
std::size_t nnz, Allocator &&allocator) { - auto &&[index, v] = *tuples.begin()->begin(); - auto &&[i, j] = index; + auto &&[v, j] = *csr_matrix.begin()->begin(); using T = std::remove_reference_t; - using I = std::remove_reference_t; + using I = std::remove_reference_t; typename std::allocator_traits::template rebind_alloc i_allocator(allocator); @@ -44,22 +43,17 @@ auto convert_to_csr(Tuples &&tuples, dr::index<> shape, std::size_t nnz, std::size_t r = 0; std::size_t c = 0; - for (auto iter = tuples.begin(); iter != tuples.end(); ++iter) { + for (auto iter = csr_matrix.begin(); iter != csr_matrix.end(); ++iter) { for (auto iter2 = iter->begin(); iter2 != iter->end(); ++iter2) { - auto &&[index, value] = *iter2; - auto &&[i, j] = index; + auto &&[value, j] = *iter2; values[c] = value; colind[c] = j; - - while (r < i) { - assert(r + 1 <= shape[0]); - // throw std::runtime_error("csr_matrix_impl_: given invalid matrix"); - rowptr[r + 1] = c; - r++; - } c++; } + assert(r + 1 <= shape[0]); + rowptr[r + 1] = c; + r++; assert(c <= nnz); // throw std::runtime_error("csr_matrix_impl_: given invalid matrix"); @@ -149,7 +143,7 @@ inline local_csr_matrix read_coo_matrix(std::string file_path, bool one_in // NOTE for symmetric matrices: `nnz` holds the number of stored values in // the matrix market file, while `matrix.nnz_` will hold the total number of // stored values (including "mirrored" symmetric values). 
- local_csr_matrix matrix({m, n}); + local_csr_matrix matrix({m, n}, nnz); size_type c = 0; while (std::getline(f, buf)) { @@ -172,10 +166,10 @@ inline local_csr_matrix read_coo_matrix(std::string file_path, bool one_in "read_MatrixMarket: file has nonzero out of bounds."); } - matrix.push_back({{i, j}, v}); + matrix.push_back(i, {v, j}); if (symmetric && i != j) { - matrix.push_back({{j, i}, v}); + matrix.push_back(j, {v, i}); } c++; From 7b55a1b7b137fd8c528888c75097e74652760404 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Mon, 9 Sep 2024 12:41:35 +0200 Subject: [PATCH 13/68] Fix formatting --- examples/mp/CMakeLists.txt | 1 - include/dr/detail/communicator.hpp | 6 +- include/dr/detail/generate_random_csr.hpp | 2 +- include/dr/detail/local_csr_matrix.hpp | 11 +- include/dr/detail/matrix_entry.hpp | 11 +- include/dr/detail/matrix_io.hpp | 13 +- include/dr/mp/algorithms/matrix/gemv.hpp | 55 +-- .../containers/distributed_sparse_matrix.hpp | 60 +-- .../dr/mp/containers/distributed_vector.hpp | 4 +- .../matrix_formats/csr_eq_distribution.hpp | 366 ++++++++++-------- .../matrix_formats/csr_eq_segment.hpp | 61 +-- .../matrix_formats/csr_row_distribution.hpp | 299 +++++++------- .../matrix_formats/csr_row_segment.hpp | 58 +-- .../dr/sp/algorithms/matrix/local_gemv.hpp | 18 +- .../containers/distributed_dense_matrix.hpp | 2 +- include/dr/sp/containers/sparse_matrix.hpp | 4 +- include/dr/sp/util/matrix_io.hpp | 9 +- include/dr/views/csr_matrix_view.hpp | 2 +- 18 files changed, 523 insertions(+), 459 deletions(-) diff --git a/examples/mp/CMakeLists.txt b/examples/mp/CMakeLists.txt index 4d51ceb5f3..f97a527bbb 100644 --- a/examples/mp/CMakeLists.txt +++ b/examples/mp/CMakeLists.txt @@ -28,7 +28,6 @@ add_mp_example(stencil-1d-pointer) add_mp_example(hello_world) add_mp_example(sparse_matrix) - if(OpenMP_FOUND) add_executable(vector-add-ref vector-add-ref.cpp) target_link_libraries(vector-add-ref PRIVATE MPI::MPI_CXX OpenMP::OpenMP_CXX diff --git 
a/include/dr/detail/communicator.hpp b/include/dr/detail/communicator.hpp index ceb7141171..74dabe05cf 100644 --- a/include/dr/detail/communicator.hpp +++ b/include/dr/detail/communicator.hpp @@ -62,9 +62,9 @@ class communicator { } template - void gather(const T *src, T *dst, std::size_t count, - std::size_t root) const { - MPI_Gather_c(src, count * sizeof(T), MPI_BYTE, dst, count * sizeof(T), MPI_BYTE, root, mpi_comm_); + void gather(const T *src, T *dst, std::size_t count, std::size_t root) const { + MPI_Gather_c(src, count * sizeof(T), MPI_BYTE, dst, count * sizeof(T), + MPI_BYTE, root, mpi_comm_); } template diff --git a/include/dr/detail/generate_random_csr.hpp b/include/dr/detail/generate_random_csr.hpp index 45c175c3a8..fc84dd1263 100644 --- a/include/dr/detail/generate_random_csr.hpp +++ b/include/dr/detail/generate_random_csr.hpp @@ -89,4 +89,4 @@ auto generate_random_csr(dr::index shape, double density = 0.01, return dr::views::csr_matrix_view(values, rowptr, colind, shape, nnz, 0); } -} // namespace dr::sp +} // namespace dr diff --git a/include/dr/detail/local_csr_matrix.hpp b/include/dr/detail/local_csr_matrix.hpp index a7ecbf6a78..3eb44b6dca 100644 --- a/include/dr/detail/local_csr_matrix.hpp +++ b/include/dr/detail/local_csr_matrix.hpp @@ -5,9 +5,9 @@ #pragma once #include +#include #include #include -#include namespace dr { @@ -63,17 +63,16 @@ class local_csr_matrix { } } - void push_back(index_type row, const value_type &value) { - tuples_[row].push_back(value); + void push_back(index_type row, const value_type &value) { + tuples_[row].push_back(value); size_++; } - void sort() { - auto comparator = [](auto &one, auto& two) { + auto comparator = [](auto &one, auto &two) { return one.second < two.second; }; - for (auto &elem: tuples_) { + for (auto &elem : tuples_) { std::sort(elem.begin(), elem.end(), comparator); } } diff --git a/include/dr/detail/matrix_entry.hpp b/include/dr/detail/matrix_entry.hpp index 6b35e12a01..bffe596654 100644 --- 
a/include/dr/detail/matrix_entry.hpp +++ b/include/dr/detail/matrix_entry.hpp @@ -11,12 +11,11 @@ #include namespace dr { -template - concept getable = requires(T x) - { - std::get<0>(x); - std::get<1>(x); - }; +template +concept getable = requires(T x) { + std::get<0>(x); + std::get<1>(x); +}; template class matrix_entry { public: using index_type = I; diff --git a/include/dr/detail/matrix_io.hpp b/include/dr/detail/matrix_io.hpp index a2d6337999..eedb0d93b9 100644 --- a/include/dr/detail/matrix_io.hpp +++ b/include/dr/detail/matrix_io.hpp @@ -6,12 +6,12 @@ #include #include +#include #include #include #include #include #include -#include #include #include @@ -64,13 +64,14 @@ auto convert_to_csr(Tuples &&csr_matrix, dr::index<> shape, std::size_t nnz, } return dr::views::csr_matrix_view(values, rowptr, colind, - dr::index(shape[0], shape[1]), nnz, 0); + dr::index(shape[0], shape[1]), nnz, 0); } /// Read in the Matrix Market file at location `file_path` and a return /// a coo_matrix data structure with its contents. 
template -inline local_csr_matrix read_coo_matrix(std::string file_path, bool one_indexed = true) { +inline local_csr_matrix read_coo_matrix(std::string file_path, + bool one_indexed = true) { using size_type = std::size_t; auto begin = std::chrono::high_resolution_clock::now(); @@ -178,11 +179,11 @@ inline local_csr_matrix read_coo_matrix(std::string file_path, bool one_in "file, file has more nonzeros than reported."); } } - + auto end = std::chrono::high_resolution_clock::now(); double duration = std::chrono::duration(end - begin).count(); fmt::print("No sort read time {}\n", duration * 1000); - + begin = std::chrono::high_resolution_clock::now(); matrix.sort(); end = std::chrono::high_resolution_clock::now(); @@ -224,4 +225,4 @@ auto read_csr(std::string file_path, bool one_indexed = true) { return t; } -} \ No newline at end of file +} // namespace dr diff --git a/include/dr/mp/algorithms/matrix/gemv.hpp b/include/dr/mp/algorithms/matrix/gemv.hpp index 8541a1564a..eb152adac7 100644 --- a/include/dr/mp/algorithms/matrix/gemv.hpp +++ b/include/dr/mp/algorithms/matrix/gemv.hpp @@ -3,38 +3,39 @@ // SPDX-License-Identifier: BSD-3-Clause #pragma once -#include +#include #include #include -#include #include +#include namespace dr::mp { -template C, rng::input_range B, typename Backend, typename MatDistr> -requires(vector_multiplicable) -void gemv(int root, C &res, distributed_sparse_matrix &a, B &b) { - if (default_comm().rank() == root) { - assert(a.shape().first == res.size()); - assert(a.shape().second == b.size()); - } - // copy b to all machines - auto communicator = default_comm(); - __detail::allocator alloc; - auto broadcasted_b = alloc.allocate(a.shape().second); - if (communicator.rank() == root) { - rng::copy(b.begin(), b.end(), broadcasted_b); - } - communicator.bcast(broadcasted_b, a.shape().second * sizeof(T), root); - a.local_gemv_and_collect(root, res, broadcasted_b); - alloc.deallocate(broadcasted_b, a.shape().second); - // a.fence(); - // if 
(default_comm().rank() == root) { - // for (int i = 0; i < a.shape().first; i++) { - // fmt::print("Result {} {}\n", i, res[i]); - // } - // } - +template C, rng::input_range B, + typename Backend, typename MatDistr> + requires(vector_multiplicable) +void gemv(int root, C &res, + distributed_sparse_matrix &a, B &b) { + if (default_comm().rank() == root) { + assert(a.shape().first == res.size()); + assert(a.shape().second == b.size()); + } + // copy b to all machines + auto communicator = default_comm(); + __detail::allocator alloc; + auto broadcasted_b = alloc.allocate(a.shape().second); + if (communicator.rank() == root) { + rng::copy(b.begin(), b.end(), broadcasted_b); + } + communicator.bcast(broadcasted_b, a.shape().second * sizeof(T), root); + a.local_gemv_and_collect(root, res, broadcasted_b); + alloc.deallocate(broadcasted_b, a.shape().second); + // a.fence(); + // if (default_comm().rank() == root) { + // for (int i = 0; i < a.shape().first; i++) { + // fmt::print("Result {} {}\n", i, res[i]); + // } + // } } -} \ No newline at end of file +} // namespace dr::mp diff --git a/include/dr/mp/containers/distributed_sparse_matrix.hpp b/include/dr/mp/containers/distributed_sparse_matrix.hpp index d491d1b4b7..9e2b22aefc 100644 --- a/include/dr/mp/containers/distributed_sparse_matrix.hpp +++ b/include/dr/mp/containers/distributed_sparse_matrix.hpp @@ -2,34 +2,33 @@ // // SPDX-License-Identifier: BSD-3-Clause #pragma once +#include #include #include -#include -#include - +#include namespace dr::mp { template -concept matrix_distibution = - requires(T t, std::vector res, int* input) { - {t.fence()} -> std::same_as; - { t.segments() } -> rng::random_access_range; - {t.shape().first} -> std::convertible_to; - {t.shape().second} -> std::convertible_to; - {t.nnz()} -> std::same_as; - {t.get_segment_from_offset(int())} -> std::same_as; - {t.get_id_in_segment(int())} -> std::same_as; - T(dr::views::csr_matrix_view(), distribution()); - }; +concept matrix_distibution = 
requires(T t, std::vector res, int *input) { + { t.fence() } -> std::same_as; + { t.segments() } -> rng::random_access_range; + { t.shape().first } -> std::convertible_to; + { t.shape().second } -> std::convertible_to; + { t.nnz() } -> std::same_as; + { t.get_segment_from_offset(int()) } -> std::same_as; + { t.get_id_in_segment(int()) } -> std::same_as; + T(dr::views::csr_matrix_view(), + distribution()); +}; template -concept vector_multiplicable = - requires(T t, std::vector res, int* input) { - t.local_gemv_and_collect(int(), res, input); - }; +concept vector_multiplicable = requires(T t, std::vector res, int *input) { + t.local_gemv_and_collect(int(), res, input); +}; -template > -requires(matrix_distibution) +template > + requires(matrix_distibution) class distributed_sparse_matrix { public: @@ -124,11 +123,15 @@ class distributed_sparse_matrix { }; distributed_sparse_matrix(const distributed_sparse_matrix &) = delete; - distributed_sparse_matrix &operator=(const distributed_sparse_matrix &) = delete; + distributed_sparse_matrix & + operator=(const distributed_sparse_matrix &) = delete; distributed_sparse_matrix(distributed_sparse_matrix &&) { assert(false); } /// Constructor - distributed_sparse_matrix(dr::views::csr_matrix_view csr_view, std::size_t root = 0, distribution dist = distribution()): distribution_(csr_view, dist, root) {} + distributed_sparse_matrix(dr::views::csr_matrix_view csr_view, + std::size_t root = 0, + distribution dist = distribution()) + : distribution_(csr_view, dist, root) {} /// Returns iterator to beginning auto begin() const { return iterator(this, 0); } @@ -141,22 +144,19 @@ class distributed_sparse_matrix { auto shape() const { return distribution_.shape(); } /// Returns reference using index auto operator[](difference_type n) const { return *(begin() + n); } -// auto &halo() const { return *halo_; } TODO + // auto &halo() const { return *halo_; } TODO auto segments() const { return distribution_.segments(); } - void fence() { - 
distribution_.fence(); - } + void fence() { distribution_.fence(); } - template - requires(vector_multiplicable) + template + requires(vector_multiplicable) auto local_gemv_and_collect(std::size_t root, C &res, A &vals) const { distribution_.local_gemv_and_collect(root, res, vals); } private: MatrixDistrT distribution_; - }; -} \ No newline at end of file +} // namespace dr::mp diff --git a/include/dr/mp/containers/distributed_vector.hpp b/include/dr/mp/containers/distributed_vector.hpp index 79c1845289..822a1b1597 100644 --- a/include/dr/mp/containers/distributed_vector.hpp +++ b/include/dr/mp/containers/distributed_vector.hpp @@ -276,9 +276,7 @@ template class distributed_vector { void fence() { backend.fence(); } - auto segment_size() const { - return segment_size_; - } + auto segment_size() const { return segment_size_; } auto get_segment_offset(std::size_t segment_id) const { return segment_id * segment_size_; diff --git a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp index 5fa3ac3268..7968ff30f5 100644 --- a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp @@ -2,183 +2,196 @@ // // SPDX-License-Identifier: BSD-3-Clause #pragma once +#include #include -#include -#include +#include namespace dr::mp { -template +template class csr_eq_distribution { public: - using value_type = dr::matrix_entry; - using elem_type = T; - using index_type = I; - using difference_type = std::ptrdiff_t; + using value_type = dr::matrix_entry; + using elem_type = T; + using index_type = I; + using difference_type = std::ptrdiff_t; - csr_eq_distribution(const csr_eq_distribution &) = delete; - csr_eq_distribution &operator=(const csr_eq_distribution &) = delete; - csr_eq_distribution(csr_eq_distribution &&) { assert(false); } + csr_eq_distribution(const csr_eq_distribution &) = delete; + csr_eq_distribution 
&operator=(const csr_eq_distribution &) = delete; + csr_eq_distribution(csr_eq_distribution &&) { assert(false); } - /// Constructor - csr_eq_distribution(dr::views::csr_matrix_view csr_view, distribution dist = distribution(), std::size_t root = 0) { - init(csr_view, dist, root); - } + /// Constructor + csr_eq_distribution(dr::views::csr_matrix_view csr_view, + distribution dist = distribution(), + std::size_t root = 0) { + init(csr_view, dist, root); + } - ~csr_eq_distribution() { - if (!finalized()) { - fence(); - if (rows_data_ != nullptr) { - rows_backend_.deallocate(rows_data_, row_size_ * sizeof(index_type)); - } + ~csr_eq_distribution() { + if (!finalized()) { + fence(); + if (rows_data_ != nullptr) { + rows_backend_.deallocate(rows_data_, row_size_ * sizeof(index_type)); + } - // delete halo_; TODO - } + // delete halo_; TODO } - std::size_t get_id_in_segment(std::size_t offset) const { - return offset % segment_size_; - } - std::size_t get_segment_from_offset(std::size_t offset) const { - return offset / segment_size_; - } - auto segments() const { return rng::views::all(segments_); } - auto nnz() const {return nnz_;} - auto shape() const {return shape_;} - void fence() { - rows_backend_.fence(); + } + std::size_t get_id_in_segment(std::size_t offset) const { + return offset % segment_size_; + } + std::size_t get_segment_from_offset(std::size_t offset) const { + return offset / segment_size_; + } + auto segments() const { return rng::views::all(segments_); } + auto nnz() const { return nnz_; } + auto shape() const { return shape_; } + void fence() { rows_backend_.fence(); } + + template auto local_gemv(C &res, A &vals) const { + auto rank = rows_backend_.getrank(); + if (nnz_ <= segment_size_ * rank) { + return; } + auto size = row_sizes_[rank]; + if (dr::mp::use_sycl()) { + auto localVals = dr::__detail::direct_iterator( + dr::mp::local_segment(*vals_data_).begin()); + auto localCols = dr::__detail::direct_iterator( + 
dr::mp::local_segment(*cols_data_).begin()); + auto offset = rank * segment_size_; + auto real_segment_size = + std::min(nnz_ - rank * segment_size_, segment_size_); + auto local_data = rows_data_; + // dr::mp::sycl_queue().submit([&](auto& cgh) { + // cgh.parallel_for(sycl::range<1> { real_segment_size }, + // [=](auto idx) { + // auto colNum = localCols[idx]; + // auto matrixVal = vals[colNum]; + // auto vectorVal = localVals[idx]; + // auto row = + // rng::distance(std::upper_bound(local_data, + // local_data + row_size, offset + idx), local_data) - + // 1; + // *(res + row) += matrixVal * vectorVal; + // }); + // }).wait(); + auto one_computation_size = + (real_segment_size + max_row_size_ - 1) / max_row_size_; + auto row_size = row_size_; + dr::mp::sycl_queue() + .submit([&](auto &cgh) { + cgh.parallel_for(sycl::range<1>{max_row_size_}, [=](auto idx) { + std::size_t lower_bound = one_computation_size * idx; + std::size_t upper_bound = + std::min(one_computation_size * (idx + 1), real_segment_size); + std::size_t position = lower_bound + offset; + std::size_t first_row = rng::distance( + local_data, std::upper_bound( + local_data, local_data + row_size, position) - + 1); + auto row = first_row; + T sum = 0; + for (auto i = lower_bound; i < upper_bound; i++) { + while (row + 1 < row_size && + local_data[row + 1] <= offset + i) { + sycl::atomic_ref + c_ref(res[row]); + c_ref += sum; + row++; + sum = 0; + } + auto colNum = localCols[i]; + auto matrixVal = vals[colNum]; + auto vectorVal = localVals[i]; - template - auto local_gemv(C &res, A &vals) const { - auto rank = rows_backend_.getrank(); - if (nnz_ <= segment_size_ * rank) { - return; - } - auto size = row_sizes_[rank]; - if (dr::mp::use_sycl()) { - auto localVals = dr::__detail::direct_iterator(dr::mp::local_segment(*vals_data_).begin()); - auto localCols = dr::__detail::direct_iterator(dr::mp::local_segment(*cols_data_).begin()); - auto offset = rank * segment_size_; - auto real_segment_size = std::min(nnz_ 
- rank * segment_size_, segment_size_); - auto local_data = rows_data_; - // dr::mp::sycl_queue().submit([&](auto& cgh) { - // cgh.parallel_for(sycl::range<1> { real_segment_size }, - // [=](auto idx) { - // auto colNum = localCols[idx]; - // auto matrixVal = vals[colNum]; - // auto vectorVal = localVals[idx]; - // auto row = std::distance(std::upper_bound(local_data, local_data + row_size, offset + idx), local_data) - 1; - // *(res + row) += matrixVal * vectorVal; - // }); - // }).wait(); - auto one_computation_size = (real_segment_size + max_row_size_ - 1) / max_row_size_; - auto row_size = row_size_; - dr::mp::sycl_queue().submit([&](auto& cgh) { - cgh.parallel_for(sycl::range<1> { max_row_size_ }, - [=](auto idx) { - std::size_t lower_bound = one_computation_size * idx; - std::size_t upper_bound = std::min(one_computation_size * (idx + 1), real_segment_size); - std::size_t position = lower_bound + offset; - std::size_t first_row = std::distance(local_data, std::upper_bound(local_data, local_data + row_size, position) - 1); - auto row = first_row; - T sum = 0; - for (auto i = lower_bound; i < upper_bound; i++) { - while (row + 1 < row_size && local_data[row + 1] <= offset + i) { - sycl::atomic_ref - c_ref(res[row]); - c_ref += sum; - row++; - sum = 0; - } - auto colNum = localCols[i]; - auto matrixVal = vals[colNum]; - auto vectorVal = localVals[i]; - - sum += matrixVal * vectorVal; - } - sycl::atomic_ref - c_ref(res[row]); - c_ref += sum; - }); - }).wait(); - } - else { - auto row_i = -1; - auto position = segment_size_ * rank; - auto elem_count = std::min(segment_size_, nnz_ - segment_size_ * rank); - auto current_row_position = rows_data_[0]; - auto local_vals = dr::mp::local_segment(*vals_data_); - auto local_cols = dr::mp::local_segment(*cols_data_); + sum += matrixVal * vectorVal; + } + sycl::atomic_ref + c_ref(res[row]); + c_ref += sum; + }); + }) + .wait(); + } else { + auto row_i = -1; + auto position = segment_size_ * rank; + auto elem_count = 
std::min(segment_size_, nnz_ - segment_size_ * rank); + auto current_row_position = rows_data_[0]; + auto local_vals = dr::mp::local_segment(*vals_data_); + auto local_cols = dr::mp::local_segment(*cols_data_); - for (int i = 0; i < elem_count; i++) { - while (row_i + 1 < size && position + i >= current_row_position) { - row_i++; - current_row_position = rows_data_[row_i + 1]; - } - res[row_i] += local_vals[i] * vals[local_cols[i]]; + for (int i = 0; i < elem_count; i++) { + while (row_i + 1 < size && position + i >= current_row_position) { + row_i++; + current_row_position = rows_data_[row_i + 1]; } - - // fmt::print("offset, rank {} {}\n", row_offsets_[ rows_backend_.getrank()], rows_backend_.getrank()); - // for (int i = 0; i < size; i++) { - // fmt::print("ledata, rank, i {} {} {}\n", res[i], rows_backend_.getrank(), i); - // } + res[row_i] += local_vals[i] * vals[local_cols[i]]; } + + // fmt::print("offset, rank {} {}\n", row_offsets_[ + // rows_backend_.getrank()], rows_backend_.getrank()); for (int i = 0; i + // < size; i++) { + // fmt::print("ledata, rank, i {} {} {}\n", res[i], + // rows_backend_.getrank(), i); + // } } - - template - auto local_gemv_and_collect(std::size_t root, C &res, A &vals) const { - assert(res.size() == shape_.first); - __detail::allocator alloc; - auto res_alloc = alloc.allocate(max_row_size_); - for (auto i = 0; i < max_row_size_; i++) { - res_alloc[i] = 0; - } - // auto begin = std::chrono::high_resolution_clock::now(); - local_gemv(res_alloc, vals); - // auto end = std::chrono::high_resolution_clock::now(); - // double duration = std::chrono::duration(end - begin).count(); - // fmt::print("eq gemv time {}\n", duration * 1000); + } - gather_gemv_vector(root, res, res_alloc); - alloc.deallocate(res_alloc, max_row_size_); + template + auto local_gemv_and_collect(std::size_t root, C &res, A &vals) const { + assert(res.size() == shape_.first); + __detail::allocator alloc; + auto res_alloc = alloc.allocate(max_row_size_); + for (auto 
i = 0; i < max_row_size_; i++) { + res_alloc[i] = 0; } + // auto begin = std::chrono::high_resolution_clock::now(); + local_gemv(res_alloc, vals); + // auto end = std::chrono::high_resolution_clock::now(); + // double duration = std::chrono::duration(end - begin).count(); + // fmt::print("eq gemv time {}\n", duration * 1000); + + gather_gemv_vector(root, res, res_alloc); + alloc.deallocate(res_alloc, max_row_size_); + } + private: friend csr_eq_segment_iterator; - - template + + template void gather_gemv_vector(std::size_t root, C &res, A &partial_res) const { - auto communicator = default_comm(); - __detail::allocator alloc; - if (communicator.rank() == root) { - auto gathered_res = alloc.allocate(max_row_size_ * communicator.size()); - communicator.gather(partial_res, gathered_res, max_row_size_, root); - rng::fill(res, 0); - - // auto begin = std::chrono::high_resolution_clock::now(); - for (auto i = 0; i < communicator.size(); i++) { - auto first_row = row_offsets_[i]; - auto last_row = row_offsets_[i] + row_sizes_[i]; - for (auto j = first_row; j < last_row; j++) { - res[j] += gathered_res[max_row_size_ * i + j - first_row]; - } - } - // auto end = std::chrono::high_resolution_clock::now(); - // double duration = std::chrono::duration(end - begin).count(); - // fmt::print("gather time {}\n", duration); - alloc.deallocate(gathered_res, max_row_size_ * communicator.size()); - } - else { - communicator.gather(partial_res, static_cast(nullptr), max_row_size_, root); + auto communicator = default_comm(); + __detail::allocator alloc; + if (communicator.rank() == root) { + auto gathered_res = alloc.allocate(max_row_size_ * communicator.size()); + communicator.gather(partial_res, gathered_res, max_row_size_, root); + rng::fill(res, 0); + + // auto begin = std::chrono::high_resolution_clock::now(); + for (auto i = 0; i < communicator.size(); i++) { + auto first_row = row_offsets_[i]; + auto last_row = row_offsets_[i] + row_sizes_[i]; + for (auto j = first_row; j < 
last_row; j++) { + res[j] += gathered_res[max_row_size_ * i + j - first_row]; + } } + // auto end = std::chrono::high_resolution_clock::now(); + // double duration = std::chrono::duration(end - begin).count(); + // fmt::print("gather time {}\n", duration); + alloc.deallocate(gathered_res, max_row_size_ * communicator.size()); + } else { + communicator.gather(partial_res, static_cast(nullptr), max_row_size_, + root); + } } - std::size_t get_row_size(std::size_t rank) { - return row_sizes_[rank]; - } + std::size_t get_row_size(std::size_t rank) { return row_sizes_[rank]; } - void init(dr::views::csr_matrix_view csr_view, auto dist, std::size_t root) { + void init(dr::views::csr_matrix_view csr_view, auto dist, + std::size_t root) { distribution_ = dist; auto rank = rows_backend_.getrank(); @@ -188,8 +201,7 @@ class csr_eq_distribution { initial_data[1] = csr_view.shape().first; initial_data[2] = csr_view.shape().second; default_comm().bcast(initial_data, sizeof(std::size_t) * 3, root); - } - else { + } else { default_comm().bcast(initial_data, sizeof(std::size_t) * 3, root); } @@ -197,20 +209,34 @@ class csr_eq_distribution { shape_ = {initial_data[1], initial_data[2]}; vals_data_ = std::make_shared>(nnz_); cols_data_ = std::make_shared>(nnz_); - dr::mp::copy(root, std::ranges::subrange(csr_view.values_data(), csr_view.values_data() + nnz_), vals_data_->begin()); - dr::mp::copy(root, std::ranges::subrange(csr_view.colind_data(), csr_view.colind_data() + nnz_), cols_data_->begin()); + dr::mp::copy(root, + std::ranges::subrange(csr_view.values_data(), + csr_view.values_data() + nnz_), + vals_data_->begin()); + dr::mp::copy(root, + std::ranges::subrange(csr_view.colind_data(), + csr_view.colind_data() + nnz_), + cols_data_->begin()); auto row_info_size = default_comm().size() * 2 + 1; __detail::allocator alloc; - std::size_t* row_information = new std::size_t[row_info_size]; + std::size_t *row_information = new std::size_t[row_info_size]; 
row_offsets_.reserve(default_comm().size()); row_sizes_.reserve(default_comm().size()); if (root == default_comm().rank()) { for (int i = 0; i < default_comm().size(); i++) { auto first_index = vals_data_->get_segment_offset(i); auto last_index = vals_data_->get_segment_offset(i + 1) - 1; - auto lower_limit = std::distance(csr_view.rowptr_data(), std::upper_bound(csr_view.rowptr_data(), csr_view.rowptr_data() + shape_[0], first_index)) - 1; - auto higher_limit = std::distance(csr_view.rowptr_data(), std::upper_bound(csr_view.rowptr_data(), csr_view.rowptr_data() + shape_[0], last_index)); + auto lower_limit = + rng::distance(csr_view.rowptr_data(), + std::upper_bound(csr_view.rowptr_data(), + csr_view.rowptr_data() + shape_[0], + first_index)) - + 1; + auto higher_limit = rng::distance( + csr_view.rowptr_data(), + std::upper_bound(csr_view.rowptr_data(), + csr_view.rowptr_data() + shape_[0], last_index)); row_offsets_.push_back(lower_limit); row_sizes_.push_back(higher_limit - lower_limit); row_information[i] = lower_limit; @@ -218,10 +244,11 @@ class csr_eq_distribution { max_row_size_ = std::max(max_row_size_, row_sizes_.back()); } row_information[default_comm().size() * 2] = max_row_size_; - default_comm().bcast(row_information, sizeof(std::size_t) * row_info_size, root); - } - else { - default_comm().bcast(row_information, sizeof(std::size_t) * row_info_size, root); + default_comm().bcast(row_information, sizeof(std::size_t) * row_info_size, + root); + } else { + default_comm().bcast(row_information, sizeof(std::size_t) * row_info_size, + root); for (int i = 0; i < default_comm().size(); i++) { row_offsets_.push_back(row_information[i]); row_sizes_.push_back(row_information[default_comm().size() + i]); @@ -230,7 +257,8 @@ class csr_eq_distribution { } delete[] row_information; row_size_ = std::max(row_sizes_[rank], static_cast(1)); - rows_data_ = static_cast(rows_backend_.allocate(row_size_ * sizeof(I))); + rows_data_ = + 
static_cast(rows_backend_.allocate(row_size_ * sizeof(I))); fence(); if (rank == root) { @@ -238,7 +266,8 @@ class csr_eq_distribution { auto lower_limit = row_offsets_[i]; auto row_size = row_sizes_[i]; if (row_size > 0) { - rows_backend_.putmem(csr_view.rowptr_data() + lower_limit, 0, row_size * sizeof(I), i); + rows_backend_.putmem(csr_view.rowptr_data() + lower_limit, 0, + row_size * sizeof(I), i); } } } @@ -256,19 +285,18 @@ class csr_eq_distribution { // } // fence(); // for (int i = 0; i < vals_data_->segments()[rank].size(); i++) { - // fmt::print("val, col, i, rank {} {} {} {}\n", vals_data_->segments()[rank][i], cols_data_->segments()[rank][i],i, rank); + // fmt::print("val, col, i, rank {} {} {} {}\n", + // vals_data_->segments()[rank][i], cols_data_->segments()[rank][i],i, + // rank); // } - } - std::size_t segment_size_ = 0; std::size_t row_size_ = 0; std::size_t max_row_size_ = 0; std::vector row_offsets_; std::vector row_sizes_; - index_type *rows_data_ = nullptr; BackendT rows_backend_; @@ -279,4 +307,4 @@ class csr_eq_distribution { std::shared_ptr> vals_data_; std::shared_ptr> cols_data_; }; -} \ No newline at end of file +} // namespace dr::mp diff --git a/include/dr/mp/containers/matrix_formats/csr_eq_segment.hpp b/include/dr/mp/containers/matrix_formats/csr_eq_segment.hpp index af62dee919..f08c1dc4ef 100644 --- a/include/dr/mp/containers/matrix_formats/csr_eq_segment.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_eq_segment.hpp @@ -13,8 +13,8 @@ template class csr_eq_segment_reference { public: using value_type = typename DSM::value_type; - using index_type = typename DSM::index_type; - using elem_type = typename DSM::elem_type; + using index_type = typename DSM::index_type; + using elem_type = typename DSM::elem_type; csr_eq_segment_reference(const iterator it) : iterator_(it) {} @@ -49,12 +49,13 @@ template class csr_eq_segment_reference { template class csr_eq_segment_iterator { public: using value_type = typename DSM::value_type; - 
using index_type = typename DSM::index_type; - using elem_type = typename DSM::elem_type; + using index_type = typename DSM::index_type; + using elem_type = typename DSM::elem_type; using difference_type = typename DSM::difference_type; csr_eq_segment_iterator() = default; - csr_eq_segment_iterator(DSM *dsm, std::size_t segment_index, std::size_t index) { + csr_eq_segment_iterator(DSM *dsm, std::size_t segment_index, + std::size_t index) { dsm_ = dsm; segment_index_ = segment_index; index_ = index; @@ -85,7 +86,8 @@ template class csr_eq_segment_iterator { auto &operator-=(difference_type n) { return *this += (-n); } - difference_type operator-(const csr_eq_segment_iterator &other) const noexcept { + difference_type + operator-(const csr_eq_segment_iterator &other) const noexcept { assert(dsm_ != nullptr && dsm_ == other.dsm_); assert(index_ >= other.index_); return index_ - other.index_; @@ -125,7 +127,8 @@ template class csr_eq_segment_iterator { } // When *this is not first in the expression - friend auto operator+(difference_type n, const csr_eq_segment_iterator &other) { + friend auto operator+(difference_type n, + const csr_eq_segment_iterator &other) { return other + n; } @@ -158,9 +161,10 @@ template class csr_eq_segment_iterator { void get_value(elem_type *dst, std::size_t size) const { assert(dsm_ != nullptr); assert(segment_index_ * dsm_->segment_size_ + index_ < dsm_->nnz_); - (dsm_->vals_data_->segments()[segment_index_].begin() + index_).get(dst, size); + (dsm_->vals_data_->segments()[segment_index_].begin() + index_) + .get(dst, size); } - + elem_type get_value() const { elem_type val; get_value(&val, 1); @@ -171,25 +175,27 @@ template class csr_eq_segment_iterator { assert(dsm_ != nullptr); assert(segment_index_ * dsm_->segment_size_ + index_ < dsm_->nnz_); auto col_data = new index_type[size]; - (dsm_->cols_data_->segments()[segment_index_].begin() + index_).get(col_data, size); + (dsm_->cols_data_->segments()[segment_index_].begin() + index_) + 
.get(col_data, size); index_type *rows; std::size_t rows_length = dsm_->get_row_size(segment_index_); - + if (rank() == dsm_->rows_backend_.getrank()) { rows = dsm_->rows_data_; - } - else { + } else { rows = new index_type[rows_length]; - dsm_->rows_backend_.getmem(rows, 0, rows_length * sizeof(index_type), segment_index_); + dsm_->rows_backend_.getmem(rows, 0, rows_length * sizeof(index_type), + segment_index_); } - auto position = dsm_->cols_data_->get_segment_offset(segment_index_) + index_; + auto position = + dsm_->cols_data_->get_segment_offset(segment_index_) + index_; auto rows_iter = rows + 1; auto cols_iter = col_data; auto iter = dst; std::size_t current_row = dsm_->row_offsets_[segment_index_]; std::size_t last_row = current_row + rows_length - 1; for (int i = 0; i < size; i++) { - while (current_row < last_row && *rows_iter <= position + i ) { + while (current_row < last_row && *rows_iter <= position + i) { rows_iter++; current_row++; } @@ -202,7 +208,6 @@ template class csr_eq_segment_iterator { delete[] rows; } delete[] col_data; - } dr::index get_index() const { @@ -214,7 +219,8 @@ template class csr_eq_segment_iterator { void put(const value_type *dst, std::size_t size) const { assert(dsm_ != nullptr); assert(segment_index_ * dsm_->segment_size_ + index_ < dsm_->nnz_); - (dsm_->vals_data_->segments()[segment_index_].begin() + index_).put(dst, size); + (dsm_->vals_data_->segments()[segment_index_].begin() + index_) + .put(dst, size); } void put(const value_type &value) const { put(&value, 1); } @@ -226,7 +232,8 @@ template class csr_eq_segment_iterator { auto segments() const { assert(dsm_ != nullptr); - return dr::__detail::drop_segments(dsm_->segments(), segment_index_, index_); + return dr::__detail::drop_segments(dsm_->segments(), segment_index_, + index_); } private: @@ -245,7 +252,7 @@ template class csr_eq_segment { using difference_type = std::ptrdiff_t; csr_eq_segment() = default; csr_eq_segment(DSM *dsm, std::size_t segment_index, 
std::size_t size, - std::size_t reserved) { + std::size_t reserved) { dsm_ = dsm; segment_index_ = segment_index; size_ = size; @@ -276,11 +283,13 @@ template class csr_eq_segment { } // namespace dr::mp namespace std { - template - struct tuple_size> : std::integral_constant {}; - - template - struct tuple_element> - : tuple_element, typename DSM::elem_type>> {}; +template +struct tuple_size> + : std::integral_constant {}; + +template +struct tuple_element> + : tuple_element, + typename DSM::elem_type>> {}; } // namespace std diff --git a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp index 2baa48d740..209ceafe19 100644 --- a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp @@ -2,144 +2,151 @@ // // SPDX-License-Identifier: BSD-3-Clause #pragma once -#include #include +#include #include #include namespace dr::mp { -template +template class csr_row_distribution { public: - using value_type = dr::matrix_entry; - using elem_type = T; - using index_type = I; - using difference_type = std::ptrdiff_t; + using value_type = dr::matrix_entry; + using elem_type = T; + using index_type = I; + using difference_type = std::ptrdiff_t; - csr_row_distribution(const csr_row_distribution &) = delete; - csr_row_distribution &operator=(const csr_row_distribution &) = delete; - csr_row_distribution(csr_row_distribution &&) { assert(false); } + csr_row_distribution(const csr_row_distribution &) = delete; + csr_row_distribution &operator=(const csr_row_distribution &) = delete; + csr_row_distribution(csr_row_distribution &&) { assert(false); } - /// Constructor - csr_row_distribution(dr::views::csr_matrix_view csr_view, distribution dist = distribution(), std::size_t root = 0) { - init(csr_view, dist, root); - } + /// Constructor + csr_row_distribution(dr::views::csr_matrix_view csr_view, + distribution dist = 
distribution(), + std::size_t root = 0) { + init(csr_view, dist, root); + } - ~csr_row_distribution() { - if (!finalized()) { - fence(); - if (vals_data_ != nullptr) { - vals_backend_.deallocate(vals_data_, vals_size_ * sizeof(index_type)); - cols_backend_.deallocate(cols_data_, vals_size_ * sizeof(index_type)); - } + ~csr_row_distribution() { + if (!finalized()) { + fence(); + if (vals_data_ != nullptr) { + vals_backend_.deallocate(vals_data_, vals_size_ * sizeof(index_type)); + cols_backend_.deallocate(cols_data_, vals_size_ * sizeof(index_type)); + } - // delete halo_; TODO - } - } - std::size_t get_id_in_segment(std::size_t offset) const { - assert(offset < nnz_); - auto pos_iter = std::upper_bound(val_offsets_.begin(), val_offsets_.end(), offset) - 1; - return offset - *pos_iter; + // delete halo_; TODO } - std::size_t get_segment_from_offset(std::size_t offset) const { - assert(offset < nnz_); - auto pos_iter = std::upper_bound(val_offsets_.begin(), val_offsets_.end(), offset); - return rng::distance(val_offsets_.begin(), pos_iter) - 1; - } - auto segments() const { return rng::views::all(segments_); } - auto nnz() const {return nnz_;} - auto shape() const {return shape_;} - void fence() { - vals_backend_.fence(); - cols_backend_.fence(); - } -template - auto local_gemv(C &res, A &vals) const { - auto rank = cols_backend_.getrank(); - if (shape_[0] <= segment_size_ * rank) return; - auto size = std::min(segment_size_, shape_[0] - segment_size_ * rank); - if (dr::mp::use_sycl()) { - auto local_vals = vals_data_; - auto local_cols = cols_data_; - auto offset = val_offsets_[rank]; - auto real_segment_size = std::min(nnz_ - offset, val_sizes_[rank]); - auto rows_data = dr::__detail::direct_iterator(dr::mp::local_segment(*rows_data_).begin()); - dr::mp::sycl_queue().submit([&](auto& cgh) { - cgh.parallel_for(sycl::range<1> { size }, - [=](auto idx) { - std::size_t lower_bound = 0; - T sum = 0; - if (rows_data[idx] > offset) { - lower_bound = rows_data[idx] - 
offset; - } - std::size_t upper_bound = real_segment_size; - if (idx < size - 1) { - upper_bound = rows_data[idx + 1] - offset; - } - for (auto i = lower_bound; i < upper_bound; i++) { - auto colNum = local_cols[i]; - auto matrixVal = vals[colNum]; - auto vectorVal = local_vals[i]; - sum += matrixVal * vectorVal; - } - *(res + idx) += sum; - }); - }).wait(); - } - else { - auto local_rows = dr::mp::local_segment(*rows_data_); - auto val_count = val_sizes_[rank]; - auto row_i = 0; - auto position = val_offsets_[rank]; - auto current_row_position = local_rows[1]; + } + std::size_t get_id_in_segment(std::size_t offset) const { + assert(offset < nnz_); + auto pos_iter = + std::upper_bound(val_offsets_.begin(), val_offsets_.end(), offset) - 1; + return offset - *pos_iter; + } + std::size_t get_segment_from_offset(std::size_t offset) const { + assert(offset < nnz_); + auto pos_iter = + std::upper_bound(val_offsets_.begin(), val_offsets_.end(), offset); + return rng::distance(val_offsets_.begin(), pos_iter) - 1; + } + auto segments() const { return rng::views::all(segments_); } + auto nnz() const { return nnz_; } + auto shape() const { return shape_; } + void fence() { + vals_backend_.fence(); + cols_backend_.fence(); + } + template auto local_gemv(C &res, A &vals) const { + auto rank = cols_backend_.getrank(); + if (shape_[0] <= segment_size_ * rank) + return; + auto size = std::min(segment_size_, shape_[0] - segment_size_ * rank); + if (dr::mp::use_sycl()) { + auto local_vals = vals_data_; + auto local_cols = cols_data_; + auto offset = val_offsets_[rank]; + auto real_segment_size = std::min(nnz_ - offset, val_sizes_[rank]); + auto rows_data = dr::__detail::direct_iterator( + dr::mp::local_segment(*rows_data_).begin()); + dr::mp::sycl_queue() + .submit([&](auto &cgh) { + cgh.parallel_for(sycl::range<1>{size}, [=](auto idx) { + std::size_t lower_bound = 0; + T sum = 0; + if (rows_data[idx] > offset) { + lower_bound = rows_data[idx] - offset; + } + std::size_t upper_bound 
= real_segment_size; + if (idx < size - 1) { + upper_bound = rows_data[idx + 1] - offset; + } + for (auto i = lower_bound; i < upper_bound; i++) { + auto colNum = local_cols[i]; + auto matrixVal = vals[colNum]; + auto vectorVal = local_vals[i]; + sum += matrixVal * vectorVal; + } + *(res + idx) += sum; + }); + }) + .wait(); + } else { + auto local_rows = dr::mp::local_segment(*rows_data_); + auto val_count = val_sizes_[rank]; + auto row_i = 0; + auto position = val_offsets_[rank]; + auto current_row_position = local_rows[1]; - for (int i = 0; i < val_count; i++) { - while (row_i + 1 < size && position + i >= current_row_position) { - row_i++; - current_row_position = local_rows[row_i + 1]; - } - res[row_i] += vals_data_[i] * vals[cols_data_[i]]; + for (int i = 0; i < val_count; i++) { + while (row_i + 1 < size && position + i >= current_row_position) { + row_i++; + current_row_position = local_rows[row_i + 1]; } + res[row_i] += vals_data_[i] * vals[cols_data_[i]]; } } - - template - auto local_gemv_and_collect(std::size_t root, C &res, A &vals) const { - assert(res.size() == shape_.first); - __detail::allocator alloc; - auto res_alloc = alloc.allocate(segment_size_); - for (auto i = 0; i < segment_size_; i++) { - res_alloc[i] = 0; - } - - // auto begin = std::chrono::high_resolution_clock::now(); - local_gemv(res_alloc, vals); - // auto end = std::chrono::high_resolution_clock::now(); - // double duration = std::chrono::duration(end - begin).count(); - // fmt::print("rows gemv time {}\n", duration * 1000); + } - gather_gemv_vector(root, res, res_alloc); - alloc.deallocate(res_alloc, segment_size_); + template + auto local_gemv_and_collect(std::size_t root, C &res, A &vals) const { + assert(res.size() == shape_.first); + __detail::allocator alloc; + auto res_alloc = alloc.allocate(segment_size_); + for (auto i = 0; i < segment_size_; i++) { + res_alloc[i] = 0; } + + // auto begin = std::chrono::high_resolution_clock::now(); + local_gemv(res_alloc, vals); + // auto 
end = std::chrono::high_resolution_clock::now(); + // double duration = std::chrono::duration(end - begin).count(); + // fmt::print("rows gemv time {}\n", duration * 1000); + + gather_gemv_vector(root, res, res_alloc); + alloc.deallocate(res_alloc, segment_size_); + } + private: friend csr_row_segment_iterator; - - template + + template void gather_gemv_vector(std::size_t root, C &res, A &partial_res) const { - auto communicator = default_comm(); - __detail::allocator alloc; - if (communicator.rank() == root) { - auto scratch = alloc.allocate(segment_size_ * default_comm().size()); - communicator.gather(partial_res, scratch, segment_size_, root); - std::copy(scratch, scratch + shape_.first, res.begin()); - alloc.deallocate(scratch, segment_size_ * communicator.size()); - } - else { - communicator.gather(partial_res, static_cast(nullptr), segment_size_, root); - } + auto communicator = default_comm(); + __detail::allocator alloc; + if (communicator.rank() == root) { + auto scratch = alloc.allocate(segment_size_ * default_comm().size()); + communicator.gather(partial_res, scratch, segment_size_, root); + std::copy(scratch, scratch + shape_.first, res.begin()); + alloc.deallocate(scratch, segment_size_ * communicator.size()); + } else { + communicator.gather(partial_res, static_cast(nullptr), segment_size_, + root); + } } - void init(dr::views::csr_matrix_view csr_view, auto dist, std::size_t root) { + void init(dr::views::csr_matrix_view csr_view, auto dist, + std::size_t root) { distribution_ = dist; auto rank = vals_backend_.getrank(); @@ -149,8 +156,7 @@ template initial_data[1] = csr_view.shape().first; initial_data[2] = csr_view.shape().second; default_comm().bcast(initial_data, sizeof(std::size_t) * 3, root); - } - else { + } else { default_comm().bcast(initial_data, sizeof(std::size_t) * 3, root); } @@ -159,10 +165,13 @@ template rows_data_ = std::make_shared>(shape_.first); - dr::mp::copy(root, std::ranges::subrange(csr_view.rowptr_data(), 
csr_view.rowptr_data() + shape_.first), rows_data_->begin()); - + dr::mp::copy(root, + std::ranges::subrange(csr_view.rowptr_data(), + csr_view.rowptr_data() + shape_.first), + rows_data_->begin()); + auto row_info_size = default_comm().size() * 2; - std::size_t* val_information = new std::size_t[row_info_size]; + std::size_t *val_information = new std::size_t[row_info_size]; val_offsets_.reserve(row_info_size); val_sizes_.reserve(row_info_size); if (rank == root) { @@ -184,10 +193,11 @@ template val_information[i] = lower_limit; val_information[i + default_comm().size()] = higher_limit - lower_limit; } - default_comm().bcast(val_information, sizeof(std::size_t) * row_info_size, root); - } - else { - default_comm().bcast(val_information, sizeof(std::size_t) * row_info_size, root); + default_comm().bcast(val_information, sizeof(std::size_t) * row_info_size, + root); + } else { + default_comm().bcast(val_information, sizeof(std::size_t) * row_info_size, + root); for (int i = 0; i < default_comm().size(); i++) { val_offsets_.push_back(val_information[i]); val_sizes_.push_back(val_information[default_comm().size() + i]); @@ -195,10 +205,13 @@ template } delete[] val_information; vals_size_ = std::max(val_sizes_[rank], static_cast(1)); - // fmt::print("dfsa {} {} {} {}\n", vals_size_, val_sizes_[rank],lower_limit, rank); - - cols_data_ = static_cast(cols_backend_.allocate(vals_size_ * sizeof(I))); - vals_data_ = static_cast(vals_backend_.allocate(vals_size_ * sizeof(T))); + // fmt::print("dfsa {} {} {} {}\n", vals_size_, + // val_sizes_[rank],lower_limit, rank); + + cols_data_ = + static_cast(cols_backend_.allocate(vals_size_ * sizeof(I))); + vals_data_ = + static_cast(vals_backend_.allocate(vals_size_ * sizeof(T))); fence(); if (rank == root) { @@ -206,8 +219,10 @@ template auto lower_limit = val_offsets_[i]; auto row_size = val_sizes_[i]; if (row_size > 0) { - vals_backend_.putmem(csr_view.values_data() + lower_limit, 0, row_size * sizeof(T), i); - 
cols_backend_.putmem(csr_view.colind_data() + lower_limit, 0, row_size * sizeof(I), i); + vals_backend_.putmem(csr_view.values_data() + lower_limit, 0, + row_size * sizeof(T), i); + cols_backend_.putmem(csr_view.colind_data() + lower_limit, 0, + row_size * sizeof(I), i); } } } @@ -215,8 +230,11 @@ template std::size_t segment_index = 0; segment_size_ = rows_data_->segment_size(); for (std::size_t i = 0; i < default_comm().size(); i++) { - //TODO fix segment creation, to include proper sizes, basing on val_offsets; - segments_.emplace_back(this, segment_index++, val_sizes_[i], std::max(val_sizes_[i], static_cast(1))); + // TODO fix segment creation, to include proper sizes, basing on + // val_offsets; + segments_.emplace_back( + this, segment_index++, val_sizes_[i], + std::max(val_sizes_[i], static_cast(1))); } // if (rank == 0) { // int ax = 0; @@ -227,29 +245,30 @@ template // fmt::print("{} {}\n", i, get_segment_from_offset(i)); // } // } - // fmt::print(" {} {} {} {}\n",get_segment_from_offset(47), get_segment_from_offset(48), get_segment_from_offset(49), get_segment_from_offset(50)); - // for (int i = 0; i < vals_size_; i++) { - // fmt::print("col, val, i, rank {} {} {} {}\n", cols_data_[i], vals_data_[i], i, rank); + // fmt::print(" {} {} {} {}\n",get_segment_from_offset(47), + // get_segment_from_offset(48), get_segment_from_offset(49), + // get_segment_from_offset(50)); for (int i = 0; i < vals_size_; i++) { + // fmt::print("col, val, i, rank {} {} {} {}\n", cols_data_[i], + // vals_data_[i], i, rank); // } // fence(); // if (rank < rows_data_->segments().size()) { // for (int i = 0; i < rows_data_->segments()[rank].size(); i++) { - // fmt::print("row, i, rank {} {} {}\n", rows_data_->segments()[rank][i], i, rank); + // fmt::print("row, i, rank {} {} {}\n", + // rows_data_->segments()[rank][i], i, rank); // } // } fence(); } - std::size_t segment_size_ = 0; std::size_t vals_size_ = 0; std::vector val_offsets_; std::vector val_sizes_; - index_type *cols_data_ 
= nullptr; BackendT cols_backend_; - + elem_type *vals_data_ = nullptr; BackendT vals_backend_; @@ -259,4 +278,4 @@ template std::vector> segments_; std::shared_ptr> rows_data_; }; -} \ No newline at end of file +} // namespace dr::mp diff --git a/include/dr/mp/containers/matrix_formats/csr_row_segment.hpp b/include/dr/mp/containers/matrix_formats/csr_row_segment.hpp index f28abd23d3..f706898667 100644 --- a/include/dr/mp/containers/matrix_formats/csr_row_segment.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_row_segment.hpp @@ -13,8 +13,8 @@ template class csr_row_segment_reference { public: using value_type = typename DSM::value_type; - using index_type = typename DSM::index_type; - using elem_type = typename DSM::elem_type; + using index_type = typename DSM::index_type; + using elem_type = typename DSM::elem_type; csr_row_segment_reference(const iterator it) : iterator_(it) {} @@ -45,12 +45,13 @@ template class csr_row_segment_reference { template class csr_row_segment_iterator { public: using value_type = typename DSM::value_type; - using index_type = typename DSM::index_type; - using elem_type = typename DSM::elem_type; + using index_type = typename DSM::index_type; + using elem_type = typename DSM::elem_type; using difference_type = typename DSM::difference_type; csr_row_segment_iterator() = default; - csr_row_segment_iterator(DSM *dsm, std::size_t segment_index, std::size_t index) { + csr_row_segment_iterator(DSM *dsm, std::size_t segment_index, + std::size_t index) { dsm_ = dsm; segment_index_ = segment_index; index_ = index; @@ -81,7 +82,8 @@ template class csr_row_segment_iterator { auto &operator-=(difference_type n) { return *this += (-n); } - difference_type operator-(const csr_row_segment_iterator &other) const noexcept { + difference_type + operator-(const csr_row_segment_iterator &other) const noexcept { assert(dsm_ != nullptr && dsm_ == other.dsm_); assert(index_ >= other.index_); return index_ - other.index_; @@ -121,7 +123,8 @@ template 
class csr_row_segment_iterator { } // When *this is not first in the expression - friend auto operator+(difference_type n, const csr_row_segment_iterator &other) { + friend auto operator+(difference_type n, + const csr_row_segment_iterator &other) { return other + n; } @@ -154,9 +157,10 @@ template class csr_row_segment_iterator { void get_value(elem_type *dst, std::size_t size) const { assert(dsm_ != nullptr); assert(segment_index_ * dsm_->segment_size_ + index_ < dsm_->nnz_); - dsm_->vals_backend_.getmem(dst, index_ * sizeof(elem_type), size * sizeof(elem_type), segment_index_); + dsm_->vals_backend_.getmem(dst, index_ * sizeof(elem_type), + size * sizeof(elem_type), segment_index_); } - + elem_type get_value() const { elem_type val; get_value(&val, 1); @@ -169,25 +173,27 @@ template class csr_row_segment_iterator { index_type *col_data; if (rank() == dsm_->cols_backend_.getrank()) { col_data = dsm_->cols_data_ + index_; - } - else { + } else { col_data = new index_type[size]; - dsm_->cols_backend_.getmem(col_data, index_ * sizeof(index_type), size * sizeof(index_type), segment_index_); + dsm_->cols_backend_.getmem(col_data, index_ * sizeof(index_type), + size * sizeof(index_type), segment_index_); } index_type *rows; std::size_t rows_length = dsm_->segment_size_; rows = new index_type[rows_length]; - (dsm_->rows_data_->segments()[segment_index_].begin()).get(rows, rows_length); - + (dsm_->rows_data_->segments()[segment_index_].begin()) + .get(rows, rows_length); + auto position = dsm_->val_offsets_[segment_index_] + index_; auto rows_iter = rows + 1; index_type *cols_iter = col_data; auto iter = dst; std::size_t current_row = dsm_->segment_size_ * segment_index_; - std::size_t last_row = std::min(current_row + rows_length - 1, dsm_->shape_[0] - 1); + std::size_t last_row = + std::min(current_row + rows_length - 1, dsm_->shape_[0] - 1); for (int i = 0; i < size; i++) { - while (current_row < last_row && *rows_iter <= position + i ) { + while (current_row < 
last_row && *rows_iter <= position + i) { rows_iter++; current_row++; } @@ -200,7 +206,6 @@ template class csr_row_segment_iterator { delete[] col_data; } delete[] rows; - } dr::index get_index() const { @@ -216,7 +221,8 @@ template class csr_row_segment_iterator { auto segments() const { assert(dsm_ != nullptr); - return dr::__detail::drop_segments(dsm_->segments(), segment_index_, index_); + return dr::__detail::drop_segments(dsm_->segments(), segment_index_, + index_); } private: @@ -235,7 +241,7 @@ template class csr_row_segment { using difference_type = std::ptrdiff_t; csr_row_segment() = default; csr_row_segment(DSM *dsm, std::size_t segment_index, std::size_t size, - std::size_t reserved) { + std::size_t reserved) { dsm_ = dsm; segment_index_ = segment_index; size_ = size; @@ -266,11 +272,13 @@ template class csr_row_segment { } // namespace dr::mp namespace std { - template - struct tuple_size> : std::integral_constant {}; - - template - struct tuple_element> - : tuple_element, typename DSM::elem_type>> {}; +template +struct tuple_size> + : std::integral_constant {}; + +template +struct tuple_element> + : tuple_element, + typename DSM::elem_type>> {}; } // namespace std diff --git a/include/dr/sp/algorithms/matrix/local_gemv.hpp b/include/dr/sp/algorithms/matrix/local_gemv.hpp index f8e272bbbd..7c3c32baa5 100644 --- a/include/dr/sp/algorithms/matrix/local_gemv.hpp +++ b/include/dr/sp/algorithms/matrix/local_gemv.hpp @@ -19,8 +19,9 @@ namespace __detail { template requires(std::is_same_v, T>) -auto custom_gemv(sycl::queue &q, dr::views::csr_matrix_view a, Iter b, - Iter c, const std::vector &dependencies = {}) { +auto custom_gemv(sycl::queue &q, dr::views::csr_matrix_view a, + Iter b, Iter c, + const std::vector &dependencies = {}) { std::size_t wg = 32; auto event = q.submit([&](auto &&h) { @@ -55,7 +56,8 @@ auto custom_gemv(sycl::queue &q, dr::views::csr_matrix_view a, It template requires(std::is_same_v, T>) -auto mkl_gemv(sycl::queue &q, 
dr::views::csr_matrix_view a, Iter b, Iter c, +auto mkl_gemv(sycl::queue &q, dr::views::csr_matrix_view a, + Iter b, Iter c, const std::vector &dependencies = {}) { oneapi::mkl::sparse::matrix_handle_t a_handle; @@ -78,8 +80,9 @@ auto mkl_gemv(sycl::queue &q, dr::views::csr_matrix_view a, Iter template requires(std::is_same_v, T>) -auto local_gemv(sycl::queue &q, dr::views::csr_matrix_view a, Iter b, - Iter c, const std::vector &dependencies = {}) { +auto local_gemv(sycl::queue &q, dr::views::csr_matrix_view a, + Iter b, Iter c, + const std::vector &dependencies = {}) { return mkl_gemv(q, a, b, c, dependencies); } @@ -88,8 +91,9 @@ auto local_gemv(sycl::queue &q, dr::views::csr_matrix_view a, Ite template requires(std::is_same_v, T>) -auto local_gemv(sycl::queue &q, dr::views::csr_matrix_view a, Iter b, - Iter c, const std::vector &dependencies = {}) { +auto local_gemv(sycl::queue &q, dr::views::csr_matrix_view a, + Iter b, Iter c, + const std::vector &dependencies = {}) { return custom_gemv(q, a, b, c, dependencies); } diff --git a/include/dr/sp/containers/distributed_dense_matrix.hpp b/include/dr/sp/containers/distributed_dense_matrix.hpp index e316f2dad0..0c53534cbf 100644 --- a/include/dr/sp/containers/distributed_dense_matrix.hpp +++ b/include/dr/sp/containers/distributed_dense_matrix.hpp @@ -7,8 +7,8 @@ #include #include -#include #include +#include #include #include #include diff --git a/include/dr/sp/containers/sparse_matrix.hpp b/include/dr/sp/containers/sparse_matrix.hpp index c9bfe90ee9..921a3aa8a9 100644 --- a/include/dr/sp/containers/sparse_matrix.hpp +++ b/include/dr/sp/containers/sparse_matrix.hpp @@ -4,15 +4,15 @@ #pragma once +#include #include -#include #include +#include #include #include #include #include #include -#include #include namespace dr::sp { diff --git a/include/dr/sp/util/matrix_io.hpp b/include/dr/sp/util/matrix_io.hpp index 9cbb6b91eb..753ce1db36 100644 --- a/include/dr/sp/util/matrix_io.hpp +++ b/include/dr/sp/util/matrix_io.hpp @@ 
-12,8 +12,8 @@ #include #include -#include #include +#include namespace dr::sp { @@ -58,12 +58,11 @@ auto create_distributed(dr::views::csr_matrix_view local_mat, return a; } - template auto create_distributed(dr::views::csr_matrix_view local_mat) { - return create_distributed(local_mat, - dr::sp::block_cyclic({dr::sp::tile::div, dr::sp::tile::div}, - {dr::sp::nprocs(), 1})); + return create_distributed( + local_mat, dr::sp::block_cyclic({dr::sp::tile::div, dr::sp::tile::div}, + {dr::sp::nprocs(), 1})); } template diff --git a/include/dr/views/csr_matrix_view.hpp b/include/dr/views/csr_matrix_view.hpp index 65259bdf76..bd434ae03f 100644 --- a/include/dr/views/csr_matrix_view.hpp +++ b/include/dr/views/csr_matrix_view.hpp @@ -222,4 +222,4 @@ csr_matrix_view(TIter, IIter, IIter, Args &&...) -> csr_matrix_view, std::iter_value_t, TIter, IIter>; -} // namespace dr::view +} // namespace dr::views From bb2e02ee2629daece0c9645804b6801659f88bb1 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Tue, 17 Sep 2024 13:12:10 +0200 Subject: [PATCH 14/68] Add sparse benchmark and broadcasted vector --- examples/mp/CMakeLists.txt | 1 + examples/mp/sparse_benchmark.cpp | 125 ++++++++++++++++++ examples/mp/sparse_matrix.cpp | 88 ++++++------ include/dr/detail/generate_random_csr.hpp | 68 +++++++++- include/dr/detail/matrix_io.hpp | 72 +++++++--- include/dr/mp.hpp | 1 + include/dr/mp/algorithms/matrix/gemv.hpp | 27 ++-- .../dr/mp/containers/broadcasted_vector.hpp | 42 ++++++ .../containers/distributed_sparse_matrix.hpp | 6 +- .../matrix_formats/csr_eq_distribution.hpp | 13 +- .../matrix_formats/csr_row_distribution.hpp | 9 +- 11 files changed, 360 insertions(+), 92 deletions(-) create mode 100644 examples/mp/sparse_benchmark.cpp create mode 100644 include/dr/mp/containers/broadcasted_vector.hpp diff --git a/examples/mp/CMakeLists.txt b/examples/mp/CMakeLists.txt index f97a527bbb..c4af8c905c 100644 --- a/examples/mp/CMakeLists.txt +++ b/examples/mp/CMakeLists.txt @@ -27,6 +27,7 @@ 
add_mp_example(stencil-1d-array) add_mp_example(stencil-1d-pointer) add_mp_example(hello_world) add_mp_example(sparse_matrix) +add_mp_example(sparse_benchmark) if(OpenMP_FOUND) add_executable(vector-add-ref vector-add-ref.cpp) diff --git a/examples/mp/sparse_benchmark.cpp b/examples/mp/sparse_benchmark.cpp new file mode 100644 index 0000000000..2e49873619 --- /dev/null +++ b/examples/mp/sparse_benchmark.cpp @@ -0,0 +1,125 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#include +#include +#include +#include +#include +#include + +namespace mp = dr::mp; + +MPI_Comm comm; +int comm_rank; +int comm_size; + +int main(int argc, char **argv) { + + MPI_Init(&argc, &argv); + comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm, &comm_rank); + MPI_Comm_size(comm, &comm_size); + + if (argc != 3 && argc != 5) { + fmt::print("usage: ./sparse_benchmark [test outcome dir] [matrix market file], or ./sparse_benchmark [test outcome dir] [number of rows] [number of columns] [density]\n"); + return 1; + } + +#ifdef SYCL_LANGUAGE_VERSION + sycl::queue q = dr::mp::select_queue(); + mp::init(q); +#else + mp::init(); +#endif + dr::views::csr_matrix_view local_data; + std::stringstream filenamestream; + auto root = 0; + auto computeSize = dr::mp::default_comm().size(); + if (root == dr::mp::default_comm().rank()) { + if (argc == 5) { + fmt::print("started loading\n"); + auto n = std::stoul(argv[2]); + auto up = std::stoul(argv[3]); + auto down = std::stoul(argv[4]); + // local_data = dr::generate_random_csr({n, m}, density, 42); + local_data = dr::generate_band_csr(n, up, down); + filenamestream << "mp_band_" << computeSize << "_" << n << "_" << up + down << "_" << local_data.size(); + fmt::print("finished loading\n"); + } + else { + fmt::print("started loading\n"); + std::string fname(argv[2]); + std::filesystem::path p(argv[2]); + local_data = dr::read_csr(fname); + filenamestream << "mp_" << p.stem().string() << "_" << computeSize << "_" << 
local_data.size(); + fmt::print("finished loading\n"); + } + } + std::string resname; +mp::distributed_sparse_matrix< + double, long, dr::mp::MpiBackend, + dr::mp::csr_eq_distribution> + m_eq(local_data, root); +mp::distributed_sparse_matrix< + double, long, dr::mp::MpiBackend, + dr::mp::csr_row_distribution> + m_row(local_data, root); + fmt::print("finished distribution\n"); + std::vector eq_duration; + std::vector row_duration; + + auto N = 10; + std::vector b; + b.reserve(m_row.shape().second); + std::vector res(m_row.shape().first); + for (auto i = 0; i < m_row.shape().second; i++) { + b.push_back(i); + } + + dr::mp::broadcasted_vector allocated_b; + allocated_b.broadcast_data(m_row.shape().second, 0, b, dr::mp::default_comm()); + + fmt::print("started initial gemv distribution\n"); + gemv(0, res, m_eq, allocated_b); // it is here to prepare sycl for work + + fmt::print("finished initial gemv distribution\n"); + for (auto i = 0; i < N; i++) { + auto begin = std::chrono::high_resolution_clock::now(); + gemv(0, res, m_eq, allocated_b); + auto end = std::chrono::high_resolution_clock::now(); + double duration = std::chrono::duration(end - begin).count() * 1000; + eq_duration.push_back(duration); + } + + gemv(0, res, m_row, allocated_b); // it is here to prepare sycl for work + for (auto i = 0; i < N; i++) { + auto begin = std::chrono::high_resolution_clock::now(); + gemv(0, res, m_row, allocated_b); + auto end = std::chrono::high_resolution_clock::now(); + double duration = std::chrono::duration(end - begin).count() * 1000; + row_duration.push_back(duration); + } + + if (root == dr::mp::default_comm().rank()) { + std::string tmp; + filenamestream >> tmp; + std::filesystem::path p(argv[1]); + p += tmp; + p += ".csv"; + std::ofstream write_stream(p.string()); + write_stream << eq_duration.front(); + for (auto i = 1; i < N; i++) { + write_stream << "," << eq_duration[i]; + } + write_stream << "\n"; + write_stream << row_duration.front(); + for (auto i = 1; i < N; 
i++) { + write_stream << "," << row_duration[i]; + } + write_stream << "\n"; + } + allocated_b.destroy_data(); + mp::finalize(); +} \ No newline at end of file diff --git a/examples/mp/sparse_matrix.cpp b/examples/mp/sparse_matrix.cpp index cb42afc823..e3163212d4 100644 --- a/examples/mp/sparse_matrix.cpp +++ b/examples/mp/sparse_matrix.cpp @@ -3,15 +3,14 @@ // SPDX-License-Identifier: BSD-3-Clause #include -#include #include namespace mp = dr::mp; int main(int argc, char **argv) { - + if (argc != 2) { - fmt::print("usage: ./gemv_benchmark [matrix market file]\n"); + fmt::print("usage: ./sparse_matrix [matrix market file]\n"); return 1; } @@ -24,14 +23,18 @@ int main(int argc, char **argv) { dr::views::csr_matrix_view local_data; auto root = 0; - if (root == dr::mp::default_comm().rank()) { - local_data = dr::read_csr(fname); - } + // if (root == dr::mp::default_comm().rank()) { + local_data = dr::read_csr(fname); + // } { - fmt::print("started\n"); - mp::distributed_sparse_matrix> m(local_data, root); - fmt::print("hihihih\n"); - mp::distributed_sparse_matrix> m_row(local_data, root); + mp::distributed_sparse_matrix< + double, long, dr::mp::MpiBackend, + dr::mp::csr_eq_distribution> + m(local_data, root); + mp::distributed_sparse_matrix< + double, long, dr::mp::MpiBackend, + dr::mp::csr_row_distribution> + m_row(local_data, root); fmt::print("{}\n", m.size()); // for (int i = 0; i < dr::mp::default_comm().size(); i++) { // if (dr::mp::default_comm().rank() == i) { @@ -40,12 +43,13 @@ int main(int argc, char **argv) { // // fmt::print("{}\n", i); // for (auto [index, val]: m) { // auto [m, n] = index; - + // auto [index_csr, val_csr] = *csr_iter; // auto [m_csr, n_csr] = index_csr; // auto check = m == m_csr && n_csr == n && val == val_csr; // if (!check) { - // fmt::print("{} {} {} {} {} {} {}\n", j, m, m_csr, n, n_csr, val, val_csr); + // fmt::print("{} {} {} {} {} {} {}\n", j, m, m_csr, n, n_csr, val, + // val_csr); // } // // assert(check); // csr_iter++; @@ 
-61,57 +65,67 @@ int main(int argc, char **argv) { for (int i = 0; i < a.size(); i++) { a[i] = i; } + + + dr::mp::broadcasted_vector allocated_a; + allocated_a.broadcast_data(m_row.shape().second, 0, a, dr::mp::default_comm()); m.fence(); double total_time = 0; - auto N = 10; - gemv(0, res, m, a); // it is here to prepare sycl for work + auto N = 1; + gemv(0, res, m, allocated_a); // it is here to prepare sycl for work for (int i = 0; i < N; i++) { auto begin = std::chrono::high_resolution_clock::now(); - gemv(0, res, m, a); + gemv(0, res, m, allocated_a); auto end = std::chrono::high_resolution_clock::now(); double duration = std::chrono::duration(end - begin).count(); total_time += duration; if (i % 10 == 0 && dr::mp::default_comm().rank() == 0) { - fmt::print("eq canary {}\n", duration); + fmt::print("eq canary {}\n", duration * 1000); } } - fmt::print("eq gemv time {}\n", total_time * 1000 / N); + if (root == dr::mp::default_comm().rank()) { + fmt::print("eq gemv time total {}\n", total_time * 1000 / N); + } m.fence(); total_time = 0; - gemv(0, res_row, m_row, a); + gemv(0, res_row, m_row, allocated_a); for (int i = 0; i < N; i++) { auto begin = std::chrono::high_resolution_clock::now(); - gemv(0, res_row, m_row, a); + gemv(0, res_row, m_row, allocated_a); auto end = std::chrono::high_resolution_clock::now(); double duration = std::chrono::duration(end - begin).count(); total_time += duration; if (i % 10 == 0 && dr::mp::default_comm().rank() == 0) { - fmt::print("row canary {}\n", duration); + fmt::print("row canary {}\n", duration * 1000); } } - fmt::print("row gemv time {}\n", total_time * 1000 / N); + + if (root == dr::mp::default_comm().rank()) { + fmt::print("row gemv time total {}\n", total_time * 1000 / N); + } m_row.fence(); std::vector ref(m.shape().first); if (dr::mp::default_comm().rank() == 0) { - for (auto a : local_data) { - auto [index, val] = a; - auto [m, n] = index; - ref[m] += n * val; - } - for (int i = 0; i < m.shape().first; i++) { - if 
(res[i] != ref[i]) { - fmt::print("mismatching outcome {} {}\n", res[i], ref[i]); - } - } - for (int i = 0; i < m.shape().first; i++) { - if (res_row[i] != ref[i]) { - fmt::print("mismatching outcome row {} {}\n", res_row[i], ref[i]); - } - } + for (auto a : local_data) { + auto [index, val] = a; + auto [m, n] = index; + ref[m] += n * val; + } + for (int i = 0; i < m.shape().first; i++) { + if (res[i] != ref[i]) { + fmt::print("mismatching outcome {} {}\n", res[i], ref[i]); + } + } + for (int i = 0; i < m.shape().first; i++) { + if (res_row[i] != ref[i]) { + fmt::print("mismatching outcome row {} {}\n", res_row[i], ref[i]); + } + } } + allocated_a.destroy_data(); } - + if (root == dr::mp::default_comm().rank()) { dr::__detail::destroy_csr_matrix_view(local_data, std::allocator{}); } diff --git a/include/dr/detail/generate_random_csr.hpp b/include/dr/detail/generate_random_csr.hpp index fc84dd1263..e99afdaf7a 100644 --- a/include/dr/detail/generate_random_csr.hpp +++ b/include/dr/detail/generate_random_csr.hpp @@ -6,8 +6,9 @@ #include #include -#include +#include #include +#include namespace dr { @@ -24,17 +25,26 @@ template struct uniform_distribution { template using uniform_distribution_t = typename uniform_distribution::type; +struct pair_hash { + template + inline std::size_t operator()(const std::pair & v) const { + return v.first*31+v.second; + } +}; + } // namespace + template auto generate_random_csr(dr::index shape, double density = 0.01, unsigned int seed = 0) { assert(density >= 0.0 && density < 1.0); - std::map, T> tuples; - + std::unordered_set, pair_hash> tuples{}; + std::vector, T>> entries; std::size_t nnz = density * shape[0] * shape[1]; + entries.reserve(nnz); std::mt19937 gen(seed); std::uniform_int_distribution row(0, shape[0] - 1); @@ -47,10 +57,10 @@ auto generate_random_csr(dr::index shape, double density = 0.01, auto j = column(gen); if (tuples.find({i, j}) == tuples.end()) { T value = value_gen(gen); - tuples.insert({{i, j}, value}); + 
tuples.insert({i, j}); + entries.push_back({{i, j}, value}); } } - T *values = new T[nnz]; I *rowptr = new I[shape[0] + 1]; I *colind = new I[nnz]; @@ -59,7 +69,8 @@ auto generate_random_csr(dr::index shape, double density = 0.01, std::size_t r = 0; std::size_t c = 0; - for (auto iter = tuples.begin(); iter != tuples.end(); ++iter) { + std::sort(entries.begin(), entries.end()); + for (auto iter = entries.begin(); iter != entries.end(); ++iter) { auto &&[index, value] = *iter; auto &&[i, j] = index; @@ -89,4 +100,49 @@ auto generate_random_csr(dr::index shape, double density = 0.01, return dr::views::csr_matrix_view(values, rowptr, colind, shape, nnz, 0); } +template +auto generate_band_csr(I size, std::size_t up_band = 3, + std::size_t down_band = 3) { + std::size_t nnz = (1 + up_band + down_band) * size - (up_band * (up_band - 1) / 2) - (down_band * (down_band - 1) / 2); + + T *values = new T[nnz]; + I *rowptr = new I[size + 1]; + I *colind = new I[nnz]; + + rowptr[0] = 0; + + std::size_t r = 0; + std::size_t c = 0; + for (auto i = 0; i < size; i++) { + for (auto j = i - down_band; j < i ; j++) { + if (j < 0) { + continue; + } + values[c] = 1; + colind[c] = j; + c++; + } + values[c] = 1; + colind[c] = i; + c++; + for (auto j = i + 1; j <= i + up_band ; j++) { + if (j >= size) { + continue; + } + values[c] = 1; + colind[c] = j; + c++; + } + rowptr[r + 1] = c + 1; + r++; + + } + + for (; r < size; r++) { + rowptr[r + 1] = nnz; + } + + return dr::views::csr_matrix_view(values, rowptr, colind, {size, size}, nnz, 0); +} + } // namespace dr diff --git a/include/dr/detail/matrix_io.hpp b/include/dr/detail/matrix_io.hpp index eedb0d93b9..8dedfc44b2 100644 --- a/include/dr/detail/matrix_io.hpp +++ b/include/dr/detail/matrix_io.hpp @@ -6,7 +6,6 @@ #include #include -#include #include #include #include @@ -25,7 +24,55 @@ namespace __detail { // 2) `tuples` has shape `shape` // 3) `tuples` has `nnz` elements template -auto convert_to_csr(Tuples &&csr_matrix, dr::index<> 
shape, std::size_t nnz, +auto convert_to_csr(Tuples &&tuples, dr::index<> shape, std::size_t nnz, + Allocator &&allocator) { + auto &&[index, v] = *tuples.begin(); + auto &&[i, j] = index; + + using T = std::remove_reference_t; + using I = std::remove_reference_t; + + typename std::allocator_traits::template rebind_alloc + i_allocator(allocator); + + T *values = allocator.allocate(nnz); + I *rowptr = i_allocator.allocate(shape[0] + 1); + I *colind = i_allocator.allocate(nnz); + + rowptr[0] = 0; + + std::size_t r = 0; + std::size_t c = 0; + for (auto iter = tuples.begin(); iter != tuples.end(); ++iter) { + auto &&[index, value] = *iter; + + auto &&[i, j] = index; + + values[c] = value; + colind[c] = j; + + while (r < i) { + assert(r + 1 <= shape[0]); + // throw std::runtime_error("csr_matrix_impl_: given invalid matrix"); + rowptr[r + 1] = c; + r++; + } + c++; + + assert(c <= nnz); + // throw std::runtime_error("csr_matrix_impl_: given invalid matrix"); + } + + for (; r < shape[0]; r++) { + rowptr[r + 1] = nnz; + } + + return dr::views::csr_matrix_view(values, rowptr, colind, + dr::index(shape[0], shape[1]), nnz, 0); +} + +template +auto convert_local_csr_to_csr(Tuples &&csr_matrix, dr::index<> shape, std::size_t nnz, Allocator &&allocator) { auto &&[v, j] = *csr_matrix.begin()->begin(); @@ -74,7 +121,6 @@ inline local_csr_matrix read_coo_matrix(std::string file_path, bool one_indexed = true) { using size_type = std::size_t; - auto begin = std::chrono::high_resolution_clock::now(); std::ifstream f; f.open(file_path.c_str()); @@ -180,15 +226,7 @@ inline local_csr_matrix read_coo_matrix(std::string file_path, } } - auto end = std::chrono::high_resolution_clock::now(); - double duration = std::chrono::duration(end - begin).count(); - fmt::print("No sort read time {}\n", duration * 1000); - - begin = std::chrono::high_resolution_clock::now(); matrix.sort(); - end = std::chrono::high_resolution_clock::now(); - duration = std::chrono::duration(end - begin).count(); - 
fmt::print("Sort time {}\n", duration * 1000); f.close(); return matrix; @@ -208,20 +246,10 @@ void destroy_csr_matrix_view(dr::views::csr_matrix_view view, template auto read_csr(std::string file_path, bool one_indexed = true) { - auto begin = std::chrono::high_resolution_clock::now(); auto m = __detail::read_coo_matrix(file_path, one_indexed); - auto end = std::chrono::high_resolution_clock::now(); - double duration = std::chrono::duration(end - begin).count(); - fmt::print("Read time {}\n", duration * 1000); - auto shape = m.shape(); auto nnz = m.size(); - - begin = std::chrono::high_resolution_clock::now(); - auto t = __detail::convert_to_csr(m, shape, nnz, std::allocator{}); - end = std::chrono::high_resolution_clock::now(); - duration = std::chrono::duration(end - begin).count(); - fmt::print("Conversion time {}\n", duration * 1000); + auto t = __detail::convert_local_csr_to_csr(m, shape, nnz, std::allocator{}); return t; } diff --git a/include/dr/mp.hpp b/include/dr/mp.hpp index 477b6d3984..490fa182d7 100644 --- a/include/dr/mp.hpp +++ b/include/dr/mp.hpp @@ -83,4 +83,5 @@ #include #include #include +#include #include diff --git a/include/dr/mp/algorithms/matrix/gemv.hpp b/include/dr/mp/algorithms/matrix/gemv.hpp index eb152adac7..7c9befcc58 100644 --- a/include/dr/mp/algorithms/matrix/gemv.hpp +++ b/include/dr/mp/algorithms/matrix/gemv.hpp @@ -8,28 +8,31 @@ #include #include #include +#include + namespace dr::mp { -template C, rng::input_range B, +template C, typename Alloc, typename Backend, typename MatDistr> requires(vector_multiplicable) void gemv(int root, C &res, - distributed_sparse_matrix &a, B &b) { + distributed_sparse_matrix &a, broadcasted_vector b) { if (default_comm().rank() == root) { assert(a.shape().first == res.size()); - assert(a.shape().second == b.size()); } // copy b to all machines - auto communicator = default_comm(); - __detail::allocator alloc; - auto broadcasted_b = alloc.allocate(a.shape().second); - if (communicator.rank() == 
root) { - rng::copy(b.begin(), b.end(), broadcasted_b); - } - communicator.bcast(broadcasted_b, a.shape().second * sizeof(T), root); - a.local_gemv_and_collect(root, res, broadcasted_b); - alloc.deallocate(broadcasted_b, a.shape().second); + // auto communicator = default_comm(); + // __detail::allocator alloc; + // auto broadcasted_b = alloc.allocate(a.shape().second); + // if (communicator.rank() == root) { + // rng::copy(b.begin(), b.end(), broadcasted_b); + // } + + // communicator.bcast(broadcasted_b, a.shape().second * sizeof(T), root); + a.local_gemv_and_collect(root, res, b.broadcasted_data()); + + // alloc.deallocate(broadcasted_b, a.shape().second); // a.fence(); // if (default_comm().rank() == root) { // for (int i = 0; i < a.shape().first; i++) { diff --git a/include/dr/mp/containers/broadcasted_vector.hpp b/include/dr/mp/containers/broadcasted_vector.hpp new file mode 100644 index 0000000000..cbcc061cbb --- /dev/null +++ b/include/dr/mp/containers/broadcasted_vector.hpp @@ -0,0 +1,42 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +namespace dr::mp { + + +template > +class broadcasted_vector { + public: + broadcasted_vector() = default; + + template + void broadcast_data(std::size_t data_size, std::size_t root, R root_data, dr::communicator comm) { + if (_data != nullptr) { + destroy_data(); + } + _data_size = data_size; + _data = alloc.allocate(_data_size); + if (comm.rank() == root) { + rng::copy(root_data.begin(), root_data.end(), _data); + } + comm.bcast(_data, sizeof(T) * _data_size, root); + } + + void destroy_data() { + alloc.deallocate(_data, _data_size); + _data_size = 0; + _data = nullptr; + } + + T* broadcasted_data() { + return _data; + } + private: + T* _data = nullptr; + std::size_t _data_size = 0; + Allocator alloc; +}; +} \ No newline at end of file diff --git a/include/dr/mp/containers/distributed_sparse_matrix.hpp b/include/dr/mp/containers/distributed_sparse_matrix.hpp 
index 9e2b22aefc..89b8ceb367 100644 --- a/include/dr/mp/containers/distributed_sparse_matrix.hpp +++ b/include/dr/mp/containers/distributed_sparse_matrix.hpp @@ -22,7 +22,7 @@ concept matrix_distibution = requires(T t, std::vector res, int *input) { }; template -concept vector_multiplicable = requires(T t, std::vector res, int *input) { +concept vector_multiplicable = requires(T t, std::vector res, T::elem_type *input) { t.local_gemv_and_collect(int(), res, input); }; @@ -150,9 +150,9 @@ class distributed_sparse_matrix { void fence() { distribution_.fence(); } - template + template requires(vector_multiplicable) - auto local_gemv_and_collect(std::size_t root, C &res, A &vals) const { + auto local_gemv_and_collect(std::size_t root, C &res, T* vals) const { distribution_.local_gemv_and_collect(root, res, vals); } diff --git a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp index 7968ff30f5..69a2f789ed 100644 --- a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp @@ -48,7 +48,7 @@ class csr_eq_distribution { auto shape() const { return shape_; } void fence() { rows_backend_.fence(); } - template auto local_gemv(C &res, A &vals) const { + template auto local_gemv(C &res, T* vals) const { auto rank = rows_backend_.getrank(); if (nnz_ <= segment_size_ * rank) { return; @@ -140,20 +140,17 @@ class csr_eq_distribution { } } - template - auto local_gemv_and_collect(std::size_t root, C &res, A &vals) const { + template + auto local_gemv_and_collect(std::size_t root, C &res, T* vals) const { assert(res.size() == shape_.first); __detail::allocator alloc; auto res_alloc = alloc.allocate(max_row_size_); for (auto i = 0; i < max_row_size_; i++) { res_alloc[i] = 0; } - // auto begin = std::chrono::high_resolution_clock::now(); + local_gemv(res_alloc, vals); - // auto end = std::chrono::high_resolution_clock::now(); - 
// double duration = std::chrono::duration(end - begin).count(); - // fmt::print("eq gemv time {}\n", duration * 1000); - + gather_gemv_vector(root, res, res_alloc); alloc.deallocate(res_alloc, max_row_size_); } diff --git a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp index 209ceafe19..aa45f24e91 100644 --- a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp @@ -58,7 +58,7 @@ class csr_row_distribution { vals_backend_.fence(); cols_backend_.fence(); } - template auto local_gemv(C &res, A &vals) const { + template auto local_gemv(C &res, T* vals) const { auto rank = cols_backend_.getrank(); if (shape_[0] <= segment_size_ * rank) return; @@ -109,8 +109,8 @@ class csr_row_distribution { } } - template - auto local_gemv_and_collect(std::size_t root, C &res, A &vals) const { + template + auto local_gemv_and_collect(std::size_t root, C &res, T* &vals) const { assert(res.size() == shape_.first); __detail::allocator alloc; auto res_alloc = alloc.allocate(segment_size_); @@ -122,7 +122,8 @@ class csr_row_distribution { local_gemv(res_alloc, vals); // auto end = std::chrono::high_resolution_clock::now(); // double duration = std::chrono::duration(end - begin).count(); - // fmt::print("rows gemv time {}\n", duration * 1000); + // auto size = std::min(segment_size_, shape_[0] - segment_size_ * default_comm().rank()); + // fmt::print("rows gemv time {} {} {}\n", duration * 1000, size, default_comm().rank()); gather_gemv_vector(root, res, res_alloc); alloc.deallocate(res_alloc, segment_size_); From 18165daae73a2012b0501c9c34c5cc850939a35e Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Tue, 17 Sep 2024 13:12:48 +0200 Subject: [PATCH 15/68] Add benchmarking tools --- examples/sp/CMakeLists.txt | 2 +- examples/sp/gemv_benchmark.cpp | 101 ------------------------ format_res.py | 140 
+++++++++++++++++++++++++++++++++ run_benchmarks.sh | 32 ++++++++ 4 files changed, 173 insertions(+), 102 deletions(-) create mode 100644 format_res.py create mode 100755 run_benchmarks.sh diff --git a/examples/sp/CMakeLists.txt b/examples/sp/CMakeLists.txt index c078cbf5e8..ce82e75626 100644 --- a/examples/sp/CMakeLists.txt +++ b/examples/sp/CMakeLists.txt @@ -18,7 +18,7 @@ add_sp_example(sort) add_sp_example_no_test(sort_benchmark) add_sp_example(inclusive_scan_example) add_sp_example(exclusive_scan_example) -# unsatisfied dependency of grb/grb.hpp add_sp_example(gemv_benchmark) +add_sp_example(gemv_benchmark) add_sp_example_no_test(dot_product_benchmark) add_sp_example_no_test(inclusive_scan_benchmark) add_sp_example_no_test(exclusive_scan_benchmark) diff --git a/examples/sp/gemv_benchmark.cpp b/examples/sp/gemv_benchmark.cpp index c29182dd60..850f2f87c0 100644 --- a/examples/sp/gemv_benchmark.cpp +++ b/examples/sp/gemv_benchmark.cpp @@ -7,25 +7,11 @@ #include // FIXME: what is grb.hpp? 
add it to cmake or remove this dependency -#include #include namespace sp = dr::sp; -template auto local_gemv(M &&a) { - using T = grb::matrix_scalar_t; - std::vector b(a.shape()[1], 1); - std::vector c(a.shape()[0], 0); - - for (auto &&[index, v] : a) { - auto &&[i, k] = index; - c[i] += v * b[k]; - } - - return c; -} - template bool is_equal(T &&x, U &&y) { return x == y; } template @@ -78,8 +64,6 @@ int main(int argc, char **argv) { fmt::print("Square {} x {}\n", a_square.grid_shape()[0], a_square.grid_shape()[1]); - auto c_local = local_gemv(grb::matrix(fname)); - std::size_t m = a.shape()[0]; std::size_t k = a.shape()[1]; @@ -105,13 +89,6 @@ int main(int argc, char **argv) { fmt::print("Copying...\n"); std::vector l(c.size()); dr::sp::copy(c.begin(), c.end(), l.begin()); - fmt::print("Verifying...\n"); - for (std::size_t i = 0; i < l.size(); i++) { - if (!is_equal(l[i], c_local[i])) { - fmt::print("{} != {}\n", l[i], c_local[i]); - } - } - assert(is_equal(c_local, l)); fmt::print("Benchmarking...\n"); for (std::size_t i = 0; i < n_iterations; i++) { @@ -147,13 +124,6 @@ int main(int argc, char **argv) { sp::gemv_square(c, a_square, b); std::vector l(c.size()); sp::copy(c.begin(), c.end(), l.begin()); - for (std::size_t i = 0; i < l.size(); i++) { - if (!is_equal(l[i], c_local[i])) { - // fmt::print("{} != {}\n", l[i], c_local[i]); - } - } - assert(is_equal(c_local, l)); - for (std::size_t i = 0; i < n_iterations; i++) { auto begin = std::chrono::high_resolution_clock::now(); sp::gemv_square(c, a_square, b); @@ -189,12 +159,6 @@ int main(int argc, char **argv) { sp::gemv_square_copy(c, a_square, b); std::vector l(c.size()); sp::copy(c.begin(), c.end(), l.begin()); - for (std::size_t i = 0; i < l.size(); i++) { - if (!is_equal(l[i], c_local[i])) { - fmt::print("{} != {}\n", l[i], c_local[i]); - } - } - assert(is_equal(c_local, l)); for (std::size_t i = 0; i < n_iterations; i++) { auto begin = std::chrono::high_resolution_clock::now(); @@ -225,71 +189,6 @@ int 
main(int argc, char **argv) { durations.clear(); } - { - auto m = sp::__detail::mmread(fname); - auto shape = m.shape(); - auto nnz = m.size(); - - auto local_mat = - sp::__detail::convert_to_csr(m, shape, nnz, std::allocator{}); - - sycl::queue q(sp::context(), sp::devices()[0]); - - T *values = sycl::malloc_device(nnz, q); - I *colind = sycl::malloc_device(nnz, q); - I *rowptr = sycl::malloc_device(local_mat.shape()[0] + 1, q); - - q.memcpy(values, local_mat.values_data(), sizeof(T) * nnz).wait(); - q.memcpy(colind, local_mat.colind_data(), sizeof(T) * nnz).wait(); - q.memcpy(rowptr, local_mat.rowptr_data(), - sizeof(T) * (local_mat.shape()[0] + 1)) - .wait(); - - sp::device_allocator allocator(q); - - sp::vector> x(local_mat.shape()[1], 1, - allocator); - sp::vector> y(local_mat.shape()[1], 0, - allocator); - - dr::__detail::destroy_csr_matrix_view(local_mat, std::allocator{}); - - sp::csr_matrix_view a_view(values, rowptr, colind, shape, nnz, 0); - - auto e = sp::__detail::local_gemv(q, a_view, x.data().get_raw_pointer(), - y.data().get_raw_pointer()); - e.wait(); - - for (std::size_t i = 0; i < n_iterations; i++) { - auto begin = std::chrono::high_resolution_clock::now(); - auto e = sp::__detail::local_gemv(q, a_view, x.data().get_raw_pointer(), - y.data().get_raw_pointer()); - e.wait(); - auto end = std::chrono::high_resolution_clock::now(); - double duration = std::chrono::duration(end - begin).count(); - durations.push_back(duration); - } - - fmt::print("Durations: {}\n", - durations | - rng::views::transform([](auto &&x) { return x * 1000; })); - - std::sort(durations.begin(), durations.end()); - - double median_duration = durations[durations.size() / 2]; - - std::cout << "Single GPU: " << median_duration * 1000 << " ms" << std::endl; - - std::size_t n_bytes = sizeof(T) * a.size() + - sizeof(I) * (a.size() + a.shape()[0] + 1) // size of A - + sizeof(T) * b.size() // size of B - + sizeof(T) * c.size(); // size of C - double n_gbytes = n_bytes * 1e-9; - 
fmt::print("{} GB/s\n", n_gbytes / median_duration); - - durations.clear(); - } - fmt::print("Finalize...\n"); sp::finalize(); diff --git a/format_res.py b/format_res.py new file mode 100644 index 0000000000..4f7deed096 --- /dev/null +++ b/format_res.py @@ -0,0 +1,140 @@ +import os +import re +import numpy as np +from matplotlib import pyplot as plt +from functools import cmp_to_key + +rootdir = "./dest/" +res_dir = "./res/" +rand_regex = re.compile('mp_band_.+_.+_.+_.+\\.csv') +file_regex = re.compile('mp_.+_.+_.+\\.csv') + +strong_data = {} +weak_data = {} +for root, dirs, files in os.walk(rootdir): + for file in files: + entry_count = -1 + mpi_size = -1 + name = "" + if rand_regex.match(file): + tmp = file[0:-4] + res = tmp.split("_") + entry_count = res[-3] + mpi_size = res[-4] + name = "band" + elif (file_regex.match(file)): + tmp = file[0:-4] + res = tmp.split("_") + entry_count = res[-1] + mpi_size = res[-2] + name = res[-3] + if entry_count != -1: + with open(rootdir + file) as handle: + eq_res = handle.readline().split(",") + row_res = handle.readline().split(",") + eq_arr = np.array(eq_res).astype(np.float64) + row_arr = np.array(row_res).astype(np.float64) + ratio = round(float(entry_count) / float(mpi_size)) + if (ratio not in weak_data): + weak_data[ratio] = [] + if ((name, entry_count) not in strong_data): + strong_data[(name, entry_count)] = [] + strong_data[(name, entry_count)].append((mpi_size, eq_arr, row_arr)) + weak_data[ratio].append((mpi_size, entry_count, eq_arr, row_arr)) + +for entry in strong_data.items(): + if (len(entry[1]) == 1): + continue + + sorted_list = entry[1] + sorted_list = sorted(sorted_list, key=cmp_to_key(lambda x, y: int(x[0]) - int(y[0]))) + + base_eq = np.mean(sorted_list[0][1]) / int(sorted_list[0][0]) + base_row = np.mean(sorted_list[0][2]) / int(sorted_list[0][0]) + + index = [] + means_eq = [] + variance_eq = [] + means_row = [] + variance_row = [] + for info in sorted_list: + index.append(int(info[0])) + speedup_eq = 
base_eq / info[1] + speedup_row = base_row / info[2] + means_eq.append(np.mean(speedup_eq)) + variance_eq.append(np.var(speedup_eq)) + means_row.append(np.mean(speedup_row)) + variance_row.append(np.var(speedup_row)) + index = np.array(index) + means_eq = np.array(means_eq) + variance_eq = np.array(variance_eq) + means_row = np.array(means_row) + variance_row = np.array(variance_row) + fig, ax = plt.subplots() + + ax.fill_between(index, means_eq - variance_eq, means_eq + variance_eq, alpha=.5, linewidth=0) + ax.plot(index, means_eq, linewidth=2) + ax.set(xlim=(0, 20), xticks=np.arange(1, 20, 2), + ylim=(0, 20), yticks=np.arange(20)) + ax.plot(np.arange(20), np.arange(20)) + plt.savefig("res/" + entry[0][0] + "_" + entry[0][1] + "_eq_strong") + + fig, ax = plt.subplots() + + ax.fill_between(index, means_row - variance_row, means_row + variance_row, alpha=.5, linewidth=0) + ax.plot(index, means_row, linewidth=2) + ax.set(xlim=(0, 20), xticks=np.arange(1, 20, 2), + ylim=(0, 20), yticks=np.arange(20)) + ax.plot(np.arange(20), np.arange(20)) + + plt.savefig("res/" + entry[0][0] + "_" + entry[0][1] + "_row_strong") + + +for entry in weak_data.items(): + if (len(entry[1]) == 1): + continue + start = next(filter(lambda x: x[0] == '1', entry[1]), None) + if (start == None): + continue + base_eq = np.mean(start[2]) + base_row = np.mean(start[3]) + + sorted_list = entry[1] + sorted_list = sorted(sorted_list, key=cmp_to_key(lambda x, y: int(x[0]) - int(y[0]))) + + index = [] + means_eq = [] + variance_eq = [] + means_row = [] + variance_row = [] + for info in sorted_list: + index.append(int(info[0])) + speedup_eq = base_eq / info[2] + speedup_row = base_row / info[3] + means_eq.append(np.mean(speedup_eq)) + variance_eq.append(np.var(speedup_eq)) + means_row.append(np.mean(speedup_row)) + variance_row.append(np.var(speedup_row)) + index = np.array(index) + means_eq = np.array(means_eq) + variance_eq = np.array(variance_eq) + means_row = np.array(means_row) + variance_row = 
np.array(variance_row) + fig, ax = plt.subplots() + + ax.fill_between(index, means_eq - variance_eq, means_eq + variance_eq, alpha=.5, linewidth=0) + ax.plot(index, means_eq, linewidth=2) + ax.set(xlim=(0, 20), xticks=np.arange(1, 20, 2), + ylim=(0, 1.1), yticks=np.arange(0, 1, 0.2)) + ax.plot(np.arange(20), np.zeros(20) + 1) + plt.savefig("res/" + str(entry[0]) + "_ratio_eq_weak") + + fig, ax = plt.subplots() + + ax.fill_between(index, means_row - variance_row, means_row + variance_row, alpha=.5, linewidth=0) + ax.plot(index, means_row, linewidth=2) + ax.set(xlim=(0, 20), xticks=np.arange(1, 20, 2), + ylim=(0, 1.1), yticks=np.arange(0, 1, 0.2)) + ax.plot(np.arange(20), np.zeros(20) + 1) + plt.savefig("res/" + str(entry[0]) + "_ratio_row_weak") + diff --git a/run_benchmarks.sh b/run_benchmarks.sh new file mode 100755 index 0000000000..d6673a5b76 --- /dev/null +++ b/run_benchmarks.sh @@ -0,0 +1,32 @@ +#!/bin/sh +entry=$1 +# for i in {0..9}; do +# echo "processing $i random" +# mpirun -n $((1 + 2 * $i)) ./build/examples/mp/sparse_benchmark ./dest/ 10000 $((10000 * (1 + 2 * $i))) 0.01 +# done + +# for i in {1..8}; do +# echo "processing $i bench weak" +# mpirun -n $i ./build/examples/mp/sparse_benchmark ./dest/ $(($i * 50000)) 4000 0 +# done + +for i in {1..8}; do + echo "processing $i bench strong" + mpirun -n $i ./build/examples/mp/sparse_benchmark ./dest/ 100000 10000 0 +done + + +# for i in {0..9}; do +# echo "processing $i bench weak" +# mpirun -n $((1 + 2 * $i)) ./build/examples/mp/sparse_benchmark ./dest/ $(((1 + 2 * $i) * 100000)) 2000 0 +# done + +# for i in {0..9}; do +# echo "processing $i bench strong" +# mpirun -n $((1 + 2 * $i)) ./build/examples/mp/sparse_benchmark ./dest/ 100000 10000 0 +# done + +# for i in {0..9}; do +# echo "processing $i $entry" +# mpirun -n $((1 + 2 * $i)) ./build/examples/mp/sparse_benchmark ./dest/ $entry +# done \ No newline at end of file From 94f818e9c5a65bb04e051cb6e99fa532b791adc7 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar 
Date: Wed, 18 Sep 2024 11:26:11 +0200 Subject: [PATCH 16/68] Add gemv benchmark to gbench --- benchmarks/gbench/mp/CMakeLists.txt | 3 +- benchmarks/gbench/mp/gemv.cpp | 200 ++++++++++++++++++++++++++++ 2 files changed, 202 insertions(+), 1 deletion(-) create mode 100644 benchmarks/gbench/mp/gemv.cpp diff --git a/benchmarks/gbench/mp/CMakeLists.txt b/benchmarks/gbench/mp/CMakeLists.txt index c3ae2f682c..adbf32e34e 100644 --- a/benchmarks/gbench/mp/CMakeLists.txt +++ b/benchmarks/gbench/mp/CMakeLists.txt @@ -15,6 +15,7 @@ add_executable( ../common/stream.cpp streammp.cpp rooted.cpp + gemv.cpp stencil_1d.cpp stencil_2d.cpp chunk.cpp @@ -41,7 +42,7 @@ endif() # mp-quick-bench is for development. By reducing the number of source files, it # builds much faster. Change the source files to match what you need to test. It # is OK to commit changes to the source file list. -add_executable(mp-quick-bench mp-bench.cpp ../common/distributed_vector.cpp) +add_executable(mp-quick-bench mp-bench.cpp gemv.cpp) foreach(mp-bench-exec IN ITEMS mp-bench mp-quick-bench) target_compile_definitions(${mp-bench-exec} PRIVATE BENCH_MP) diff --git a/benchmarks/gbench/mp/gemv.cpp b/benchmarks/gbench/mp/gemv.cpp new file mode 100644 index 0000000000..e0c40efd8d --- /dev/null +++ b/benchmarks/gbench/mp/gemv.cpp @@ -0,0 +1,200 @@ + +#include "mpi.h" + +#include "dr/mp.hpp" +#include +#include +#include +#include +#include + +#ifdef STANDALONE_BENCHMARK + +MPI_Comm comm; +int comm_rank; +int comm_size; + +#else + +#include "../common/dr_bench.hpp" + +#endif + +namespace mp = dr::mp; + +#ifdef STANDALONE_BENCHMARK +int main(int argc, char **argv) { + + MPI_Init(&argc, &argv); + comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm, &comm_rank); + MPI_Comm_size(comm, &comm_size); + + if (argc != 3 && argc != 5) { + fmt::print("usage: ./sparse_benchmark [test outcome dir] [matrix market file], or ./sparse_benchmark [test outcome dir] [number of rows] [number of columns] [number of lower bands] [number of upper 
bands]\n"); + return 1; + } + +#ifdef SYCL_LANGUAGE_VERSION + sycl::queue q = dr::mp::select_queue(); + mp::init(q); +#else + mp::init(); +#endif + dr::views::csr_matrix_view local_data; + std::stringstream filenamestream; + auto root = 0; + auto computeSize = dr::mp::default_comm().size(); + if (root == dr::mp::default_comm().rank()) { + if (argc == 5) { + fmt::print("started loading\n"); + auto n = std::stoul(argv[2]); + auto up = std::stoul(argv[3]); + auto down = std::stoul(argv[4]); + // local_data = dr::generate_random_csr({n, m}, density, 42); + local_data = dr::generate_band_csr(n, up, down); + filenamestream << "mp_band_" << computeSize << "_" << n << "_" << up + down << "_" << local_data.size(); + fmt::print("finished loading\n"); + } + else { + fmt::print("started loading\n"); + std::string fname(argv[2]); + std::filesystem::path p(argv[2]); + local_data = dr::read_csr(fname); + filenamestream << "mp_" << p.stem().string() << "_" << computeSize << "_" << local_data.size(); + fmt::print("finished loading\n"); + } + } + std::string resname; +mp::distributed_sparse_matrix< + double, long, dr::mp::MpiBackend, + dr::mp::csr_eq_distribution> + m_eq(local_data, root); +mp::distributed_sparse_matrix< + double, long, dr::mp::MpiBackend, + dr::mp::csr_row_distribution> + m_row(local_data, root); + fmt::print("finished distribution\n"); + std::vector eq_duration; + std::vector row_duration; + + auto N = 10; + std::vector b; + b.reserve(m_row.shape().second); + std::vector res(m_row.shape().first); + for (auto i = 0; i < m_row.shape().second; i++) { + b.push_back(i); + } + + dr::mp::broadcasted_vector allocated_b; + allocated_b.broadcast_data(m_row.shape().second, 0, b, dr::mp::default_comm()); + + fmt::print("started initial gemv distribution\n"); + gemv(0, res, m_eq, allocated_b); // it is here to prepare sycl for work + + fmt::print("finished initial gemv distribution\n"); + for (auto i = 0; i < N; i++) { + auto begin = std::chrono::high_resolution_clock::now(); 
+ gemv(0, res, m_eq, allocated_b); + auto end = std::chrono::high_resolution_clock::now(); + double duration = std::chrono::duration(end - begin).count() * 1000; + eq_duration.push_back(duration); + } + + gemv(0, res, m_row, allocated_b); // it is here to prepare sycl for work + for (auto i = 0; i < N; i++) { + auto begin = std::chrono::high_resolution_clock::now(); + gemv(0, res, m_row, allocated_b); + auto end = std::chrono::high_resolution_clock::now(); + double duration = std::chrono::duration(end - begin).count() * 1000; + row_duration.push_back(duration); + } + + if (root == dr::mp::default_comm().rank()) { + std::string tmp; + filenamestream >> tmp; + std::filesystem::path p(argv[1]); + p += tmp; + p += ".csv"; + std::ofstream write_stream(p.string()); + write_stream << eq_duration.front(); + for (auto i = 1; i < N; i++) { + write_stream << "," << eq_duration[i]; + } + write_stream << "\n"; + write_stream << row_duration.front(); + for (auto i = 1; i < N; i++) { + write_stream << "," << row_duration[i]; + } + write_stream << "\n"; + } + allocated_b.destroy_data(); + mp::finalize(); +} + +#else + + +static void GEMV_EQ_DR(benchmark::State &state) { + // fft requires usm shared allocation + std::size_t n = default_vector_size; + std::size_t up = default_vector_size / 10; + std::size_t down = default_vector_size / 10; + assert(dr::mp::use_sycl()); + dr::views::csr_matrix_view local_data; + local_data = dr::generate_band_csr(n, up, down); + + +mp::distributed_sparse_matrix< + double, long, dr::mp::MpiBackend, + dr::mp::csr_eq_distribution> + m(local_data, 0); + std::vector b; + b.reserve(m.shape().second); + std::vector res(m.shape().first); + for (auto i = 0; i < m.shape().second; i++) { + b.push_back(i); + } + + dr::mp::broadcasted_vector allocated_b; + allocated_b.broadcast_data(m.shape().second, 0, b, dr::mp::default_comm()); + + for (auto _ : state) { + gemv(0, res, m, allocated_b); + } +} + +DR_BENCHMARK(GEMV_EQ_DR); + +static void 
GEMV_ROW_DR(benchmark::State &state) { + // fft requires usm shared allocation + std::size_t n = 100000; + std::size_t up = 10000; + std::size_t down = 10000; + assert(dr::mp::use_sycl()); + dr::views::csr_matrix_view local_data; + local_data = dr::generate_band_csr(n, up, down); + + +mp::distributed_sparse_matrix< + double, long, dr::mp::MpiBackend, + dr::mp::csr_row_distribution> + m(local_data, 0); + std::vector b; + b.reserve(m.shape().second); + std::vector res(m.shape().first); + for (auto i = 0; i < m.shape().second; i++) { + b.push_back(i); + } + + dr::mp::broadcasted_vector allocated_b; + allocated_b.broadcast_data(m.shape().second, 0, b, dr::mp::default_comm()); + + for (auto _ : state) { + gemv(0, res, m, allocated_b); + } +} + +DR_BENCHMARK(GEMV_ROW_DR); + +#endif \ No newline at end of file From 982a0e0d26a1c8c3c0584e85d341d1a26344d539 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Tue, 24 Sep 2024 01:28:40 -0700 Subject: [PATCH 17/68] Add reference gemv implementation --- benchmarks/gbench/mp/gemv.cpp | 71 +++++++++++++++++++++++++++++++---- 1 file changed, 64 insertions(+), 7 deletions(-) diff --git a/benchmarks/gbench/mp/gemv.cpp b/benchmarks/gbench/mp/gemv.cpp index e0c40efd8d..f7497398bf 100644 --- a/benchmarks/gbench/mp/gemv.cpp +++ b/benchmarks/gbench/mp/gemv.cpp @@ -135,7 +135,7 @@ mp::distributed_sparse_matrix< #else -static void GEMV_EQ_DR(benchmark::State &state) { +static void GemvEq_DR(benchmark::State &state) { // fft requires usm shared allocation std::size_t n = default_vector_size; std::size_t up = default_vector_size / 10; @@ -164,13 +164,13 @@ mp::distributed_sparse_matrix< } } -DR_BENCHMARK(GEMV_EQ_DR); +DR_BENCHMARK(GemvEq_DR); -static void GEMV_ROW_DR(benchmark::State &state) { +static void GemvRow_DR(benchmark::State &state) { // fft requires usm shared allocation - std::size_t n = 100000; - std::size_t up = 10000; - std::size_t down = 10000; + std::size_t n = default_vector_size; + std::size_t up = default_vector_size / 10; 
+ std::size_t down = default_vector_size / 10; assert(dr::mp::use_sycl()); dr::views::csr_matrix_view local_data; local_data = dr::generate_band_csr(n, up, down); @@ -195,6 +195,63 @@ mp::distributed_sparse_matrix< } } -DR_BENCHMARK(GEMV_ROW_DR); +DR_BENCHMARK(GemvRow_DR); + + + +static void Gemv_reference(benchmark::State &state) { + T actual{}; + std::size_t n = default_vector_size; + std::size_t up = default_vector_size / 10; + std::size_t down = default_vector_size / 10; + assert(dr::mp::use_sycl()); + dr::views::csr_matrix_view local_data; + local_data = dr::generate_band_csr(n, up, down); + auto nnz_count = local_data.size(); + auto band_shape = local_data.shape(); + auto q = get_queue(); + auto policy = oneapi::dpl::execution::make_device_policy(q); + auto val_ptr = sycl::malloc_device(nnz_count, q); + auto col_ptr = sycl::malloc_device(nnz_count, q); + auto row_ptr = sycl::malloc_device(band_shape[0], q); + std::vector b(band_shape[1]); + for (auto i = 0; i < band_shape[1]; i++) { + b.push_back(i); + } + auto input = sycl::malloc_device(band_shape[1], q); + auto output = sycl::malloc_device(band_shape[0], q); + + std::copy(policy, local_src.values_data(), local_src.values_data() + nnz_count, val_ptr); + std::copy(policy, local_src.colind_data(), local_src.colind_data() + nnz_count, col_ptr); + std::copy(policy, local_src.rowptr_data(), local_src.rowptr_data() + band_shape[0], row_ptr); + std::copy(policy, b.begin(), b.end(), input); + + for (auto _ : state) { + q.fill(output, 0, band_shape[0]).wait(); + q.submit([&](auto &cgh) { + cgh.parallel_for(sycl::range<1>{band_shape[0]}, [=](auto idx) { + double sum = 0; + for (auto i = rows_data[idx]; i < rows_data[idx + 1]; i++) { + auto colNum = local_cols[i]; + auto matrixVal = vals[colNum]; + auto vectorVal = local_vals[i]; + sum += matrixVal * vectorVal; + } + *(res + idx) += sum; + }); + }) + .wait(); + } + sycl::free(val_ptr, q); + sycl::free(col_ptr, q); + sycl::free(row_ptr, q); + sycl::free(input, q); + 
sycl::free(output, q); +} + + +DR_BENCHMARK(GemvEq_Reference); + +DR_BENCHMARK(GemvRow_Reference); #endif \ No newline at end of file From 47a84556971fefd6fa0b1a68f583bf2987a3c545 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Tue, 24 Sep 2024 10:46:53 +0200 Subject: [PATCH 18/68] Fixed gemv reference --- benchmarks/gbench/mp/gemv.cpp | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/benchmarks/gbench/mp/gemv.cpp b/benchmarks/gbench/mp/gemv.cpp index f7497398bf..6febcfa977 100644 --- a/benchmarks/gbench/mp/gemv.cpp +++ b/benchmarks/gbench/mp/gemv.cpp @@ -199,8 +199,7 @@ DR_BENCHMARK(GemvRow_DR); -static void Gemv_reference(benchmark::State &state) { - T actual{}; +static void Gemv_Reference(benchmark::State &state) { std::size_t n = default_vector_size; std::size_t up = default_vector_size / 10; std::size_t down = default_vector_size / 10; @@ -221,23 +220,23 @@ static void Gemv_reference(benchmark::State &state) { auto input = sycl::malloc_device(band_shape[1], q); auto output = sycl::malloc_device(band_shape[0], q); - std::copy(policy, local_src.values_data(), local_src.values_data() + nnz_count, val_ptr); - std::copy(policy, local_src.colind_data(), local_src.colind_data() + nnz_count, col_ptr); - std::copy(policy, local_src.rowptr_data(), local_src.rowptr_data() + band_shape[0], row_ptr); + std::copy(policy, local_data.values_data(), local_data.values_data() + nnz_count, val_ptr); + std::copy(policy, local_data.colind_data(), local_data.colind_data() + nnz_count, col_ptr); + std::copy(policy, local_data.rowptr_data(), local_data.rowptr_data() + band_shape[0], row_ptr); std::copy(policy, b.begin(), b.end(), input); for (auto _ : state) { q.fill(output, 0, band_shape[0]).wait(); q.submit([&](auto &cgh) { - cgh.parallel_for(sycl::range<1>{band_shape[0]}, [=](auto idx) { + cgh.parallel_for(sycl::range<1>{static_cast(band_shape[0])}, [=](auto idx) { double sum = 0; - for (auto i = rows_data[idx]; i < rows_data[idx + 
1]; i++) { - auto colNum = local_cols[i]; - auto matrixVal = vals[colNum]; - auto vectorVal = local_vals[i]; + for (auto i = row_ptr[idx]; i < row_ptr[idx + 1]; i++) { + auto colNum = col_ptr[i]; + auto matrixVal = input[colNum]; + auto vectorVal = val_ptr[i]; sum += matrixVal * vectorVal; } - *(res + idx) += sum; + *(output + idx) += sum; }); }) .wait(); @@ -249,6 +248,13 @@ static void Gemv_reference(benchmark::State &state) { sycl::free(output, q); } +static void GemvEq_Reference(benchmark::State &state) { + Gemv_Reference(state); +} + +static void GemvRow_Reference(benchmark::State &state) { + Gemv_Reference(state); +} DR_BENCHMARK(GemvEq_Reference); From a97a97b3ca0dc497ab9d40828db1f2c09d64fe00 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Wed, 25 Sep 2024 07:25:53 -0700 Subject: [PATCH 19/68] Fixed gemv benchmark implementation --- benchmarks/gbench/mp/gemv.cpp | 42 +++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/benchmarks/gbench/mp/gemv.cpp b/benchmarks/gbench/mp/gemv.cpp index 6febcfa977..99b9b6a034 100644 --- a/benchmarks/gbench/mp/gemv.cpp +++ b/benchmarks/gbench/mp/gemv.cpp @@ -138,8 +138,8 @@ mp::distributed_sparse_matrix< static void GemvEq_DR(benchmark::State &state) { // fft requires usm shared allocation std::size_t n = default_vector_size; - std::size_t up = default_vector_size / 10; - std::size_t down = default_vector_size / 10; + std::size_t up = n / 10; + std::size_t down = n / 10; assert(dr::mp::use_sycl()); dr::views::csr_matrix_view local_data; local_data = dr::generate_band_csr(n, up, down); @@ -169,8 +169,8 @@ DR_BENCHMARK(GemvEq_DR); static void GemvRow_DR(benchmark::State &state) { // fft requires usm shared allocation std::size_t n = default_vector_size; - std::size_t up = default_vector_size / 10; - std::size_t down = default_vector_size / 10; + std::size_t up = n / 10; + std::size_t down = n / 10; assert(dr::mp::use_sycl()); dr::views::csr_matrix_view local_data; local_data = 
dr::generate_band_csr(n, up, down); @@ -201,8 +201,8 @@ DR_BENCHMARK(GemvRow_DR); static void Gemv_Reference(benchmark::State &state) { std::size_t n = default_vector_size; - std::size_t up = default_vector_size / 10; - std::size_t down = default_vector_size / 10; + std::size_t up = n / 10; + std::size_t down = n / 10; assert(dr::mp::use_sycl()); dr::views::csr_matrix_view local_data; local_data = dr::generate_band_csr(n, up, down); @@ -212,35 +212,49 @@ static void Gemv_Reference(benchmark::State &state) { auto policy = oneapi::dpl::execution::make_device_policy(q); auto val_ptr = sycl::malloc_device(nnz_count, q); auto col_ptr = sycl::malloc_device(nnz_count, q); - auto row_ptr = sycl::malloc_device(band_shape[0], q); - std::vector b(band_shape[1]); + auto row_ptr = sycl::malloc_device((band_shape[0] + 1), q); + std::vector b; for (auto i = 0; i < band_shape[1]; i++) { b.push_back(i); } + double* elems = new double[band_shape[0]]; auto input = sycl::malloc_device(band_shape[1], q); auto output = sycl::malloc_device(band_shape[0], q); + // for (int i = 0; i < band_shape[0]; i++) { + // fmt::print("{} {}\n", i, local_data.rowptr_data()[i]); + // } + q.memcpy(val_ptr, local_data.values_data(), nnz_count * sizeof(double)).wait(); + q.memcpy(col_ptr, local_data.colind_data(), nnz_count * sizeof(long)).wait(); + q.memcpy(row_ptr, local_data.rowptr_data(), (band_shape[0] + 1) * sizeof(long)).wait(); + // std::copy(policy, local_data.values_data(), local_data.values_data() + nnz_count, val_ptr); + // std::copy(policy, local_data.colind_data(), local_data.colind_data() + nnz_count, col_ptr); + // std::copy(policy, local_data.rowptr_data(), local_data.rowptr_data() + band_shape[0], row_ptr); - std::copy(policy, local_data.values_data(), local_data.values_data() + nnz_count, val_ptr); - std::copy(policy, local_data.colind_data(), local_data.colind_data() + nnz_count, col_ptr); - std::copy(policy, local_data.rowptr_data(), local_data.rowptr_data() + band_shape[0], row_ptr); 
std::copy(policy, b.begin(), b.end(), input); + // for (int i = 0; i < band_shape[0]; i++) { + // fmt::print("{} {}\n", i, local_data.rowptr_data()[i + 1] - local_data.rowptr_data()[i]); + // } + for (auto _ : state) { - q.fill(output, 0, band_shape[0]).wait(); q.submit([&](auto &cgh) { cgh.parallel_for(sycl::range<1>{static_cast(band_shape[0])}, [=](auto idx) { double sum = 0; - for (auto i = row_ptr[idx]; i < row_ptr[idx + 1]; i++) { + auto start = row_ptr[idx]; + auto end = row_ptr[idx + 1]; + for (auto i = start; i < end; i++) { auto colNum = col_ptr[i]; auto matrixVal = input[colNum]; auto vectorVal = val_ptr[i]; sum += matrixVal * vectorVal; } - *(output + idx) += sum; + *(output + idx) = sum; }); }) .wait(); + q.memcpy(elems, output, band_shape[0] * sizeof(double)).wait(); } + delete[] elems; sycl::free(val_ptr, q); sycl::free(col_ptr, q); sycl::free(row_ptr, q); From 231a09af30ba2f59febee3f6d1754e36c7d8b969 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Mon, 30 Sep 2024 11:33:36 +0200 Subject: [PATCH 20/68] Fix band csr generation --- include/dr/detail/generate_random_csr.hpp | 11 ++++------- .../matrix_formats/csr_eq_distribution.hpp | 13 ------------- 2 files changed, 4 insertions(+), 20 deletions(-) diff --git a/include/dr/detail/generate_random_csr.hpp b/include/dr/detail/generate_random_csr.hpp index e99afdaf7a..116d3380a9 100644 --- a/include/dr/detail/generate_random_csr.hpp +++ b/include/dr/detail/generate_random_csr.hpp @@ -103,7 +103,7 @@ auto generate_random_csr(dr::index shape, double density = 0.01, template auto generate_band_csr(I size, std::size_t up_band = 3, std::size_t down_band = 3) { - std::size_t nnz = (1 + up_band + down_band) * size - (up_band * (up_band - 1) / 2) - (down_band * (down_band - 1) / 2); + std::size_t nnz = (1 + up_band + down_band) * size - (up_band * (up_band + 1) / 2) - (down_band * (down_band + 1) / 2); T *values = new T[nnz]; I *rowptr = new I[size + 1]; @@ -114,12 +114,9 @@ auto generate_band_csr(I size, 
std::size_t up_band = 3, std::size_t r = 0; std::size_t c = 0; for (auto i = 0; i < size; i++) { - for (auto j = i - down_band; j < i ; j++) { - if (j < 0) { - continue; - } + for (auto j = std::max(static_cast(i) - static_cast(down_band), static_cast(0)); j < i ; j++) { values[c] = 1; - colind[c] = j; + colind[c] = static_cast(j); c++; } values[c] = 1; @@ -133,7 +130,7 @@ auto generate_band_csr(I size, std::size_t up_band = 3, colind[c] = j; c++; } - rowptr[r + 1] = c + 1; + rowptr[r + 1] = c; r++; } diff --git a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp index 69a2f789ed..d7c1dd3e70 100644 --- a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp @@ -63,19 +63,6 @@ class csr_eq_distribution { auto real_segment_size = std::min(nnz_ - rank * segment_size_, segment_size_); auto local_data = rows_data_; - // dr::mp::sycl_queue().submit([&](auto& cgh) { - // cgh.parallel_for(sycl::range<1> { real_segment_size }, - // [=](auto idx) { - // auto colNum = localCols[idx]; - // auto matrixVal = vals[colNum]; - // auto vectorVal = localVals[idx]; - // auto row = - // rng::distance(std::upper_bound(local_data, - // local_data + row_size, offset + idx), local_data) - - // 1; - // *(res + row) += matrixVal * vectorVal; - // }); - // }).wait(); auto one_computation_size = (real_segment_size + max_row_size_ - 1) / max_row_size_; auto row_size = row_size_; From 6b8af49ab30b5fd0bb9a344786f0b4d3979c38e8 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Tue, 1 Oct 2024 09:56:41 +0200 Subject: [PATCH 21/68] Add support for slim matrix multiplication --- examples/mp/CMakeLists.txt | 1 + examples/mp/sparse_matrix_matrix_mul.cpp | 119 ++++++++++++++++++ include/dr/mp.hpp | 1 + include/dr/mp/algorithms/matrix/gemv.hpp | 14 ++- .../mp/containers/broadcasted_slim_matrix.hpp | 70 +++++++++++ 
.../dr/mp/containers/broadcasted_vector.hpp | 4 + .../containers/distributed_sparse_matrix.hpp | 6 +- .../matrix_formats/csr_eq_distribution.hpp | 89 +++++++------ .../matrix_formats/csr_row_distribution.hpp | 67 ++++++---- 9 files changed, 306 insertions(+), 65 deletions(-) create mode 100644 examples/mp/sparse_matrix_matrix_mul.cpp create mode 100644 include/dr/mp/containers/broadcasted_slim_matrix.hpp diff --git a/examples/mp/CMakeLists.txt b/examples/mp/CMakeLists.txt index c4af8c905c..f9e76928ae 100644 --- a/examples/mp/CMakeLists.txt +++ b/examples/mp/CMakeLists.txt @@ -28,6 +28,7 @@ add_mp_example(stencil-1d-pointer) add_mp_example(hello_world) add_mp_example(sparse_matrix) add_mp_example(sparse_benchmark) +add_mp_example(sparse_matrix_matrix_mul) if(OpenMP_FOUND) add_executable(vector-add-ref vector-add-ref.cpp) diff --git a/examples/mp/sparse_matrix_matrix_mul.cpp b/examples/mp/sparse_matrix_matrix_mul.cpp new file mode 100644 index 0000000000..e27b5f0ac1 --- /dev/null +++ b/examples/mp/sparse_matrix_matrix_mul.cpp @@ -0,0 +1,119 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#include +#include + +namespace mp = dr::mp; + +int main(int argc, char **argv) { + + if (argc != 2) { + fmt::print("usage: ./sparse_matrix [matrix market file]\n"); + return 1; + } + + std::string fname(argv[1]); +#ifdef SYCL_LANGUAGE_VERSION + mp::init(sycl::default_selector_v); +#else + mp::init(); +#endif + + dr::views::csr_matrix_view local_data; + auto root = 0; + // if (root == dr::mp::default_comm().rank()) { + local_data = dr::read_csr(fname); + // } + { + mp::distributed_sparse_matrix< + double, long, dr::mp::MpiBackend, + dr::mp::csr_eq_distribution> + m(local_data, root); + mp::distributed_sparse_matrix< + double, long, dr::mp::MpiBackend, + dr::mp::csr_row_distribution> + m_row(local_data, root); + fmt::print("{}\n", m.size()); + + auto width = 10; + std::vector res(m.shape().first * width); + std::vector 
res_row(m.shape().first * width); + std::vector base_a(m.shape().second * width); + for (int j = 0; j < width; j++) { + for (int i = 0; i < m.shape().second; i++) { + base_a[i + j * m.shape().second] = i*j + 1; + } + } + + + dr::mp::broadcasted_slim_matrix allocated_a; + allocated_a.broadcast_data(m_row.shape().second, width, 0, base_a, dr::mp::default_comm()); + m.fence(); + double total_time = 0; + auto N = 1; + gemv(0, res, m, allocated_a); // it is here to prepare sycl for work + for (int i = 0; i < N; i++) { + auto begin = std::chrono::high_resolution_clock::now(); + gemv(0, res, m, allocated_a); + auto end = std::chrono::high_resolution_clock::now(); + double duration = std::chrono::duration(end - begin).count(); + total_time += duration; + if (i % 10 == 0 && dr::mp::default_comm().rank() == 0) { + fmt::print("eq canary {}\n", duration * 1000); + } + } + if (root == dr::mp::default_comm().rank()) { + fmt::print("eq gemv time total {}\n", total_time * 1000 / N); + } + m.fence(); + total_time = 0; + gemv(0, res_row, m_row, allocated_a); + for (int i = 0; i < N; i++) { + auto begin = std::chrono::high_resolution_clock::now(); + gemv(0, res_row, m_row, allocated_a); + auto end = std::chrono::high_resolution_clock::now(); + double duration = std::chrono::duration(end - begin).count(); + total_time += duration; + if (i % 10 == 0 && dr::mp::default_comm().rank() == 0) { + fmt::print("row canary {}\n", duration * 1000); + } + } + + if (root == dr::mp::default_comm().rank()) { + fmt::print("row gemv time total {}\n", total_time * 1000 / N); + } + m_row.fence(); + + std::vector ref(m.shape().first * width); + auto res_col_len = m.shape().first; + if (dr::mp::default_comm().rank() == 0) { + for (auto a : local_data) { + auto [index, val] = a; + auto [m, n] = index; + for (int i = 0; i < width; i++) { + ref[m + i * res_col_len] += base_a[n + i * res_col_len] * val; + } + } + for (int i = 0; i < m.shape().first * width; i++) { + if (res[i] != ref[i]) { + 
fmt::print("mismatching outcome {} {} {}\n", i, res[i], ref[i]); + } + } + for (int i = 0; i < m.shape().first * width; i++) { + if (res_row[i] != ref[i]) { + fmt::print("mismatching outcome row {} {} {}\n", i, res_row[i], ref[i]); + } + } + } + allocated_a.destroy_data(); + } + + if (root == dr::mp::default_comm().rank()) { + dr::__detail::destroy_csr_matrix_view(local_data, std::allocator{}); + } + mp::finalize(); + + return 0; +} diff --git a/include/dr/mp.hpp b/include/dr/mp.hpp index 490fa182d7..f74326c35f 100644 --- a/include/dr/mp.hpp +++ b/include/dr/mp.hpp @@ -84,4 +84,5 @@ #include #include #include +#include #include diff --git a/include/dr/mp/algorithms/matrix/gemv.hpp b/include/dr/mp/algorithms/matrix/gemv.hpp index 7c9befcc58..8d40af762e 100644 --- a/include/dr/mp/algorithms/matrix/gemv.hpp +++ b/include/dr/mp/algorithms/matrix/gemv.hpp @@ -9,6 +9,7 @@ #include #include #include +#include namespace dr::mp { @@ -30,7 +31,7 @@ void gemv(int root, C &res, // } // communicator.bcast(broadcasted_b, a.shape().second * sizeof(T), root); - a.local_gemv_and_collect(root, res, b.broadcasted_data()); + a.local_gemv_and_collect(root, res, b.broadcasted_data(), 1); // alloc.deallocate(broadcasted_b, a.shape().second); // a.fence(); @@ -41,4 +42,15 @@ void gemv(int root, C &res, // } } +template C, typename Alloc, + typename Backend, typename MatDistr> + requires(vector_multiplicable) +void gemv(int root, C &res, + distributed_sparse_matrix &a, broadcasted_slim_matrix b) { + if (default_comm().rank() == root) { + assert(a.shape().first * b.width() == res.size()); + } + a.local_gemv_and_collect(root, res, b.broadcasted_data(), b.width()); +} + } // namespace dr::mp diff --git a/include/dr/mp/containers/broadcasted_slim_matrix.hpp b/include/dr/mp/containers/broadcasted_slim_matrix.hpp new file mode 100644 index 0000000000..1710a88599 --- /dev/null +++ b/include/dr/mp/containers/broadcasted_slim_matrix.hpp @@ -0,0 +1,70 @@ +// SPDX-FileCopyrightText: Intel Corporation 
+// +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +namespace dr::mp { + + +template > +class broadcasted_slim_matrix { + public: + broadcasted_slim_matrix() = default; + + void broadcast_data(std::size_t height, std::size_t width, std::size_t root, T** root_data, dr::communicator comm) { + if (_data != nullptr) { + destroy_data(); + } + _data_size = height * width; + _height = height; + _width = width; + _data = alloc.allocate(_data_size); + if (comm.rank() == root) { + for (auto i = 0; i < width; i++) { + rng::copy(root_data[i], root_data[i] + height, _data); + } + } + comm.bcast(_data, sizeof(T) * _data_size, root); + } + + template + void broadcast_data(std::size_t height, std::size_t width, std::size_t root, R root_data, dr::communicator comm) { + if (_data != nullptr) { + destroy_data(); + } + _data_size = height * width; + _height = height; + _width = width; + _data = alloc.allocate(_data_size); + if (comm.rank() == root) { + rng::copy(root_data.begin(), root_data.end(), _data); + } + comm.bcast(_data, sizeof(T) * _data_size, root); + } + + void destroy_data() { + alloc.deallocate(_data, _data_size); + _data_size = 0; + _data = nullptr; + } + + T* operator[](std::size_t index) { + return _data + _height * index; + } + + T* broadcasted_data() { + return _data; + } + auto width() { + return _width; + } + private: + T* _data = nullptr; + std::size_t _data_size = 0; + std::size_t _width = 0; + std::size_t _height = 0; + + Allocator alloc; +}; +} \ No newline at end of file diff --git a/include/dr/mp/containers/broadcasted_vector.hpp b/include/dr/mp/containers/broadcasted_vector.hpp index cbcc061cbb..106be89ee9 100644 --- a/include/dr/mp/containers/broadcasted_vector.hpp +++ b/include/dr/mp/containers/broadcasted_vector.hpp @@ -31,6 +31,10 @@ class broadcasted_vector { _data = nullptr; } + T& operator[](std::size_t index) { + return _data[index]; + } + T* broadcasted_data() { return _data; } diff --git 
a/include/dr/mp/containers/distributed_sparse_matrix.hpp b/include/dr/mp/containers/distributed_sparse_matrix.hpp index 89b8ceb367..ffc208924d 100644 --- a/include/dr/mp/containers/distributed_sparse_matrix.hpp +++ b/include/dr/mp/containers/distributed_sparse_matrix.hpp @@ -23,7 +23,7 @@ concept matrix_distibution = requires(T t, std::vector res, int *input) { template concept vector_multiplicable = requires(T t, std::vector res, T::elem_type *input) { - t.local_gemv_and_collect(int(), res, input); + t.local_gemv_and_collect(int(), res, input, 1); }; template requires(vector_multiplicable) - auto local_gemv_and_collect(std::size_t root, C &res, T* vals) const { - distribution_.local_gemv_and_collect(root, res, vals); + auto local_gemv_and_collect(std::size_t root, C &res, T* vals, std::size_t val_width) const { + distribution_.local_gemv_and_collect(root, res, vals, val_width); } private: diff --git a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp index d7c1dd3e70..e426504bc2 100644 --- a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp @@ -48,12 +48,14 @@ class csr_eq_distribution { auto shape() const { return shape_; } void fence() { rows_backend_.fence(); } - template auto local_gemv(C &res, T* vals) const { + template auto local_gemv(C &res, T* vals, std::size_t vals_width) const { auto rank = rows_backend_.getrank(); if (nnz_ <= segment_size_ * rank) { return; } + auto vals_len = shape_[1]; auto size = row_sizes_[rank]; + auto res_col_len = max_row_size_; if (dr::mp::use_sycl()) { auto localVals = dr::__detail::direct_iterator( dr::mp::local_segment(*vals_data_).begin()); @@ -66,6 +68,7 @@ class csr_eq_distribution { auto one_computation_size = (real_segment_size + max_row_size_ - 1) / max_row_size_; auto row_size = row_size_; + dr::mp::sycl_queue() .submit([&](auto &cgh) { 
cgh.parallel_for(sycl::range<1>{max_row_size_}, [=](auto idx) { @@ -77,28 +80,31 @@ class csr_eq_distribution { local_data, std::upper_bound( local_data, local_data + row_size, position) - 1); - auto row = first_row; - T sum = 0; - for (auto i = lower_bound; i < upper_bound; i++) { - while (row + 1 < row_size && - local_data[row + 1] <= offset + i) { - sycl::atomic_ref - c_ref(res[row]); - c_ref += sum; - row++; - sum = 0; - } - auto colNum = localCols[i]; - auto matrixVal = vals[colNum]; - auto vectorVal = localVals[i]; + for (auto j = 0; j < vals_width; j++) { + auto row = first_row; + T sum = 0; + + for (auto i = lower_bound; i < upper_bound; i++) { + while (row + 1 < row_size && + local_data[row + 1] <= offset + i) { + sycl::atomic_ref + c_ref(res[row + j * res_col_len]); + c_ref += sum; + row++; + sum = 0; + } + auto colNum = localCols[i] + j * vals_len; + auto matrixVal = vals[colNum]; + auto vectorVal = localVals[i]; - sum += matrixVal * vectorVal; + sum += matrixVal * vectorVal; + } + sycl::atomic_ref + c_ref(res[row + j * res_col_len]); + c_ref += sum; } - sycl::atomic_ref - c_ref(res[row]); - c_ref += sum; }); }) .wait(); @@ -115,7 +121,9 @@ class csr_eq_distribution { row_i++; current_row_position = rows_data_[row_i + 1]; } - res[row_i] += local_vals[i] * vals[local_cols[i]]; + for (int j = 0; j < vals_width; j++) { + res[row_i + j * res_col_len] += local_vals[i] * vals[local_cols[i] + j * vals_len]; + } } // fmt::print("offset, rank {} {}\n", row_offsets_[ @@ -128,46 +136,49 @@ class csr_eq_distribution { } template - auto local_gemv_and_collect(std::size_t root, C &res, T* vals) const { - assert(res.size() == shape_.first); + auto local_gemv_and_collect(std::size_t root, C &res, T* vals, std::size_t vals_width) const { + assert(res.size() == shape_.first * vals_width); __detail::allocator alloc; - auto res_alloc = alloc.allocate(max_row_size_); - for (auto i = 0; i < max_row_size_; i++) { + auto res_alloc = alloc.allocate(max_row_size_ * vals_width); + 
for (auto i = 0; i < max_row_size_ * vals_width; i++) { res_alloc[i] = 0; } - local_gemv(res_alloc, vals); + local_gemv(res_alloc, vals, vals_width); - gather_gemv_vector(root, res, res_alloc); - alloc.deallocate(res_alloc, max_row_size_); + gather_gemv_vector(root, res, res_alloc, vals_width); + alloc.deallocate(res_alloc, max_row_size_ * vals_width); } private: friend csr_eq_segment_iterator; template - void gather_gemv_vector(std::size_t root, C &res, A &partial_res) const { + void gather_gemv_vector(std::size_t root, C &res, A &partial_res, std::size_t vals_width) const { auto communicator = default_comm(); __detail::allocator alloc; if (communicator.rank() == root) { - auto gathered_res = alloc.allocate(max_row_size_ * communicator.size()); - communicator.gather(partial_res, gathered_res, max_row_size_, root); + auto gathered_res = alloc.allocate(max_row_size_ * communicator.size() * vals_width); + communicator.gather(partial_res, gathered_res, max_row_size_ * vals_width, root); rng::fill(res, 0); // auto begin = std::chrono::high_resolution_clock::now(); - for (auto i = 0; i < communicator.size(); i++) { - auto first_row = row_offsets_[i]; - auto last_row = row_offsets_[i] + row_sizes_[i]; - for (auto j = first_row; j < last_row; j++) { - res[j] += gathered_res[max_row_size_ * i + j - first_row]; + for (auto k = 0; k < vals_width; k++) { + for (auto i = 0; i < communicator.size(); i++) { + auto first_row = row_offsets_[i]; + auto last_row = row_offsets_[i] + row_sizes_[i]; + for (auto j = first_row; j < last_row; j++) { + res[j + k * shape_[1]] += gathered_res[vals_width * max_row_size_ * i + k * max_row_size_ + j - first_row]; + } } } + // auto end = std::chrono::high_resolution_clock::now(); // double duration = std::chrono::duration(end - begin).count(); // fmt::print("gather time {}\n", duration); - alloc.deallocate(gathered_res, max_row_size_ * communicator.size()); + alloc.deallocate(gathered_res, max_row_size_ * communicator.size() * vals_width); } 
else { - communicator.gather(partial_res, static_cast(nullptr), max_row_size_, + communicator.gather(partial_res, static_cast(nullptr), max_row_size_ * vals_width, root); } } diff --git a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp index aa45f24e91..393ce0cd7d 100644 --- a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp @@ -58,11 +58,12 @@ class csr_row_distribution { vals_backend_.fence(); cols_backend_.fence(); } - template auto local_gemv(C &res, T* vals) const { + template auto local_gemv(C &res, T* vals, std::size_t vals_width) const { auto rank = cols_backend_.getrank(); if (shape_[0] <= segment_size_ * rank) return; auto size = std::min(segment_size_, shape_[0] - segment_size_ * rank); + auto vals_len = shape_[1]; if (dr::mp::use_sycl()) { auto local_vals = vals_data_; auto local_cols = cols_data_; @@ -70,11 +71,11 @@ class csr_row_distribution { auto real_segment_size = std::min(nnz_ - offset, val_sizes_[rank]); auto rows_data = dr::__detail::direct_iterator( dr::mp::local_segment(*rows_data_).begin()); + auto res_col_len = segment_size_; dr::mp::sycl_queue() .submit([&](auto &cgh) { cgh.parallel_for(sycl::range<1>{size}, [=](auto idx) { std::size_t lower_bound = 0; - T sum = 0; if (rows_data[idx] > offset) { lower_bound = rows_data[idx] - offset; } @@ -82,13 +83,16 @@ class csr_row_distribution { if (idx < size - 1) { upper_bound = rows_data[idx + 1] - offset; } - for (auto i = lower_bound; i < upper_bound; i++) { - auto colNum = local_cols[i]; - auto matrixVal = vals[colNum]; - auto vectorVal = local_vals[i]; - sum += matrixVal * vectorVal; + for (auto j = 0; j < vals_width; j++) { + T sum = 0; + for (auto i = lower_bound; i < upper_bound; i++) { + auto colNum = local_cols[i]; + auto matrixVal = vals[colNum + j * vals_len]; + auto vectorVal = local_vals[i]; + sum += matrixVal * 
vectorVal; + } + *(res + idx + j * res_col_len) += sum; } - *(res + idx) += sum; }); }) .wait(); @@ -104,45 +108,64 @@ class csr_row_distribution { row_i++; current_row_position = local_rows[row_i + 1]; } - res[row_i] += vals_data_[i] * vals[cols_data_[i]]; + for (auto j = 0; j < vals_width; j++) { + res[row_i + j * segment_size_] += vals_data_[i] * vals[cols_data_[i] + j * vals_len]; + } } } } template - auto local_gemv_and_collect(std::size_t root, C &res, T* &vals) const { - assert(res.size() == shape_.first); + auto local_gemv_and_collect(std::size_t root, C &res, T* &vals, std::size_t vals_width) const { + assert(res.size() == shape_.first * vals_width); __detail::allocator alloc; - auto res_alloc = alloc.allocate(segment_size_); - for (auto i = 0; i < segment_size_; i++) { + auto res_alloc = alloc.allocate(segment_size_ * vals_width); + for (auto i = 0; i < segment_size_ * vals_width; i++) { res_alloc[i] = 0; } // auto begin = std::chrono::high_resolution_clock::now(); - local_gemv(res_alloc, vals); + local_gemv(res_alloc, vals, vals_width); // auto end = std::chrono::high_resolution_clock::now(); // double duration = std::chrono::duration(end - begin).count(); // auto size = std::min(segment_size_, shape_[0] - segment_size_ * default_comm().rank()); // fmt::print("rows gemv time {} {} {}\n", duration * 1000, size, default_comm().rank()); - gather_gemv_vector(root, res, res_alloc); - alloc.deallocate(res_alloc, segment_size_); + gather_gemv_vector(root, res, res_alloc, vals_width); + alloc.deallocate(res_alloc, segment_size_ * vals_width); } private: friend csr_row_segment_iterator; template - void gather_gemv_vector(std::size_t root, C &res, A &partial_res) const { + void gather_gemv_vector(std::size_t root, C &res, A &partial_res, std::size_t vals_width) const { auto communicator = default_comm(); __detail::allocator alloc; + if (communicator.rank() == root) { - auto scratch = alloc.allocate(segment_size_ * default_comm().size()); - 
communicator.gather(partial_res, scratch, segment_size_, root); - std::copy(scratch, scratch + shape_.first, res.begin()); - alloc.deallocate(scratch, segment_size_ * communicator.size()); + auto scratch = alloc.allocate(segment_size_ * communicator.size() * vals_width); + communicator.gather(partial_res, scratch, segment_size_ * vals_width, root); + + for (auto j = 0; j < communicator.size(); j++) { + if (j * segment_size_ >= shape_.second) { + break; + } + auto comm_segment_size = std::min(segment_size_, shape_.second - j * segment_size_); + for (auto i = 0; i < vals_width; i++) { + auto piece_start = scratch + j * vals_width * segment_size_ + i * segment_size_; + std::copy(piece_start, piece_start + comm_segment_size, res.begin() + shape_.first * i + j * segment_size_); + } + } + // for (auto i = 0; i < segment_size_ * communicator.size() * vals_width; i++) { + // fmt::print("{} {} {}\n", i, scratch[i], segment_size_); + // } + // for (auto i = 0; i < vals_width * shape_.first; i++) { + // fmt::print("{} {} {}\n", i, res[i], segment_size_); + // } + alloc.deallocate(scratch, segment_size_ * communicator.size()* vals_width); } else { - communicator.gather(partial_res, static_cast(nullptr), segment_size_, + communicator.gather(partial_res, static_cast(nullptr), segment_size_ * vals_width, root); } } From 628aa078d4d6c435d440597baa604e70d2345fd2 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Tue, 1 Oct 2024 01:01:11 -0700 Subject: [PATCH 22/68] Fix benchmark and band csr generation --- benchmarks/gbench/mp/gemv.cpp | 2 ++ include/dr/detail/generate_random_csr.hpp | 11 ++++------- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/benchmarks/gbench/mp/gemv.cpp b/benchmarks/gbench/mp/gemv.cpp index 99b9b6a034..ba4de7e856 100644 --- a/benchmarks/gbench/mp/gemv.cpp +++ b/benchmarks/gbench/mp/gemv.cpp @@ -159,6 +159,7 @@ mp::distributed_sparse_matrix< dr::mp::broadcasted_vector allocated_b; allocated_b.broadcast_data(m.shape().second, 0, b, 
dr::mp::default_comm()); + gemv(0, res, m, allocated_b); for (auto _ : state) { gemv(0, res, m, allocated_b); } @@ -190,6 +191,7 @@ mp::distributed_sparse_matrix< dr::mp::broadcasted_vector allocated_b; allocated_b.broadcast_data(m.shape().second, 0, b, dr::mp::default_comm()); + gemv(0, res, m, allocated_b); for (auto _ : state) { gemv(0, res, m, allocated_b); } diff --git a/include/dr/detail/generate_random_csr.hpp b/include/dr/detail/generate_random_csr.hpp index e99afdaf7a..116d3380a9 100644 --- a/include/dr/detail/generate_random_csr.hpp +++ b/include/dr/detail/generate_random_csr.hpp @@ -103,7 +103,7 @@ auto generate_random_csr(dr::index shape, double density = 0.01, template auto generate_band_csr(I size, std::size_t up_band = 3, std::size_t down_band = 3) { - std::size_t nnz = (1 + up_band + down_band) * size - (up_band * (up_band - 1) / 2) - (down_band * (down_band - 1) / 2); + std::size_t nnz = (1 + up_band + down_band) * size - (up_band * (up_band + 1) / 2) - (down_band * (down_band + 1) / 2); T *values = new T[nnz]; I *rowptr = new I[size + 1]; @@ -114,12 +114,9 @@ auto generate_band_csr(I size, std::size_t up_band = 3, std::size_t r = 0; std::size_t c = 0; for (auto i = 0; i < size; i++) { - for (auto j = i - down_band; j < i ; j++) { - if (j < 0) { - continue; - } + for (auto j = std::max(static_cast(i) - static_cast(down_band), static_cast(0)); j < i ; j++) { values[c] = 1; - colind[c] = j; + colind[c] = static_cast(j); c++; } values[c] = 1; @@ -133,7 +130,7 @@ auto generate_band_csr(I size, std::size_t up_band = 3, colind[c] = j; c++; } - rowptr[r + 1] = c + 1; + rowptr[r + 1] = c; r++; } From 4f12327979d7df4e3361f1f99628f3885a12b03a Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Wed, 2 Oct 2024 11:33:34 +0200 Subject: [PATCH 23/68] Add support to device based computing in distributed sparse matrix --- .../dr/mp/containers/broadcasted_vector.hpp | 7 ++++- .../matrix_formats/csr_eq_distribution.hpp | 23 +++++++++++--- 
.../matrix_formats/csr_row_distribution.hpp | 31 ++++++++++++++++--- test/gtest/mp/CMakeLists.txt | 2 +- 4 files changed, 53 insertions(+), 10 deletions(-) diff --git a/include/dr/mp/containers/broadcasted_vector.hpp b/include/dr/mp/containers/broadcasted_vector.hpp index 106be89ee9..70124626eb 100644 --- a/include/dr/mp/containers/broadcasted_vector.hpp +++ b/include/dr/mp/containers/broadcasted_vector.hpp @@ -20,7 +20,12 @@ class broadcasted_vector { _data_size = data_size; _data = alloc.allocate(_data_size); if (comm.rank() == root) { - rng::copy(root_data.begin(), root_data.end(), _data); + if (use_sycl()) { + __detail::sycl_copy(std::to_address(root_data.begin()), std::to_address(root_data.end()), _data); + } + else { + rng::copy(root_data.begin(), root_data.end(), _data); + } } comm.bcast(_data, sizeof(T) * _data_size, root); } diff --git a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp index e426504bc2..50d5940aaa 100644 --- a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp @@ -140,12 +140,14 @@ class csr_eq_distribution { assert(res.size() == shape_.first * vals_width); __detail::allocator alloc; auto res_alloc = alloc.allocate(max_row_size_ * vals_width); - for (auto i = 0; i < max_row_size_ * vals_width; i++) { - res_alloc[i] = 0; + if (use_sycl()) { + sycl_queue().fill(res_alloc, 0, max_row_size_ * vals_width); + } + else { + std::fill(res_alloc, res_alloc + max_row_size_ * vals_width, 0); } local_gemv(res_alloc, vals, vals_width); - gather_gemv_vector(root, res, res_alloc, vals_width); alloc.deallocate(res_alloc, max_row_size_ * vals_width); } @@ -160,7 +162,17 @@ class csr_eq_distribution { if (communicator.rank() == root) { auto gathered_res = alloc.allocate(max_row_size_ * communicator.size() * vals_width); communicator.gather(partial_res, gathered_res, max_row_size_ * vals_width, 
root); + T* gathered_res_host; + + if (use_sycl()) { + gathered_res_host = new T[max_row_size_ * communicator.size() * vals_width]; + __detail::sycl_copy(gathered_res, gathered_res_host, max_row_size_ * communicator.size() * vals_width); + } + else { + gathered_res_host = gathered_res; + } rng::fill(res, 0); + // auto begin = std::chrono::high_resolution_clock::now(); for (auto k = 0; k < vals_width; k++) { @@ -168,7 +180,7 @@ class csr_eq_distribution { auto first_row = row_offsets_[i]; auto last_row = row_offsets_[i] + row_sizes_[i]; for (auto j = first_row; j < last_row; j++) { - res[j + k * shape_[1]] += gathered_res[vals_width * max_row_size_ * i + k * max_row_size_ + j - first_row]; + res[j + k * shape_[1]] += gathered_res_host[vals_width * max_row_size_ * i + k * max_row_size_ + j - first_row]; } } } @@ -176,6 +188,9 @@ class csr_eq_distribution { // auto end = std::chrono::high_resolution_clock::now(); // double duration = std::chrono::duration(end - begin).count(); // fmt::print("gather time {}\n", duration); + if (use_sycl()) { + delete[] gathered_res_host; + } alloc.deallocate(gathered_res, max_row_size_ * communicator.size() * vals_width); } else { communicator.gather(partial_res, static_cast(nullptr), max_row_size_ * vals_width, diff --git a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp index 393ce0cd7d..3302fb3dff 100644 --- a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp @@ -72,6 +72,8 @@ class csr_row_distribution { auto rows_data = dr::__detail::direct_iterator( dr::mp::local_segment(*rows_data_).begin()); auto res_col_len = segment_size_; + + auto begin = std::chrono::high_resolution_clock::now(); dr::mp::sycl_queue() .submit([&](auto &cgh) { cgh.parallel_for(sycl::range<1>{size}, [=](auto idx) { @@ -96,6 +98,9 @@ class csr_row_distribution { }); }) .wait(); + auto end = 
std::chrono::high_resolution_clock::now(); + double duration = std::chrono::duration(end - begin).count() * 1000; + fmt::print("timeDuration b: {} {} {}\n", duration, size, real_segment_size * vals_width); } else { auto local_rows = dr::mp::local_segment(*rows_data_); auto val_count = val_sizes_[rank]; @@ -120,10 +125,13 @@ class csr_row_distribution { assert(res.size() == shape_.first * vals_width); __detail::allocator alloc; auto res_alloc = alloc.allocate(segment_size_ * vals_width); - for (auto i = 0; i < segment_size_ * vals_width; i++) { - res_alloc[i] = 0; + if (use_sycl()) { + sycl_queue().fill(res_alloc, 0, segment_size_ * vals_width); } - + else { + std::fill(res_alloc, res_alloc + segment_size_ * vals_width, 0); + } + // auto begin = std::chrono::high_resolution_clock::now(); local_gemv(res_alloc, vals, vals_width); // auto end = std::chrono::high_resolution_clock::now(); @@ -152,9 +160,23 @@ class csr_row_distribution { break; } auto comm_segment_size = std::min(segment_size_, shape_.second - j * segment_size_); + T* temp = nullptr; + if (use_sycl()) { + temp = new T[res.size()]; + } for (auto i = 0; i < vals_width; i++) { auto piece_start = scratch + j * vals_width * segment_size_ + i * segment_size_; - std::copy(piece_start, piece_start + comm_segment_size, res.begin() + shape_.first * i + j * segment_size_); + + if (use_sycl()) { + __detail::sycl_copy(piece_start, temp + shape_.first * i + j * segment_size_, comm_segment_size); + } + else { + std::copy(piece_start, piece_start + comm_segment_size, res.begin() + shape_.first * i + j * segment_size_); + } + } + if (use_sycl()) { + std::copy(temp, temp + res.size(), res.begin()); + delete[] temp; } } // for (auto i = 0; i < segment_size_ * communicator.size() * vals_width; i++) { @@ -301,5 +323,6 @@ class csr_row_distribution { std::size_t nnz_; std::vector> segments_; std::shared_ptr> rows_data_; + }; } // namespace dr::mp diff --git a/test/gtest/mp/CMakeLists.txt b/test/gtest/mp/CMakeLists.txt index 
cef65af431..df60c99372 100644 --- a/test/gtest/mp/CMakeLists.txt +++ b/test/gtest/mp/CMakeLists.txt @@ -57,7 +57,7 @@ add_executable( add_executable(mp-quick-test mp-tests.cpp - ../common/equal.cpp + copy.cpp ) # cmake-format: on From 71bd33658414bc2b46f0f5e91e7c3479c77d2a1d Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Wed, 2 Oct 2024 11:57:03 +0200 Subject: [PATCH 24/68] add broadcasted slim matrix device memory support --- .../dr/mp/containers/broadcasted_slim_matrix.hpp | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/include/dr/mp/containers/broadcasted_slim_matrix.hpp b/include/dr/mp/containers/broadcasted_slim_matrix.hpp index 1710a88599..e3953c9b5b 100644 --- a/include/dr/mp/containers/broadcasted_slim_matrix.hpp +++ b/include/dr/mp/containers/broadcasted_slim_matrix.hpp @@ -22,7 +22,12 @@ class broadcasted_slim_matrix { _data = alloc.allocate(_data_size); if (comm.rank() == root) { for (auto i = 0; i < width; i++) { - rng::copy(root_data[i], root_data[i] + height, _data); + if (use_sycl()) { + __detail::sycl_copy(root_data[i], root_data[i] + height, _data + height * i); + } + else { + rng::copy(root_data[i], root_data[i] + height, _data + height * i); + } } } comm.bcast(_data, sizeof(T) * _data_size, root); @@ -38,7 +43,12 @@ class broadcasted_slim_matrix { _width = width; _data = alloc.allocate(_data_size); if (comm.rank() == root) { - rng::copy(root_data.begin(), root_data.end(), _data); + if (use_sycl()) { + __detail::sycl_copy(std::to_address(root_data.begin()), std::to_address(root_data.end()), _data); + } + else { + rng::copy(root_data.begin(), root_data.end(), _data); + } } comm.bcast(_data, sizeof(T) * _data_size, root); } From 6f9692987ecef6e40e9cedb222817b791e7e9514 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Mon, 7 Oct 2024 09:10:01 +0200 Subject: [PATCH 25/68] Fix issue with inconsistent timing when using mp gemv --- .../containers/matrix_formats/csr_eq_distribution.hpp | 6 +++++- 
.../containers/matrix_formats/csr_row_distribution.hpp | 10 +++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp index 50d5940aaa..3c8afb1fdb 100644 --- a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp @@ -69,6 +69,7 @@ class csr_eq_distribution { (real_segment_size + max_row_size_ - 1) / max_row_size_; auto row_size = row_size_; + // auto begin = std::chrono::high_resolution_clock::now(); dr::mp::sycl_queue() .submit([&](auto &cgh) { cgh.parallel_for(sycl::range<1>{max_row_size_}, [=](auto idx) { @@ -108,6 +109,9 @@ class csr_eq_distribution { }); }) .wait(); + // auto end = std::chrono::high_resolution_clock::now(); + // double duration = std::chrono::duration(end - begin).count() * 1000; + // fmt::print("timeDuration eq: {} {} {} {}\n", duration, size, real_segment_size * vals_width, rank); } else { auto row_i = -1; auto position = segment_size_ * rank; @@ -141,7 +145,7 @@ class csr_eq_distribution { __detail::allocator alloc; auto res_alloc = alloc.allocate(max_row_size_ * vals_width); if (use_sycl()) { - sycl_queue().fill(res_alloc, 0, max_row_size_ * vals_width); + sycl_queue().fill(res_alloc, 0, max_row_size_ * vals_width).wait(); } else { std::fill(res_alloc, res_alloc + max_row_size_ * vals_width, 0); diff --git a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp index 3302fb3dff..708e49d9e0 100644 --- a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp @@ -73,7 +73,7 @@ class csr_row_distribution { dr::mp::local_segment(*rows_data_).begin()); auto res_col_len = segment_size_; - auto begin = std::chrono::high_resolution_clock::now(); + // auto begin = 
std::chrono::high_resolution_clock::now(); dr::mp::sycl_queue() .submit([&](auto &cgh) { cgh.parallel_for(sycl::range<1>{size}, [=](auto idx) { @@ -98,9 +98,9 @@ class csr_row_distribution { }); }) .wait(); - auto end = std::chrono::high_resolution_clock::now(); - double duration = std::chrono::duration(end - begin).count() * 1000; - fmt::print("timeDuration b: {} {} {}\n", duration, size, real_segment_size * vals_width); + // auto end = std::chrono::high_resolution_clock::now(); + // double duration = std::chrono::duration(end - begin).count() * 1000; + // fmt::print("timeDuration row: {} {} {} {}\n", duration, size, real_segment_size * vals_width, rank); } else { auto local_rows = dr::mp::local_segment(*rows_data_); auto val_count = val_sizes_[rank]; @@ -126,7 +126,7 @@ class csr_row_distribution { __detail::allocator alloc; auto res_alloc = alloc.allocate(segment_size_ * vals_width); if (use_sycl()) { - sycl_queue().fill(res_alloc, 0, segment_size_ * vals_width); + sycl_queue().fill(res_alloc, 0, segment_size_ * vals_width).wait(); } else { std::fill(res_alloc, res_alloc + segment_size_ * vals_width, 0); From 08a2247e0357bc99eb7ab984dfbe531462cce246 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Tue, 8 Oct 2024 03:37:00 -0700 Subject: [PATCH 26/68] Some fixes to sparse matrixes --- benchmarks/gbench/mp/gemv.cpp | 93 ++++++++++--------- examples/mp/sparse_matrix.cpp | 4 +- examples/mp/sparse_matrix_matrix_mul.cpp | 6 +- .../dr/mp/containers/distributed_vector.hpp | 4 +- .../matrix_formats/csr_eq_distribution.hpp | 13 +-- .../matrix_formats/csr_row_distribution.hpp | 33 ++++--- 6 files changed, 79 insertions(+), 74 deletions(-) diff --git a/benchmarks/gbench/mp/gemv.cpp b/benchmarks/gbench/mp/gemv.cpp index ba4de7e856..8bdac55aff 100644 --- a/benchmarks/gbench/mp/gemv.cpp +++ b/benchmarks/gbench/mp/gemv.cpp @@ -137,10 +137,12 @@ mp::distributed_sparse_matrix< static void GemvEq_DR(benchmark::State &state) { // fft requires usm shared allocation - std::size_t n 
= default_vector_size; + std::size_t n = default_vector_size / 2; std::size_t up = n / 10; std::size_t down = n / 10; + std::size_t width = 8; assert(dr::mp::use_sycl()); + assert(dr::mp::sycl_mem_kind() == sycl::usm::alloc::device); dr::views::csr_matrix_view local_data; local_data = dr::generate_band_csr(n, up, down); @@ -148,20 +150,20 @@ static void GemvEq_DR(benchmark::State &state) { mp::distributed_sparse_matrix< double, long, dr::mp::MpiBackend, dr::mp::csr_eq_distribution> - m(local_data, 0); - std::vector b; - b.reserve(m.shape().second); - std::vector res(m.shape().first); - for (auto i = 0; i < m.shape().second; i++) { - b.push_back(i); - } - - dr::mp::broadcasted_vector allocated_b; - allocated_b.broadcast_data(m.shape().second, 0, b, dr::mp::default_comm()); + m(local_data, 0); + std::vector base_a(n * width); + for (int j = 0; j < width; j++) { + for (int i = 0; i < n; i++) { + base_a[i + j * n] = i*j + 1; + } + } + dr::mp::broadcasted_slim_matrix allocated_a; + allocated_a.broadcast_data(n, width, 0, base_a, dr::mp::default_comm()); - gemv(0, res, m, allocated_b); + std::vector res(m.shape().first * width); + gemv(0, res, m, allocated_a); for (auto _ : state) { - gemv(0, res, m, allocated_b); + gemv(0, res, m, allocated_a); } } @@ -169,31 +171,33 @@ DR_BENCHMARK(GemvEq_DR); static void GemvRow_DR(benchmark::State &state) { // fft requires usm shared allocation - std::size_t n = default_vector_size; + std::size_t n = default_vector_size / 2; std::size_t up = n / 10; std::size_t down = n / 10; + std::size_t width = 8; assert(dr::mp::use_sycl()); + assert(dr::mp::sycl_mem_kind() == sycl::usm::alloc::device); dr::views::csr_matrix_view local_data; local_data = dr::generate_band_csr(n, up, down); -mp::distributed_sparse_matrix< + mp::distributed_sparse_matrix< double, long, dr::mp::MpiBackend, dr::mp::csr_row_distribution> - m(local_data, 0); - std::vector b; - b.reserve(m.shape().second); - std::vector res(m.shape().first); - for (auto i = 0; i < 
m.shape().second; i++) { - b.push_back(i); - } - - dr::mp::broadcasted_vector allocated_b; - allocated_b.broadcast_data(m.shape().second, 0, b, dr::mp::default_comm()); + m(local_data, 0); + std::vector base_a(n * width); + for (int j = 0; j < width; j++) { + for (int i = 0; i < n; i++) { + base_a[i + j * n] = i*j + 1; + } + } + dr::mp::broadcasted_slim_matrix allocated_a; + allocated_a.broadcast_data(n, width, 0, base_a, dr::mp::default_comm()); - gemv(0, res, m, allocated_b); + std::vector res(m.shape().first * width); + gemv(0, res, m, allocated_a); for (auto _ : state) { - gemv(0, res, m, allocated_b); + gemv(0, res, m, allocated_a); } } @@ -202,10 +206,12 @@ DR_BENCHMARK(GemvRow_DR); static void Gemv_Reference(benchmark::State &state) { - std::size_t n = default_vector_size; + std::size_t n = default_vector_size / 2; std::size_t up = n / 10; std::size_t down = n / 10; + std::size_t width = 8; assert(dr::mp::use_sycl()); + assert(dr::mp::sycl_mem_kind() == sycl::usm::alloc::device); dr::views::csr_matrix_view local_data; local_data = dr::generate_band_csr(n, up, down); auto nnz_count = local_data.size(); @@ -216,11 +222,11 @@ static void Gemv_Reference(benchmark::State &state) { auto col_ptr = sycl::malloc_device(nnz_count, q); auto row_ptr = sycl::malloc_device((band_shape[0] + 1), q); std::vector b; - for (auto i = 0; i < band_shape[1]; i++) { + for (auto i = 0; i < band_shape[1] * width; i++) { b.push_back(i); } double* elems = new double[band_shape[0]]; - auto input = sycl::malloc_device(band_shape[1], q); + auto input = sycl::malloc_device(band_shape[1] * width, q); auto output = sycl::malloc_device(band_shape[0], q); // for (int i = 0; i < band_shape[0]; i++) { // fmt::print("{} {}\n", i, local_data.rowptr_data()[i]); @@ -239,21 +245,20 @@ static void Gemv_Reference(benchmark::State &state) { for (auto _ : state) { - q.submit([&](auto &cgh) { - cgh.parallel_for(sycl::range<1>{static_cast(band_shape[0])}, [=](auto idx) { - double sum = 0; - auto start = 
row_ptr[idx]; - auto end = row_ptr[idx + 1]; - for (auto i = start; i < end; i++) { - auto colNum = col_ptr[i]; - auto matrixVal = input[colNum]; - auto vectorVal = val_ptr[i]; - sum += matrixVal * vectorVal; + dr::__detail::parallel_for_workaround(q, sycl::range<1>{static_cast(band_shape[0])}, [=](auto idx) { + for (auto j = 0; j < width; j++) { + double sum = 0; + auto start = row_ptr[idx]; + auto end = row_ptr[idx + 1]; + for (auto i = start; i < end; i++) { + auto colNum = col_ptr[i]; + auto vectorVal = input[colNum + j * band_shape[1]]; + auto matrixVal = val_ptr[i]; + sum += matrixVal * vectorVal; + } + *(output + idx) = sum; } - *(output + idx) = sum; - }); - }) - .wait(); + }).wait(); q.memcpy(elems, output, band_shape[0] * sizeof(double)).wait(); } delete[] elems; diff --git a/examples/mp/sparse_matrix.cpp b/examples/mp/sparse_matrix.cpp index e3163212d4..123ea9d2d7 100644 --- a/examples/mp/sparse_matrix.cpp +++ b/examples/mp/sparse_matrix.cpp @@ -114,12 +114,12 @@ int main(int argc, char **argv) { } for (int i = 0; i < m.shape().first; i++) { if (res[i] != ref[i]) { - fmt::print("mismatching outcome {} {}\n", res[i], ref[i]); + fmt::print("mismatching outcome {} {} {}\n", i, res[i], ref[i]); } } for (int i = 0; i < m.shape().first; i++) { if (res_row[i] != ref[i]) { - fmt::print("mismatching outcome row {} {}\n", res_row[i], ref[i]); + fmt::print("mismatching outcome row {} {} {}\n", i, res_row[i], ref[i]); } } } diff --git a/examples/mp/sparse_matrix_matrix_mul.cpp b/examples/mp/sparse_matrix_matrix_mul.cpp index e27b5f0ac1..738cc448a4 100644 --- a/examples/mp/sparse_matrix_matrix_mul.cpp +++ b/examples/mp/sparse_matrix_matrix_mul.cpp @@ -23,6 +23,10 @@ int main(int argc, char **argv) { dr::views::csr_matrix_view local_data; auto root = 0; + // auto n = 50000; + // std::size_t up = n / 10; + // std::size_t down = n / 10; + // local_data = dr::generate_band_csr(n, up, down); // if (root == dr::mp::default_comm().rank()) { local_data = dr::read_csr(fname); 
// } @@ -37,7 +41,7 @@ int main(int argc, char **argv) { m_row(local_data, root); fmt::print("{}\n", m.size()); - auto width = 10; + auto width = 8; std::vector res(m.shape().first * width); std::vector res_row(m.shape().first * width); std::vector base_a(m.shape().second * width); diff --git a/include/dr/mp/containers/distributed_vector.hpp b/include/dr/mp/containers/distributed_vector.hpp index 822a1b1597..2f65185180 100644 --- a/include/dr/mp/containers/distributed_vector.hpp +++ b/include/dr/mp/containers/distributed_vector.hpp @@ -79,7 +79,7 @@ class MpiBackend { std::size_t getrank() const { return win_.communicator().rank(); } - void fence() { win_.fence(); } + void fence() const { win_.fence(); } }; #ifdef DRISHMEM @@ -127,7 +127,7 @@ class IshmemBackend { return my_process_segment_index; } - void fence() { + void fence() const { // TODO: to have locality use ishmemx_fence_work_group ishmem_fence(); } diff --git a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp index 3c8afb1fdb..05902f08bb 100644 --- a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp @@ -46,7 +46,7 @@ class csr_eq_distribution { auto segments() const { return rng::views::all(segments_); } auto nnz() const { return nnz_; } auto shape() const { return shape_; } - void fence() { rows_backend_.fence(); } + void fence() const { rows_backend_.fence(); } template auto local_gemv(C &res, T* vals, std::size_t vals_width) const { auto rank = rows_backend_.getrank(); @@ -68,11 +68,9 @@ class csr_eq_distribution { auto one_computation_size = (real_segment_size + max_row_size_ - 1) / max_row_size_; auto row_size = row_size_; - // auto begin = std::chrono::high_resolution_clock::now(); - dr::mp::sycl_queue() - .submit([&](auto &cgh) { - cgh.parallel_for(sycl::range<1>{max_row_size_}, [=](auto idx) { + 
dr::__detail::parallel_for_workaround(dr::mp::sycl_queue(), sycl::range<1>{max_row_size_}, + [=](auto idx) { std::size_t lower_bound = one_computation_size * idx; std::size_t upper_bound = std::min(one_computation_size * (idx + 1), real_segment_size); @@ -106,9 +104,7 @@ class csr_eq_distribution { c_ref(res[row + j * res_col_len]); c_ref += sum; } - }); - }) - .wait(); + }).wait(); // auto end = std::chrono::high_resolution_clock::now(); // double duration = std::chrono::duration(end - begin).count() * 1000; // fmt::print("timeDuration eq: {} {} {} {}\n", duration, size, real_segment_size * vals_width, rank); @@ -153,6 +149,7 @@ class csr_eq_distribution { local_gemv(res_alloc, vals, vals_width); gather_gemv_vector(root, res, res_alloc, vals_width); + fence(); alloc.deallocate(res_alloc, max_row_size_ * vals_width); } diff --git a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp index 708e49d9e0..17c3d5b60d 100644 --- a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp @@ -54,7 +54,7 @@ class csr_row_distribution { auto segments() const { return rng::views::all(segments_); } auto nnz() const { return nnz_; } auto shape() const { return shape_; } - void fence() { + void fence() const { vals_backend_.fence(); cols_backend_.fence(); } @@ -74,9 +74,8 @@ class csr_row_distribution { auto res_col_len = segment_size_; // auto begin = std::chrono::high_resolution_clock::now(); - dr::mp::sycl_queue() - .submit([&](auto &cgh) { - cgh.parallel_for(sycl::range<1>{size}, [=](auto idx) { + dr::__detail::parallel_for_workaround(dr::mp::sycl_queue(), sycl::range<1>{size}, + [=](auto idx) { std::size_t lower_bound = 0; if (rows_data[idx] > offset) { lower_bound = rows_data[idx] - offset; @@ -95,12 +94,11 @@ class csr_row_distribution { } *(res + idx + j * res_col_len) += sum; } - }); - }) - .wait(); + } + ).wait(); 
// auto end = std::chrono::high_resolution_clock::now(); // double duration = std::chrono::duration(end - begin).count() * 1000; - // fmt::print("timeDuration row: {} {} {} {}\n", duration, size, real_segment_size * vals_width, rank); + // fmt::print("timeDuration b: {} {} {}\n", duration, size, real_segment_size * vals_width); } else { auto local_rows = dr::mp::local_segment(*rows_data_); auto val_count = val_sizes_[rank]; @@ -140,6 +138,7 @@ class csr_row_distribution { // fmt::print("rows gemv time {} {} {}\n", duration * 1000, size, default_comm().rank()); gather_gemv_vector(root, res, res_alloc, vals_width); + fence(); alloc.deallocate(res_alloc, segment_size_ * vals_width); } @@ -154,16 +153,16 @@ class csr_row_distribution { if (communicator.rank() == root) { auto scratch = alloc.allocate(segment_size_ * communicator.size() * vals_width); communicator.gather(partial_res, scratch, segment_size_ * vals_width, root); - + T* temp = nullptr; + if (use_sycl()) { + temp = new T[res.size()]; + } for (auto j = 0; j < communicator.size(); j++) { if (j * segment_size_ >= shape_.second) { break; } auto comm_segment_size = std::min(segment_size_, shape_.second - j * segment_size_); - T* temp = nullptr; - if (use_sycl()) { - temp = new T[res.size()]; - } + for (auto i = 0; i < vals_width; i++) { auto piece_start = scratch + j * vals_width * segment_size_ + i * segment_size_; @@ -174,10 +173,10 @@ class csr_row_distribution { std::copy(piece_start, piece_start + comm_segment_size, res.begin() + shape_.first * i + j * segment_size_); } } - if (use_sycl()) { - std::copy(temp, temp + res.size(), res.begin()); - delete[] temp; - } + } + if (use_sycl()) { + std::copy(temp, temp + res.size(), res.begin()); + delete[] temp; } // for (auto i = 0; i < segment_size_ * communicator.size() * vals_width; i++) { // fmt::print("{} {} {}\n", i, scratch[i], segment_size_); From 6a4bd3053ed7cd89a44e571e1df18a24646acb9c Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Wed, 9 Oct 2024 
02:25:39 -0700 Subject: [PATCH 27/68] improve work division in csr eq distribution --- .../dr/mp/containers/matrix_formats/csr_eq_distribution.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp index 05902f08bb..e9ff7d00dd 100644 --- a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp @@ -65,11 +65,12 @@ class csr_eq_distribution { auto real_segment_size = std::min(nnz_ - rank * segment_size_, segment_size_); auto local_data = rows_data_; + auto division = std::max(real_segment_size / 100, max_row_size_ * 10); auto one_computation_size = - (real_segment_size + max_row_size_ - 1) / max_row_size_; + (real_segment_size + division - 1) / division; auto row_size = row_size_; // auto begin = std::chrono::high_resolution_clock::now(); - dr::__detail::parallel_for_workaround(dr::mp::sycl_queue(), sycl::range<1>{max_row_size_}, + dr::__detail::parallel_for_workaround(dr::mp::sycl_queue(), sycl::range<1>{division}, [=](auto idx) { std::size_t lower_bound = one_computation_size * idx; std::size_t upper_bound = From f93961b86f5bf29704063e2c5dacfeaa2e647e71 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Wed, 16 Oct 2024 00:50:24 -0700 Subject: [PATCH 28/68] Add better work distribution to csr_row_distribution and fix distributed_vector rma memory access --- include/dr/mp/algorithms/matrix/gemv.hpp | 1 + .../dr/mp/containers/broadcasted_vector.hpp | 4 ++ .../dr/mp/containers/distributed_vector.hpp | 15 ++++++- .../matrix_formats/csr_eq_distribution.hpp | 1 + .../matrix_formats/csr_row_distribution.hpp | 40 ++++++++++++------- 5 files changed, 46 insertions(+), 15 deletions(-) diff --git a/include/dr/mp/algorithms/matrix/gemv.hpp b/include/dr/mp/algorithms/matrix/gemv.hpp index 8d40af762e..a9efb38934 100644 --- 
a/include/dr/mp/algorithms/matrix/gemv.hpp +++ b/include/dr/mp/algorithms/matrix/gemv.hpp @@ -21,6 +21,7 @@ void gemv(int root, C &res, distributed_sparse_matrix &a, broadcasted_vector b) { if (default_comm().rank() == root) { assert(a.shape().first == res.size()); + assert(a.shape().second == b.size()); } // copy b to all machines // auto communicator = default_comm(); diff --git a/include/dr/mp/containers/broadcasted_vector.hpp b/include/dr/mp/containers/broadcasted_vector.hpp index 70124626eb..abe9c09a12 100644 --- a/include/dr/mp/containers/broadcasted_vector.hpp +++ b/include/dr/mp/containers/broadcasted_vector.hpp @@ -43,6 +43,10 @@ class broadcasted_vector { T* broadcasted_data() { return _data; } + + auto size() { + return _data_size; + } private: T* _data = nullptr; std::size_t _data_size = 0; diff --git a/include/dr/mp/containers/distributed_vector.hpp b/include/dr/mp/containers/distributed_vector.hpp index 2f65185180..19b91b3510 100644 --- a/include/dr/mp/containers/distributed_vector.hpp +++ b/include/dr/mp/containers/distributed_vector.hpp @@ -63,8 +63,21 @@ class MpiBackend { #if (MPI_VERSION >= 4) || \ (defined(I_MPI_NUMVERSION) && (I_MPI_NUMVERSION > 20211200000)) + if (mp::use_sycl()) { + // 32-bit API inside for sycl based buffers + for (std::size_t remainder = datalen, off = 0UL; remainder > 0;) { + std::size_t s = std::min(remainder, (std::size_t)INT_MAX); + DRLOG("{}:{} win_.put {} bytes at off {}, dst offset {}", + default_comm().rank(), __LINE__, s, off, offset + off); + win_.put((uint8_t *)src + off, s, segment_index, offset + off); + off += s; + remainder -= s; + } + } + else { // 64-bit API inside - win_.put(src, datalen, segment_index, offset); + win_.put(src, datalen, segment_index, offset); + } #else for (std::size_t remainder = datalen, off = 0UL; remainder > 0;) { std::size_t s = std::min(remainder, (std::size_t)INT_MAX); diff --git a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp 
b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp index e9ff7d00dd..e36347b258 100644 --- a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp @@ -69,6 +69,7 @@ class csr_eq_distribution { auto one_computation_size = (real_segment_size + division - 1) / division; auto row_size = row_size_; + // fmt::print("{} {} {}\n", division, real_segment_size / 100, max_row_size_ * 10); // auto begin = std::chrono::high_resolution_clock::now(); dr::__detail::parallel_for_workaround(dr::mp::sycl_queue(), sycl::range<1>{division}, [=](auto idx) { diff --git a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp index 17c3d5b60d..6525db7780 100644 --- a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp @@ -72,10 +72,19 @@ class csr_row_distribution { auto rows_data = dr::__detail::direct_iterator( dr::mp::local_segment(*rows_data_).begin()); auto res_col_len = segment_size_; - + std::size_t wg = 32; + while (vals_width * size * wg > INT_MAX) { + // this check is necessary, because sycl does not permit ranges exceeding integer limit + wg /= 2; + } + assert(wg > 0); // auto begin = std::chrono::high_resolution_clock::now(); - dr::__detail::parallel_for_workaround(dr::mp::sycl_queue(), sycl::range<1>{size}, - [=](auto idx) { + dr::mp::sycl_queue().submit([&](auto &&h) { + h.parallel_for(sycl::nd_range<1>(vals_width * size * wg, wg), [=](auto item) { + auto input_j = item.get_group(0) / size; + auto idx = item.get_group(0) % size; + auto local_id = item.get_local_id(); + auto group_size = item.get_local_range(0); std::size_t lower_bound = 0; if (rows_data[idx] > offset) { lower_bound = rows_data[idx] - offset; @@ -84,18 +93,21 @@ class csr_row_distribution { if (idx < size - 1) { upper_bound = rows_data[idx + 1] - offset; } - 
for (auto j = 0; j < vals_width; j++) { - T sum = 0; - for (auto i = lower_bound; i < upper_bound; i++) { - auto colNum = local_cols[i]; - auto matrixVal = vals[colNum + j * vals_len]; - auto vectorVal = local_vals[i]; - sum += matrixVal * vectorVal; - } - *(res + idx + j * res_col_len) += sum; + T sum = 0; + for (auto i = lower_bound + local_id; i < upper_bound; i += group_size) { + auto colNum = local_cols[i]; + auto matrixVal = vals[colNum + input_j * vals_len]; + auto vectorVal = local_vals[i]; + sum += matrixVal * vectorVal; } - } - ).wait(); + + sycl::atomic_ref + c_ref(res[idx + input_j * res_col_len]); + c_ref += sum; + }); + + }).wait(); // auto end = std::chrono::high_resolution_clock::now(); // double duration = std::chrono::duration(end - begin).count() * 1000; // fmt::print("timeDuration b: {} {} {}\n", duration, size, real_segment_size * vals_width); From e42152384604118f3a4d7f1186d852616a2fcfd9 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Fri, 18 Oct 2024 04:25:58 -0700 Subject: [PATCH 29/68] improve performance on less dense matrices and allow broadcasting bigger matrices --- include/dr/detail/communicator.hpp | 7 +++ .../mp/containers/broadcasted_slim_matrix.hpp | 10 ++- .../matrix_formats/csr_eq_distribution.hpp | 63 ++++++++++++------- 3 files changed, 58 insertions(+), 22 deletions(-) diff --git a/include/dr/detail/communicator.hpp b/include/dr/detail/communicator.hpp index 74dabe05cf..1688ae31ca 100644 --- a/include/dr/detail/communicator.hpp +++ b/include/dr/detail/communicator.hpp @@ -168,6 +168,13 @@ class communicator { irecv(data, src_rank, 0, request); } + void wait(MPI_Request request) const { + MPI_Wait(&request, MPI_STATUS_IGNORE); + } + void waitall(std::size_t count, MPI_Request *requests) const { + MPI_Waitall(count, requests, MPI_STATUS_IGNORE); + } + template void alltoall(const R &sendr, R &recvr, std::size_t count) { alltoall(rng::data(sendr), rng::data(recvr), count); diff --git 
a/include/dr/mp/containers/broadcasted_slim_matrix.hpp b/include/dr/mp/containers/broadcasted_slim_matrix.hpp index e3953c9b5b..66a636ad7c 100644 --- a/include/dr/mp/containers/broadcasted_slim_matrix.hpp +++ b/include/dr/mp/containers/broadcasted_slim_matrix.hpp @@ -50,7 +50,15 @@ class broadcasted_slim_matrix { rng::copy(root_data.begin(), root_data.end(), _data); } } - comm.bcast(_data, sizeof(T) * _data_size, root); + auto position = 0; + auto reminder = sizeof(T) * _data_size; + while (reminder > INT_MAX) { + comm.bcast(((uint8_t*)_data) + position, INT_MAX, root); + position += INT_MAX; + reminder -= INT_MAX; + } + comm.bcast(((uint8_t*)_data) + position, reminder, root); + } void destroy_data() { diff --git a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp index e36347b258..eaebbc5dcf 100644 --- a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp @@ -55,7 +55,7 @@ class csr_eq_distribution { } auto vals_len = shape_[1]; auto size = row_sizes_[rank]; - auto res_col_len = max_row_size_; + auto res_col_len = size; if (dr::mp::use_sycl()) { auto localVals = dr::__detail::direct_iterator( dr::mp::local_segment(*vals_data_).begin()); @@ -65,7 +65,7 @@ class csr_eq_distribution { auto real_segment_size = std::min(nnz_ - rank * segment_size_, segment_size_); auto local_data = rows_data_; - auto division = std::max(real_segment_size / 100, max_row_size_ * 10); + auto division = std::max(real_segment_size / 100, total_row_size_); auto one_computation_size = (real_segment_size + division - 1) / division; auto row_size = row_size_; @@ -141,18 +141,18 @@ class csr_eq_distribution { auto local_gemv_and_collect(std::size_t root, C &res, T* vals, std::size_t vals_width) const { assert(res.size() == shape_.first * vals_width); __detail::allocator alloc; - auto res_alloc = alloc.allocate(max_row_size_ * 
vals_width); + auto res_alloc = alloc.allocate(row_size_ * vals_width); if (use_sycl()) { - sycl_queue().fill(res_alloc, 0, max_row_size_ * vals_width).wait(); + sycl_queue().fill(res_alloc, 0, row_size_ * vals_width).wait(); } else { - std::fill(res_alloc, res_alloc + max_row_size_ * vals_width, 0); + std::fill(res_alloc, res_alloc + row_size_ * vals_width, 0); } local_gemv(res_alloc, vals, vals_width); gather_gemv_vector(root, res, res_alloc, vals_width); fence(); - alloc.deallocate(res_alloc, max_row_size_ * vals_width); + alloc.deallocate(res_alloc, row_size_ * vals_width); } private: @@ -163,28 +163,48 @@ class csr_eq_distribution { auto communicator = default_comm(); __detail::allocator alloc; if (communicator.rank() == root) { - auto gathered_res = alloc.allocate(max_row_size_ * communicator.size() * vals_width); - communicator.gather(partial_res, gathered_res, max_row_size_ * vals_width, root); + auto requests = new MPI_Request[communicator.size() - 1]; + auto gathered_res = alloc.allocate(total_row_size_ * vals_width); + auto current_row = 0; + auto req_iter = 0; + for (auto i = 0; i < communicator.size(); i++) { + if (i == root) { + if (use_sycl()) { + __detail::sycl_copy(partial_res, gathered_res + current_row * vals_width, row_size_ * vals_width); + } + else { + std::copy(partial_res, partial_res + row_size_ * vals_width, gathered_res + current_row * vals_width); + } + } + + else { + communicator.irecv(gathered_res + current_row * vals_width, row_sizes_[i] * vals_width, i, requests + req_iter); + req_iter++; + } + current_row += row_sizes_[i]; + } + + communicator.waitall(communicator.size() - 1, requests); T* gathered_res_host; - if (use_sycl()) { - gathered_res_host = new T[max_row_size_ * communicator.size() * vals_width]; - __detail::sycl_copy(gathered_res, gathered_res_host, max_row_size_ * communicator.size() * vals_width); + gathered_res_host = new T[total_row_size_ * vals_width]; + __detail::sycl_copy(gathered_res, gathered_res_host, 
total_row_size_ * vals_width); } else { gathered_res_host = gathered_res; } rng::fill(res, 0); - - // auto begin = std::chrono::high_resolution_clock::now(); for (auto k = 0; k < vals_width; k++) { + current_row = 0; for (auto i = 0; i < communicator.size(); i++) { auto first_row = row_offsets_[i]; auto last_row = row_offsets_[i] + row_sizes_[i]; + auto current_row_size = row_sizes_[i]; for (auto j = first_row; j < last_row; j++) { - res[j + k * shape_[1]] += gathered_res_host[vals_width * max_row_size_ * i + k * max_row_size_ + j - first_row]; + res[j + k * shape_[1]] += gathered_res_host[vals_width * current_row + k * current_row_size + j - first_row]; } + current_row += current_row_size; } } @@ -194,10 +214,11 @@ class csr_eq_distribution { if (use_sycl()) { delete[] gathered_res_host; } - alloc.deallocate(gathered_res, max_row_size_ * communicator.size() * vals_width); + alloc.deallocate(gathered_res, total_row_size_ * communicator.size() * vals_width); } else { - communicator.gather(partial_res, static_cast(nullptr), max_row_size_ * vals_width, - root); + MPI_Request req; + communicator.isend(partial_res, row_size_ * vals_width, root, &req); + communicator.wait(req); } } @@ -254,9 +275,9 @@ class csr_eq_distribution { row_sizes_.push_back(higher_limit - lower_limit); row_information[i] = lower_limit; row_information[default_comm().size() + i] = higher_limit - lower_limit; - max_row_size_ = std::max(max_row_size_, row_sizes_.back()); + total_row_size_ = total_row_size_ + row_sizes_.back(); } - row_information[default_comm().size() * 2] = max_row_size_; + row_information[default_comm().size() * 2] = total_row_size_; default_comm().bcast(row_information, sizeof(std::size_t) * row_info_size, root); } else { @@ -266,7 +287,7 @@ class csr_eq_distribution { row_offsets_.push_back(row_information[i]); row_sizes_.push_back(row_information[default_comm().size() + i]); } - max_row_size_ = row_information[default_comm().size() * 2]; + total_row_size_ = 
row_information[default_comm().size() * 2]; } delete[] row_information; row_size_ = std::max(row_sizes_[rank], static_cast(1)); @@ -306,7 +327,7 @@ class csr_eq_distribution { std::size_t segment_size_ = 0; std::size_t row_size_ = 0; - std::size_t max_row_size_ = 0; + std::size_t total_row_size_ = 0; std::vector row_offsets_; std::vector row_sizes_; From 3fa4a68e6dab16d27efd5a270e7c3a23bef426ca Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Fri, 18 Oct 2024 05:26:14 -0700 Subject: [PATCH 30/68] Reversed change to eq distribution --- .../matrix_formats/csr_eq_distribution.hpp | 63 +++++++------------ 1 file changed, 21 insertions(+), 42 deletions(-) diff --git a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp index eaebbc5dcf..e36347b258 100644 --- a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp @@ -55,7 +55,7 @@ class csr_eq_distribution { } auto vals_len = shape_[1]; auto size = row_sizes_[rank]; - auto res_col_len = size; + auto res_col_len = max_row_size_; if (dr::mp::use_sycl()) { auto localVals = dr::__detail::direct_iterator( dr::mp::local_segment(*vals_data_).begin()); @@ -65,7 +65,7 @@ class csr_eq_distribution { auto real_segment_size = std::min(nnz_ - rank * segment_size_, segment_size_); auto local_data = rows_data_; - auto division = std::max(real_segment_size / 100, total_row_size_); + auto division = std::max(real_segment_size / 100, max_row_size_ * 10); auto one_computation_size = (real_segment_size + division - 1) / division; auto row_size = row_size_; @@ -141,18 +141,18 @@ class csr_eq_distribution { auto local_gemv_and_collect(std::size_t root, C &res, T* vals, std::size_t vals_width) const { assert(res.size() == shape_.first * vals_width); __detail::allocator alloc; - auto res_alloc = alloc.allocate(row_size_ * vals_width); + auto res_alloc = alloc.allocate(max_row_size_ * 
vals_width); if (use_sycl()) { - sycl_queue().fill(res_alloc, 0, row_size_ * vals_width).wait(); + sycl_queue().fill(res_alloc, 0, max_row_size_ * vals_width).wait(); } else { - std::fill(res_alloc, res_alloc + row_size_ * vals_width, 0); + std::fill(res_alloc, res_alloc + max_row_size_ * vals_width, 0); } local_gemv(res_alloc, vals, vals_width); gather_gemv_vector(root, res, res_alloc, vals_width); fence(); - alloc.deallocate(res_alloc, row_size_ * vals_width); + alloc.deallocate(res_alloc, max_row_size_ * vals_width); } private: @@ -163,48 +163,28 @@ class csr_eq_distribution { auto communicator = default_comm(); __detail::allocator alloc; if (communicator.rank() == root) { - auto requests = new MPI_Request[communicator.size() - 1]; - auto gathered_res = alloc.allocate(total_row_size_ * vals_width); - auto current_row = 0; - auto req_iter = 0; - for (auto i = 0; i < communicator.size(); i++) { - if (i == root) { - if (use_sycl()) { - __detail::sycl_copy(partial_res, gathered_res + current_row * vals_width, row_size_ * vals_width); - } - else { - std::copy(partial_res, partial_res + row_size_ * vals_width, gathered_res + current_row * vals_width); - } - } - - else { - communicator.irecv(gathered_res + current_row * vals_width, row_sizes_[i] * vals_width, i, requests + req_iter); - req_iter++; - } - current_row += row_sizes_[i]; - } - - communicator.waitall(communicator.size() - 1, requests); + auto gathered_res = alloc.allocate(max_row_size_ * communicator.size() * vals_width); + communicator.gather(partial_res, gathered_res, max_row_size_ * vals_width, root); T* gathered_res_host; + if (use_sycl()) { - gathered_res_host = new T[total_row_size_ * vals_width]; - __detail::sycl_copy(gathered_res, gathered_res_host, total_row_size_ * vals_width); + gathered_res_host = new T[max_row_size_ * communicator.size() * vals_width]; + __detail::sycl_copy(gathered_res, gathered_res_host, max_row_size_ * communicator.size() * vals_width); } else { gathered_res_host = 
gathered_res; } rng::fill(res, 0); + + // auto begin = std::chrono::high_resolution_clock::now(); for (auto k = 0; k < vals_width; k++) { - current_row = 0; for (auto i = 0; i < communicator.size(); i++) { auto first_row = row_offsets_[i]; auto last_row = row_offsets_[i] + row_sizes_[i]; - auto current_row_size = row_sizes_[i]; for (auto j = first_row; j < last_row; j++) { - res[j + k * shape_[1]] += gathered_res_host[vals_width * current_row + k * current_row_size + j - first_row]; + res[j + k * shape_[1]] += gathered_res_host[vals_width * max_row_size_ * i + k * max_row_size_ + j - first_row]; } - current_row += current_row_size; } } @@ -214,11 +194,10 @@ class csr_eq_distribution { if (use_sycl()) { delete[] gathered_res_host; } - alloc.deallocate(gathered_res, total_row_size_ * communicator.size() * vals_width); + alloc.deallocate(gathered_res, max_row_size_ * communicator.size() * vals_width); } else { - MPI_Request req; - communicator.isend(partial_res, row_size_ * vals_width, root, &req); - communicator.wait(req); + communicator.gather(partial_res, static_cast(nullptr), max_row_size_ * vals_width, + root); } } @@ -275,9 +254,9 @@ class csr_eq_distribution { row_sizes_.push_back(higher_limit - lower_limit); row_information[i] = lower_limit; row_information[default_comm().size() + i] = higher_limit - lower_limit; - total_row_size_ = total_row_size_ + row_sizes_.back(); + max_row_size_ = std::max(max_row_size_, row_sizes_.back()); } - row_information[default_comm().size() * 2] = total_row_size_; + row_information[default_comm().size() * 2] = max_row_size_; default_comm().bcast(row_information, sizeof(std::size_t) * row_info_size, root); } else { @@ -287,7 +266,7 @@ class csr_eq_distribution { row_offsets_.push_back(row_information[i]); row_sizes_.push_back(row_information[default_comm().size() + i]); } - total_row_size_ = row_information[default_comm().size() * 2]; + max_row_size_ = row_information[default_comm().size() * 2]; } delete[] row_information; 
row_size_ = std::max(row_sizes_[rank], static_cast(1)); @@ -327,7 +306,7 @@ class csr_eq_distribution { std::size_t segment_size_ = 0; std::size_t row_size_ = 0; - std::size_t total_row_size_ = 0; + std::size_t max_row_size_ = 0; std::vector row_offsets_; std::vector row_sizes_; From 0a1a4dcbf6be0ff9fedd91ac9fa9cf975185dc42 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Fri, 25 Oct 2024 03:06:45 -0700 Subject: [PATCH 31/68] update some examples and benchmarks --- benchmarks/gbench/mp/gemv.cpp | 117 ++++++++++++------ examples/mp/sparse_matrix_matrix_mul.cpp | 14 +-- .../matrix_formats/csr_eq_distribution.hpp | 6 + 3 files changed, 89 insertions(+), 48 deletions(-) diff --git a/benchmarks/gbench/mp/gemv.cpp b/benchmarks/gbench/mp/gemv.cpp index 8bdac55aff..232d89c2de 100644 --- a/benchmarks/gbench/mp/gemv.cpp +++ b/benchmarks/gbench/mp/gemv.cpp @@ -134,23 +134,34 @@ mp::distributed_sparse_matrix< #else +namespace { + std::size_t getWidth() { + // return 8; + return default_vector_size / 100000; + } +} +static auto getMatrix() { + // std::size_t n = std::sqrt(default_vector_size / 100000) * 50000; + // // std::size_t n = default_vector_size / 2; + // std::size_t up = n / 10; + // std::size_t down = n / 10; + // // assert(dr::mp::use_sycl()); + // // assert(dr::mp::sycl_mem_kind() == sycl::usm::alloc::device); + // return dr::generate_band_csr(n, up, down); + + return dr::read_csr("/home/komarmik/examples/mawi_201512020030.mtx"); +} static void GemvEq_DR(benchmark::State &state) { - // fft requires usm shared allocation - std::size_t n = default_vector_size / 2; - std::size_t up = n / 10; - std::size_t down = n / 10; - std::size_t width = 8; - assert(dr::mp::use_sycl()); - assert(dr::mp::sycl_mem_kind() == sycl::usm::alloc::device); - dr::views::csr_matrix_view local_data; - local_data = dr::generate_band_csr(n, up, down); + auto local_data = getMatrix(); mp::distributed_sparse_matrix< double, long, dr::mp::MpiBackend, dr::mp::csr_eq_distribution> m(local_data, 
0); + auto n = m.shape()[1]; + auto width = getWidth(); std::vector base_a(n * width); for (int j = 0; j < width; j++) { for (int i = 0; i < n; i++) { @@ -171,20 +182,15 @@ DR_BENCHMARK(GemvEq_DR); static void GemvRow_DR(benchmark::State &state) { // fft requires usm shared allocation - std::size_t n = default_vector_size / 2; - std::size_t up = n / 10; - std::size_t down = n / 10; - std::size_t width = 8; - assert(dr::mp::use_sycl()); - assert(dr::mp::sycl_mem_kind() == sycl::usm::alloc::device); - dr::views::csr_matrix_view local_data; - local_data = dr::generate_band_csr(n, up, down); + auto local_data = getMatrix(); mp::distributed_sparse_matrix< double, long, dr::mp::MpiBackend, dr::mp::csr_row_distribution> m(local_data, 0); + auto n = m.shape()[1]; + auto width = getWidth(); std::vector base_a(n * width); for (int j = 0; j < width; j++) { for (int i = 0; i < n; i++) { @@ -206,14 +212,7 @@ DR_BENCHMARK(GemvRow_DR); static void Gemv_Reference(benchmark::State &state) { - std::size_t n = default_vector_size / 2; - std::size_t up = n / 10; - std::size_t down = n / 10; - std::size_t width = 8; - assert(dr::mp::use_sycl()); - assert(dr::mp::sycl_mem_kind() == sycl::usm::alloc::device); - dr::views::csr_matrix_view local_data; - local_data = dr::generate_band_csr(n, up, down); + auto local_data = getMatrix(); auto nnz_count = local_data.size(); auto band_shape = local_data.shape(); auto q = get_queue(); @@ -222,18 +221,20 @@ static void Gemv_Reference(benchmark::State &state) { auto col_ptr = sycl::malloc_device(nnz_count, q); auto row_ptr = sycl::malloc_device((band_shape[0] + 1), q); std::vector b; + auto width = getWidth(); for (auto i = 0; i < band_shape[1] * width; i++) { b.push_back(i); } - double* elems = new double[band_shape[0]]; + double* elems = new double[band_shape[0] * width]; auto input = sycl::malloc_device(band_shape[1] * width, q); - auto output = sycl::malloc_device(band_shape[0], q); + auto output = sycl::malloc_device(band_shape[0] * width, q); 
// for (int i = 0; i < band_shape[0]; i++) { // fmt::print("{} {}\n", i, local_data.rowptr_data()[i]); // } q.memcpy(val_ptr, local_data.values_data(), nnz_count * sizeof(double)).wait(); q.memcpy(col_ptr, local_data.colind_data(), nnz_count * sizeof(long)).wait(); q.memcpy(row_ptr, local_data.rowptr_data(), (band_shape[0] + 1) * sizeof(long)).wait(); + q.fill(output, 0, band_shape[0] * width); // std::copy(policy, local_data.values_data(), local_data.values_data() + nnz_count, val_ptr); // std::copy(policy, local_data.colind_data(), local_data.colind_data() + nnz_count, col_ptr); // std::copy(policy, local_data.rowptr_data(), local_data.rowptr_data() + band_shape[0], row_ptr); @@ -243,23 +244,57 @@ static void Gemv_Reference(benchmark::State &state) { // fmt::print("{} {}\n", i, local_data.rowptr_data()[i + 1] - local_data.rowptr_data()[i]); // } + auto wg = 32; + while (width * band_shape[0] * wg > INT_MAX) { + wg /= 2; + } + assert(wg > 0); for (auto _ : state) { - dr::__detail::parallel_for_workaround(q, sycl::range<1>{static_cast(band_shape[0])}, [=](auto idx) { - for (auto j = 0; j < width; j++) { - double sum = 0; - auto start = row_ptr[idx]; - auto end = row_ptr[idx + 1]; - for (auto i = start; i < end; i++) { - auto colNum = col_ptr[i]; - auto vectorVal = input[colNum + j * band_shape[1]]; - auto matrixVal = val_ptr[i]; - sum += matrixVal * vectorVal; - } - *(output + idx) = sum; + if (dr::mp::use_sycl()) { + dr::mp::sycl_queue().submit([&](auto &&h) { + h.parallel_for(sycl::nd_range<1>(width * band_shape[0] * wg, wg), [=](auto item) { + auto input_j = item.get_group(0) / band_shape[0]; + auto idx = item.get_group(0) % band_shape[0]; + auto local_id = item.get_local_id(); + auto group_size = item.get_local_range(0); + double sum = 0; + auto start = row_ptr[idx]; + auto end = row_ptr[idx + 1]; + for (auto i = start + local_id; i < end; i += group_size) { + auto colNum = col_ptr[i]; + auto vectorVal = input[colNum + input_j * band_shape[1]]; + auto matrixVal 
= val_ptr[i]; + sum += matrixVal * vectorVal; } - }).wait(); - q.memcpy(elems, output, band_shape[0] * sizeof(double)).wait(); + sycl::atomic_ref + c_ref(output[idx + band_shape[0] * input_j]); + c_ref += sum; + }); + }).wait(); + q.memcpy(elems, output, band_shape[0] * sizeof(double) * width).wait(); + } + else { + std::fill(elems, elems + band_shape[0] * width, 0); + auto local_rows = local_data.rowptr_data(); + auto row_i = 0; + auto current_row_position = local_rows[1]; + + for (int i = 0; i < nnz_count; i++) { + while (row_i + 1 < band_shape[0] && i >= current_row_position) { + row_i++; + current_row_position = local_rows[row_i + 1]; + } + for (auto j = 0; j < width; j++) { + auto item_id = row_i + j * band_shape[0]; + auto val_index = local_data.colind_data()[i] + j * band_shape[0]; + auto value = b[val_index]; + auto matrix_value = local_data.values_data()[i]; + elems[item_id] += matrix_value * value; + } + } + } } delete[] elems; sycl::free(val_ptr, q); diff --git a/examples/mp/sparse_matrix_matrix_mul.cpp b/examples/mp/sparse_matrix_matrix_mul.cpp index 738cc448a4..3000f171fc 100644 --- a/examples/mp/sparse_matrix_matrix_mul.cpp +++ b/examples/mp/sparse_matrix_matrix_mul.cpp @@ -41,7 +41,7 @@ int main(int argc, char **argv) { m_row(local_data, root); fmt::print("{}\n", m.size()); - auto width = 8; + auto width = 6; std::vector res(m.shape().first * width); std::vector res_row(m.shape().first * width); std::vector base_a(m.shape().second * width); @@ -58,19 +58,19 @@ int main(int argc, char **argv) { double total_time = 0; auto N = 1; gemv(0, res, m, allocated_a); // it is here to prepare sycl for work - for (int i = 0; i < N; i++) { + for (int i = 0; i < 100; i++) { auto begin = std::chrono::high_resolution_clock::now(); gemv(0, res, m, allocated_a); auto end = std::chrono::high_resolution_clock::now(); double duration = std::chrono::duration(end - begin).count(); total_time += duration; - if (i % 10 == 0 && dr::mp::default_comm().rank() == 0) { - 
fmt::print("eq canary {}\n", duration * 1000); + if (root == dr::mp::default_comm().rank()) { + fmt::print("eq canary {}\n\n", duration * 1000); } } - if (root == dr::mp::default_comm().rank()) { - fmt::print("eq gemv time total {}\n", total_time * 1000 / N); - } + // if (root == dr::mp::default_comm().rank()) { + // fmt::print("eq gemv time total {}\n", total_time * 1000 / N); + // } m.fence(); total_time = 0; gemv(0, res_row, m_row, allocated_a); diff --git a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp index e36347b258..21271dc4a1 100644 --- a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp @@ -149,7 +149,13 @@ class csr_eq_distribution { std::fill(res_alloc, res_alloc + max_row_size_ * vals_width, 0); } + // auto begin = std::chrono::high_resolution_clock::now(); local_gemv(res_alloc, vals, vals_width); + // auto end = std::chrono::high_resolution_clock::now(); + // double duration = std::chrono::duration(end - begin).count(); + // auto size = std::min(segment_size_, shape_[0] - segment_size_ * default_comm().rank()); + // fmt::print("rows gemv time {} {} {}\n", duration * 1000, size, default_comm().rank()); + gather_gemv_vector(root, res, res_alloc, vals_width); fence(); alloc.deallocate(res_alloc, max_row_size_ * vals_width); From 2beec184c3f3a758ad2ee0fae8a9e7049604aa1f Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Mon, 4 Nov 2024 02:38:59 -0800 Subject: [PATCH 32/68] Improved communication in eq distribution --- include/dr/detail/communicator.hpp | 4 +- .../matrix_formats/csr_eq_distribution.hpp | 52 +++++++++++++------ 2 files changed, 39 insertions(+), 17 deletions(-) diff --git a/include/dr/detail/communicator.hpp b/include/dr/detail/communicator.hpp index 1688ae31ca..b9ec0af9f5 100644 --- a/include/dr/detail/communicator.hpp +++ b/include/dr/detail/communicator.hpp @@ -106,9 
+106,9 @@ class communicator { i_all_gather(&src, rng::data(dst), 1, req); } - void gatherv(const void *src, int *counts, int *offsets, void *dst, + void gatherv(const void *src, long long *counts, long *offsets, void *dst, std::size_t root) const { - MPI_Gatherv(src, counts[rank()], MPI_BYTE, dst, counts, offsets, MPI_BYTE, + MPI_Gatherv_c(src, counts[rank()], MPI_BYTE, dst, counts, offsets, MPI_BYTE, root, mpi_comm_); } diff --git a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp index 21271dc4a1..d6286f7b1d 100644 --- a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp @@ -55,7 +55,7 @@ class csr_eq_distribution { } auto vals_len = shape_[1]; auto size = row_sizes_[rank]; - auto res_col_len = max_row_size_; + auto res_col_len = row_sizes_[default_comm().rank()]; if (dr::mp::use_sycl()) { auto localVals = dr::__detail::direct_iterator( dr::mp::local_segment(*vals_data_).begin()); @@ -65,7 +65,7 @@ class csr_eq_distribution { auto real_segment_size = std::min(nnz_ - rank * segment_size_, segment_size_); auto local_data = rows_data_; - auto division = std::max(real_segment_size / 100, max_row_size_ * 10); + auto division = std::max(real_segment_size / 100, row_sizes_[default_comm().rank()] * 10); auto one_computation_size = (real_segment_size + division - 1) / division; auto row_size = row_size_; @@ -141,12 +141,12 @@ class csr_eq_distribution { auto local_gemv_and_collect(std::size_t root, C &res, T* vals, std::size_t vals_width) const { assert(res.size() == shape_.first * vals_width); __detail::allocator alloc; - auto res_alloc = alloc.allocate(max_row_size_ * vals_width); + auto res_alloc = alloc.allocate( row_sizes_[default_comm().rank()] * vals_width); if (use_sycl()) { - sycl_queue().fill(res_alloc, 0, max_row_size_ * vals_width).wait(); + sycl_queue().fill(res_alloc, 0, 
row_sizes_[default_comm().rank()] * vals_width).wait(); } else { - std::fill(res_alloc, res_alloc + max_row_size_ * vals_width, 0); + std::fill(res_alloc, res_alloc + row_sizes_[default_comm().rank()] * vals_width, 0); } // auto begin = std::chrono::high_resolution_clock::now(); @@ -156,9 +156,14 @@ class csr_eq_distribution { // auto size = std::min(segment_size_, shape_[0] - segment_size_ * default_comm().rank()); // fmt::print("rows gemv time {} {} {}\n", duration * 1000, size, default_comm().rank()); + // begin = std::chrono::high_resolution_clock::now(); gather_gemv_vector(root, res, res_alloc, vals_width); + // end = std::chrono::high_resolution_clock::now(); + // duration = std::chrono::duration(end - begin).count(); + // size = std::min(segment_size_, shape_[0] - segment_size_ * default_comm().rank()); + // fmt::print("rows gather time {} {} {}\n", duration * 1000, size, default_comm().rank()); fence(); - alloc.deallocate(res_alloc, max_row_size_ * vals_width); + alloc.deallocate(res_alloc, row_sizes_[default_comm().rank()] * vals_width); } private: @@ -168,14 +173,25 @@ class csr_eq_distribution { void gather_gemv_vector(std::size_t root, C &res, A &partial_res, std::size_t vals_width) const { auto communicator = default_comm(); __detail::allocator alloc; + long long* counts = new long long[communicator.size()]; + for (auto i = 0; i < communicator.size(); i++) { + counts[i] = row_sizes_[i] * sizeof(T) * vals_width; + } + if (communicator.rank() == root) { - auto gathered_res = alloc.allocate(max_row_size_ * communicator.size() * vals_width); - communicator.gather(partial_res, gathered_res, max_row_size_ * vals_width, root); + long* offsets = new long[communicator.size()]; + offsets[0] = 0; + for (auto i = 0; i < communicator.size() - 1; i++) { + offsets[i + 1] = offsets[i] + counts[i]; + } + auto gathered_res = alloc.allocate(max_row_size_ * vals_width); + communicator.gatherv(partial_res, counts, offsets, gathered_res, root); + // 
communicator.gather(partial_res, gathered_res, max_row_size_ * vals_width, root); T* gathered_res_host; if (use_sycl()) { - gathered_res_host = new T[max_row_size_ * communicator.size() * vals_width]; - __detail::sycl_copy(gathered_res, gathered_res_host, max_row_size_ * communicator.size() * vals_width); + gathered_res_host = new T[max_row_size_ * vals_width]; + __detail::sycl_copy(gathered_res, gathered_res_host, max_row_size_ * vals_width); } else { gathered_res_host = gathered_res; @@ -185,12 +201,15 @@ class csr_eq_distribution { // auto begin = std::chrono::high_resolution_clock::now(); for (auto k = 0; k < vals_width; k++) { + auto current_offset = 0; for (auto i = 0; i < communicator.size(); i++) { auto first_row = row_offsets_[i]; auto last_row = row_offsets_[i] + row_sizes_[i]; + auto row_size = row_sizes_[i]; for (auto j = first_row; j < last_row; j++) { - res[j + k * shape_[1]] += gathered_res_host[vals_width * max_row_size_ * i + k * max_row_size_ + j - first_row]; + res[j + k * shape_[1]] += gathered_res_host[vals_width * current_offset + k * row_size + j - first_row]; } + current_offset += row_sizes_[i]; } } @@ -200,11 +219,14 @@ class csr_eq_distribution { if (use_sycl()) { delete[] gathered_res_host; } - alloc.deallocate(gathered_res, max_row_size_ * communicator.size() * vals_width); + delete[] offsets; + alloc.deallocate(gathered_res, max_row_size_ * vals_width); } else { - communicator.gather(partial_res, static_cast(nullptr), max_row_size_ * vals_width, - root); + // communicator.gather(partial_res, static_cast(nullptr), max_row_size_ * vals_width, + // root); + communicator.gatherv(partial_res, counts, nullptr, nullptr, root); } + delete[] counts; } std::size_t get_row_size(std::size_t rank) { return row_sizes_[rank]; } @@ -260,7 +282,7 @@ class csr_eq_distribution { row_sizes_.push_back(higher_limit - lower_limit); row_information[i] = lower_limit; row_information[default_comm().size() + i] = higher_limit - lower_limit; - max_row_size_ = 
std::max(max_row_size_, row_sizes_.back()); + max_row_size_ = max_row_size_ + row_sizes_.back(); } row_information[default_comm().size() * 2] = max_row_size_; default_comm().bcast(row_information, sizeof(std::size_t) * row_info_size, From 5edd0ba23b01f0ba0600a9707e72ede2a1aa2966 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Tue, 5 Nov 2024 02:33:21 -0800 Subject: [PATCH 33/68] Improve equ format on very sparse matrices --- benchmarks/gbench/mp/gemv.cpp | 23 ++++++++++--------- .../matrix_formats/csr_eq_distribution.hpp | 2 +- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/benchmarks/gbench/mp/gemv.cpp b/benchmarks/gbench/mp/gemv.cpp index 232d89c2de..eb1910a85c 100644 --- a/benchmarks/gbench/mp/gemv.cpp +++ b/benchmarks/gbench/mp/gemv.cpp @@ -136,20 +136,21 @@ mp::distributed_sparse_matrix< namespace { std::size_t getWidth() { - // return 8; - return default_vector_size / 100000; + return 8;//default_vector_size / 100000; } } static auto getMatrix() { - // std::size_t n = std::sqrt(default_vector_size / 100000) * 50000; - // // std::size_t n = default_vector_size / 2; - // std::size_t up = n / 10; - // std::size_t down = n / 10; - // // assert(dr::mp::use_sycl()); - // // assert(dr::mp::sycl_mem_kind() == sycl::usm::alloc::device); - // return dr::generate_band_csr(n, up, down); - - return dr::read_csr("/home/komarmik/examples/mawi_201512020030.mtx"); + std::size_t n = std::sqrt(default_vector_size / 100000) * 50000; + // std::size_t n = default_vector_size / 2; + std::size_t up = n / 50; + std::size_t down = n / 50; + // assert(dr::mp::use_sycl()); + // assert(dr::mp::sycl_mem_kind() == sycl::usm::alloc::device); + return dr::generate_band_csr(n, up, down); + + // return dr::read_csr("/home/komarmik/examples/soc-LiveJournal1.mtx"); + // return dr::read_csr("/home/komarmik/examples/mycielskian18.mtx"); + // return dr::read_csr("/home/komarmik/examples/mawi_201512020030.mtx"); } static void GemvEq_DR(benchmark::State &state) { diff --git 
a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp index d6286f7b1d..abf239926d 100644 --- a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp @@ -65,7 +65,7 @@ class csr_eq_distribution { auto real_segment_size = std::min(nnz_ - rank * segment_size_, segment_size_); auto local_data = rows_data_; - auto division = std::max(real_segment_size / 100, row_sizes_[default_comm().rank()] * 10); + auto division = real_segment_size / 50; auto one_computation_size = (real_segment_size + division - 1) / division; auto row_size = row_size_; From cae67efcd0c0e46c30be23344841142621d3fe69 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Tue, 5 Nov 2024 12:55:38 +0100 Subject: [PATCH 34/68] Fix test compilation --- test/gtest/sp/gemv.cpp | 8 ++++---- test/gtest/sp/sparse.cpp | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/test/gtest/sp/gemv.cpp b/test/gtest/sp/gemv.cpp index 2b207be926..1b3e0bbf64 100644 --- a/test/gtest/sp/gemv.cpp +++ b/test/gtest/sp/gemv.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: BSD-3-Clause #include "xp-tests.hpp" - +#include "dr/detail/coo_matrix.hpp" TEST(SparseMatrix, Gemv) { std::size_t m = 100; std::size_t k = 100; @@ -40,8 +40,8 @@ TEST(SparseMatrix, EmptyGemv) { using T = float; using I = int; - dr::sp::__detail::coo_matrix base; - auto csr = dr::sp::__detail::convert_to_csr(base, {m, k}, base.size(), + dr::__detail::coo_matrix base; + auto csr = dr::__detail::convert_to_csr(base, {m, k}, base.size(), std::allocator{}); dr::sp::sparse_matrix a = dr::sp::create_distributed(csr, dr::sp::row_cyclic()); @@ -73,7 +73,7 @@ TEST(SparseMatrix, ZeroVector) { } } - auto csr = dr::sp::__detail::convert_to_csr(base, {m, k}, base.size(), + auto csr = dr::__detail::convert_to_csr(base, {m, k}, base.size(), std::allocator{}); dr::sp::sparse_matrix a = 
dr::sp::create_distributed(csr, dr::sp::row_cyclic()); diff --git a/test/gtest/sp/sparse.cpp b/test/gtest/sp/sparse.cpp index 2e30fee2a0..b6fb93e7a5 100644 --- a/test/gtest/sp/sparse.cpp +++ b/test/gtest/sp/sparse.cpp @@ -16,7 +16,7 @@ TEST(SparseMatrix, IterationForward) { } std::vector, T>> reference(base.size()); std::copy(base.begin(), base.end(), reference.begin()); - auto csr = dr::sp::__detail::convert_to_csr(base, {m, k}, base.size(), + auto csr = dr::__detail::convert_to_csr(base, {m, k}, base.size(), std::allocator{}); dr::sp::sparse_matrix a = dr::sp::create_distributed(csr, dr::sp::row_cyclic()); @@ -48,7 +48,7 @@ TEST(SparseMatrix, IterationReverse) { } std::vector, T>> reference(base.size()); std::copy(base.begin(), base.end(), reference.begin()); - auto csr = dr::sp::__detail::convert_to_csr(base, {m, k}, base.size(), + auto csr = dr::__detail::convert_to_csr(base, {m, k}, base.size(), std::allocator{}); dr::sp::sparse_matrix a = dr::sp::create_distributed(csr, dr::sp::row_cyclic()); From 28519e062e854cd060b916368a1643a0303498e4 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Tue, 5 Nov 2024 16:55:24 +0100 Subject: [PATCH 35/68] Reformat changes in mp matrix #1 --- examples/sp/CMakeLists.txt | 2 +- examples/sp/gemv_benchmark.cpp | 101 +++++++++++++ format_res.py | 140 ------------------ ...cal_csr_matrix.hpp => csr_matrix_base.hpp} | 16 +- ...nerate_random_csr.hpp => generate_csr.hpp} | 0 include/dr/detail/matrix_io.hpp | 12 +- include/dr/mp.hpp | 2 +- include/dr/sp/containers/sparse_matrix.hpp | 2 +- run_benchmarks.sh | 32 ---- 9 files changed, 118 insertions(+), 189 deletions(-) delete mode 100644 format_res.py rename include/dr/detail/{local_csr_matrix.hpp => csr_matrix_base.hpp} (84%) rename include/dr/detail/{generate_random_csr.hpp => generate_csr.hpp} (100%) delete mode 100755 run_benchmarks.sh diff --git a/examples/sp/CMakeLists.txt b/examples/sp/CMakeLists.txt index ce82e75626..c078cbf5e8 100644 --- a/examples/sp/CMakeLists.txt +++ 
b/examples/sp/CMakeLists.txt @@ -18,7 +18,7 @@ add_sp_example(sort) add_sp_example_no_test(sort_benchmark) add_sp_example(inclusive_scan_example) add_sp_example(exclusive_scan_example) -add_sp_example(gemv_benchmark) +# unsatisfied dependency of grb/grb.hpp add_sp_example(gemv_benchmark) add_sp_example_no_test(dot_product_benchmark) add_sp_example_no_test(inclusive_scan_benchmark) add_sp_example_no_test(exclusive_scan_benchmark) diff --git a/examples/sp/gemv_benchmark.cpp b/examples/sp/gemv_benchmark.cpp index 850f2f87c0..62b83f6b86 100644 --- a/examples/sp/gemv_benchmark.cpp +++ b/examples/sp/gemv_benchmark.cpp @@ -7,11 +7,25 @@ #include // FIXME: what is grb.hpp? add it to cmake or remove this dependency +#include #include namespace sp = dr::sp; +template auto local_gemv(M &&a) { + using T = grb::matrix_scalar_t; + std::vector b(a.shape()[1], 1); + std::vector c(a.shape()[0], 0); + + for (auto &&[index, v] : a) { + auto &&[i, k] = index; + c[i] += v * b[k]; + } + + return c; +} + template bool is_equal(T &&x, U &&y) { return x == y; } template @@ -64,6 +78,8 @@ int main(int argc, char **argv) { fmt::print("Square {} x {}\n", a_square.grid_shape()[0], a_square.grid_shape()[1]); + auto c_local = local_gemv(grb::matrix(fname)); + std::size_t m = a.shape()[0]; std::size_t k = a.shape()[1]; @@ -89,6 +105,13 @@ int main(int argc, char **argv) { fmt::print("Copying...\n"); std::vector l(c.size()); dr::sp::copy(c.begin(), c.end(), l.begin()); + fmt::print("Verifying...\n"); + for (std::size_t i = 0; i < l.size(); i++) { + if (!is_equal(l[i], c_local[i])) { + fmt::print("{} != {}\n", l[i], c_local[i]); + } + } + assert(is_equal(c_local, l)); fmt::print("Benchmarking...\n"); for (std::size_t i = 0; i < n_iterations; i++) { @@ -124,6 +147,13 @@ int main(int argc, char **argv) { sp::gemv_square(c, a_square, b); std::vector l(c.size()); sp::copy(c.begin(), c.end(), l.begin()); + for (std::size_t i = 0; i < l.size(); i++) { + if (!is_equal(l[i], c_local[i])) { + // 
fmt::print("{} != {}\n", l[i], c_local[i]); + } + } + assert(is_equal(c_local, l)); + for (std::size_t i = 0; i < n_iterations; i++) { auto begin = std::chrono::high_resolution_clock::now(); sp::gemv_square(c, a_square, b); @@ -159,6 +189,12 @@ int main(int argc, char **argv) { sp::gemv_square_copy(c, a_square, b); std::vector l(c.size()); sp::copy(c.begin(), c.end(), l.begin()); + for (std::size_t i = 0; i < l.size(); i++) { + if (!is_equal(l[i], c_local[i])) { + fmt::print("{} != {}\n", l[i], c_local[i]); + } + } + assert(is_equal(c_local, l)); for (std::size_t i = 0; i < n_iterations; i++) { auto begin = std::chrono::high_resolution_clock::now(); @@ -189,6 +225,71 @@ int main(int argc, char **argv) { durations.clear(); } + { + auto m = sp::__detail::mmread(fname); + auto shape = m.shape(); + auto nnz = m.size(); + + auto local_mat = + sp::__detail::convert_to_csr(m, shape, nnz, std::allocator{}); + + sycl::queue q(sp::context(), sp::devices()[0]); + + T *values = sycl::malloc_device(nnz, q); + I *colind = sycl::malloc_device(nnz, q); + I *rowptr = sycl::malloc_device(local_mat.shape()[0] + 1, q); + + q.memcpy(values, local_mat.values_data(), sizeof(T) * nnz).wait(); + q.memcpy(colind, local_mat.colind_data(), sizeof(T) * nnz).wait(); + q.memcpy(rowptr, local_mat.rowptr_data(), + sizeof(T) * (local_mat.shape()[0] + 1)) + .wait(); + + sp::device_allocator allocator(q); + + sp::vector> x(local_mat.shape()[1], 1, + allocator); + sp::vector> y(local_mat.shape()[1], 0, + allocator); + + sp::__detail::destroy_csr_matrix_view(local_mat, std::allocator{}); + + sp::csr_matrix_view a_view(values, rowptr, colind, shape, nnz, 0); + + auto e = sp::__detail::local_gemv(q, a_view, x.data().get_raw_pointer(), + y.data().get_raw_pointer()); + e.wait(); + + for (std::size_t i = 0; i < n_iterations; i++) { + auto begin = std::chrono::high_resolution_clock::now(); + auto e = sp::__detail::local_gemv(q, a_view, x.data().get_raw_pointer(), + y.data().get_raw_pointer()); + e.wait(); + 
auto end = std::chrono::high_resolution_clock::now(); + double duration = std::chrono::duration(end - begin).count(); + durations.push_back(duration); + } + + fmt::print("Durations: {}\n", + durations | + rng::views::transform([](auto &&x) { return x * 1000; })); + + std::sort(durations.begin(), durations.end()); + + double median_duration = durations[durations.size() / 2]; + + std::cout << "Single GPU: " << median_duration * 1000 << " ms" << std::endl; + + std::size_t n_bytes = sizeof(T) * a.size() + + sizeof(I) * (a.size() + a.shape()[0] + 1) // size of A + + sizeof(T) * b.size() // size of B + + sizeof(T) * c.size(); // size of C + double n_gbytes = n_bytes * 1e-9; + fmt::print("{} GB/s\n", n_gbytes / median_duration); + + durations.clear(); + } + fmt::print("Finalize...\n"); sp::finalize(); diff --git a/format_res.py b/format_res.py deleted file mode 100644 index 4f7deed096..0000000000 --- a/format_res.py +++ /dev/null @@ -1,140 +0,0 @@ -import os -import re -import numpy as np -from matplotlib import pyplot as plt -from functools import cmp_to_key - -rootdir = "./dest/" -res_dir = "./res/" -rand_regex = re.compile('mp_band_.+_.+_.+_.+\\.csv') -file_regex = re.compile('mp_.+_.+_.+\\.csv') - -strong_data = {} -weak_data = {} -for root, dirs, files in os.walk(rootdir): - for file in files: - entry_count = -1 - mpi_size = -1 - name = "" - if rand_regex.match(file): - tmp = file[0:-4] - res = tmp.split("_") - entry_count = res[-3] - mpi_size = res[-4] - name = "band" - elif (file_regex.match(file)): - tmp = file[0:-4] - res = tmp.split("_") - entry_count = res[-1] - mpi_size = res[-2] - name = res[-3] - if entry_count != -1: - with open(rootdir + file) as handle: - eq_res = handle.readline().split(",") - row_res = handle.readline().split(",") - eq_arr = np.array(eq_res).astype(np.float64) - row_arr = np.array(row_res).astype(np.float64) - ratio = round(float(entry_count) / float(mpi_size)) - if (ratio not in weak_data): - weak_data[ratio] = [] - if ((name, 
entry_count) not in strong_data): - strong_data[(name, entry_count)] = [] - strong_data[(name, entry_count)].append((mpi_size, eq_arr, row_arr)) - weak_data[ratio].append((mpi_size, entry_count, eq_arr, row_arr)) - -for entry in strong_data.items(): - if (len(entry[1]) == 1): - continue - - sorted_list = entry[1] - sorted_list = sorted(sorted_list, key=cmp_to_key(lambda x, y: int(x[0]) - int(y[0]))) - - base_eq = np.mean(sorted_list[0][1]) / int(sorted_list[0][0]) - base_row = np.mean(sorted_list[0][2]) / int(sorted_list[0][0]) - - index = [] - means_eq = [] - variance_eq = [] - means_row = [] - variance_row = [] - for info in sorted_list: - index.append(int(info[0])) - speedup_eq = base_eq / info[1] - speedup_row = base_row / info[2] - means_eq.append(np.mean(speedup_eq)) - variance_eq.append(np.var(speedup_eq)) - means_row.append(np.mean(speedup_row)) - variance_row.append(np.var(speedup_row)) - index = np.array(index) - means_eq = np.array(means_eq) - variance_eq = np.array(variance_eq) - means_row = np.array(means_row) - variance_row = np.array(variance_row) - fig, ax = plt.subplots() - - ax.fill_between(index, means_eq - variance_eq, means_eq + variance_eq, alpha=.5, linewidth=0) - ax.plot(index, means_eq, linewidth=2) - ax.set(xlim=(0, 20), xticks=np.arange(1, 20, 2), - ylim=(0, 20), yticks=np.arange(20)) - ax.plot(np.arange(20), np.arange(20)) - plt.savefig("res/" + entry[0][0] + "_" + entry[0][1] + "_eq_strong") - - fig, ax = plt.subplots() - - ax.fill_between(index, means_row - variance_row, means_row + variance_row, alpha=.5, linewidth=0) - ax.plot(index, means_row, linewidth=2) - ax.set(xlim=(0, 20), xticks=np.arange(1, 20, 2), - ylim=(0, 20), yticks=np.arange(20)) - ax.plot(np.arange(20), np.arange(20)) - - plt.savefig("res/" + entry[0][0] + "_" + entry[0][1] + "_row_strong") - - -for entry in weak_data.items(): - if (len(entry[1]) == 1): - continue - start = next(filter(lambda x: x[0] == '1', entry[1]), None) - if (start == None): - continue - base_eq 
= np.mean(start[2]) - base_row = np.mean(start[3]) - - sorted_list = entry[1] - sorted_list = sorted(sorted_list, key=cmp_to_key(lambda x, y: int(x[0]) - int(y[0]))) - - index = [] - means_eq = [] - variance_eq = [] - means_row = [] - variance_row = [] - for info in sorted_list: - index.append(int(info[0])) - speedup_eq = base_eq / info[2] - speedup_row = base_row / info[3] - means_eq.append(np.mean(speedup_eq)) - variance_eq.append(np.var(speedup_eq)) - means_row.append(np.mean(speedup_row)) - variance_row.append(np.var(speedup_row)) - index = np.array(index) - means_eq = np.array(means_eq) - variance_eq = np.array(variance_eq) - means_row = np.array(means_row) - variance_row = np.array(variance_row) - fig, ax = plt.subplots() - - ax.fill_between(index, means_eq - variance_eq, means_eq + variance_eq, alpha=.5, linewidth=0) - ax.plot(index, means_eq, linewidth=2) - ax.set(xlim=(0, 20), xticks=np.arange(1, 20, 2), - ylim=(0, 1.1), yticks=np.arange(0, 1, 0.2)) - ax.plot(np.arange(20), np.zeros(20) + 1) - plt.savefig("res/" + str(entry[0]) + "_ratio_eq_weak") - - fig, ax = plt.subplots() - - ax.fill_between(index, means_row - variance_row, means_row + variance_row, alpha=.5, linewidth=0) - ax.plot(index, means_row, linewidth=2) - ax.set(xlim=(0, 20), xticks=np.arange(1, 20, 2), - ylim=(0, 1.1), yticks=np.arange(0, 1, 0.2)) - ax.plot(np.arange(20), np.zeros(20) + 1) - plt.savefig("res/" + str(entry[0]) + "_ratio_row_weak") - diff --git a/include/dr/detail/local_csr_matrix.hpp b/include/dr/detail/csr_matrix_base.hpp similarity index 84% rename from include/dr/detail/local_csr_matrix.hpp rename to include/dr/detail/csr_matrix_base.hpp index 3eb44b6dca..0207a95428 100644 --- a/include/dr/detail/local_csr_matrix.hpp +++ b/include/dr/detail/csr_matrix_base.hpp @@ -14,7 +14,7 @@ namespace dr { namespace __detail { template > -class local_csr_matrix { +class csr_matrix_base { public: using value_type = std::pair; using scalar_type = T; @@ -37,7 +37,7 @@ class local_csr_matrix 
{ using iterator = typename backend_type::iterator; using const_iterator = typename backend_type::const_iterator; - local_csr_matrix(dr::index shape, std::size_t nnz) : shape_(shape) { + csr_matrix_base(dr::index shape, std::size_t nnz) : shape_(shape) { auto average_size = nnz / shape.first / 2; for (std::size_t i = 0; i < shape.first; i++) { tuples_.push_back(row_type()); @@ -77,12 +77,12 @@ class local_csr_matrix { } } - local_csr_matrix() = default; - ~local_csr_matrix() = default; - local_csr_matrix(const local_csr_matrix &) = default; - local_csr_matrix(local_csr_matrix &&) = default; - local_csr_matrix &operator=(const local_csr_matrix &) = default; - local_csr_matrix &operator=(local_csr_matrix &&) = default; + csr_matrix_base() = default; + ~csr_matrix_base() = default; + csr_matrix_base(const csr_matrix_base &) = default; + csr_matrix_base(csr_matrix_base &&) = default; + csr_matrix_base &operator=(const csr_matrix_base &) = default; + csr_matrix_base &operator=(csr_matrix_base &&) = default; private: std::size_t size_ = 0; diff --git a/include/dr/detail/generate_random_csr.hpp b/include/dr/detail/generate_csr.hpp similarity index 100% rename from include/dr/detail/generate_random_csr.hpp rename to include/dr/detail/generate_csr.hpp diff --git a/include/dr/detail/matrix_io.hpp b/include/dr/detail/matrix_io.hpp index 8dedfc44b2..b86909c18f 100644 --- a/include/dr/detail/matrix_io.hpp +++ b/include/dr/detail/matrix_io.hpp @@ -12,7 +12,7 @@ #include #include -#include +#include #include namespace dr { @@ -72,7 +72,7 @@ auto convert_to_csr(Tuples &&tuples, dr::index<> shape, std::size_t nnz, } template -auto convert_local_csr_to_csr(Tuples &&csr_matrix, dr::index<> shape, std::size_t nnz, +auto convert_csr_base_to_csr(Tuples &&csr_matrix, dr::index<> shape, std::size_t nnz, Allocator &&allocator) { auto &&[v, j] = *csr_matrix.begin()->begin(); @@ -117,7 +117,7 @@ auto convert_local_csr_to_csr(Tuples &&csr_matrix, dr::index<> shape, std::size_ /// Read in the 
Matrix Market file at location `file_path` and a return /// a coo_matrix data structure with its contents. template -inline local_csr_matrix read_coo_matrix(std::string file_path, +inline csr_matrix_base read_csr_matrix_base(std::string file_path, bool one_indexed = true) { using size_type = std::size_t; @@ -190,7 +190,7 @@ inline local_csr_matrix read_coo_matrix(std::string file_path, // NOTE for symmetric matrices: `nnz` holds the number of stored values in // the matrix market file, while `matrix.nnz_` will hold the total number of // stored values (including "mirrored" symmetric values). - local_csr_matrix matrix({m, n}, nnz); + csr_matrix_base matrix({m, n}, nnz); size_type c = 0; while (std::getline(f, buf)) { @@ -246,10 +246,10 @@ void destroy_csr_matrix_view(dr::views::csr_matrix_view view, template auto read_csr(std::string file_path, bool one_indexed = true) { - auto m = __detail::read_coo_matrix(file_path, one_indexed); + auto m = __detail::read_csr_matrix_base(file_path, one_indexed); auto shape = m.shape(); auto nnz = m.size(); - auto t = __detail::convert_local_csr_to_csr(m, shape, nnz, std::allocator{}); + auto t = __detail::convert_csr_base_to_csr(m, shape, nnz, std::allocator{}); return t; } diff --git a/include/dr/mp.hpp b/include/dr/mp.hpp index fa469c248e..52c0fb8e5a 100644 --- a/include/dr/mp.hpp +++ b/include/dr/mp.hpp @@ -50,7 +50,7 @@ #include #include #include -#include +#include #include #include diff --git a/include/dr/sp/containers/sparse_matrix.hpp b/include/dr/sp/containers/sparse_matrix.hpp index cc8e689da5..83adb869c0 100644 --- a/include/dr/sp/containers/sparse_matrix.hpp +++ b/include/dr/sp/containers/sparse_matrix.hpp @@ -4,7 +4,7 @@ #pragma once -#include +#include #include #include #include diff --git a/run_benchmarks.sh b/run_benchmarks.sh deleted file mode 100755 index d6673a5b76..0000000000 --- a/run_benchmarks.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/sh -entry=$1 -# for i in {0..9}; do -# echo "processing $i random" -# 
mpirun -n $((1 + 2 * $i)) ./build/examples/mp/sparse_benchmark ./dest/ 10000 $((10000 * (1 + 2 * $i))) 0.01 -# done - -# for i in {1..8}; do -# echo "processing $i bench weak" -# mpirun -n $i ./build/examples/mp/sparse_benchmark ./dest/ $(($i * 50000)) 4000 0 -# done - -for i in {1..8}; do - echo "processing $i bench strong" - mpirun -n $i ./build/examples/mp/sparse_benchmark ./dest/ 100000 10000 0 -done - - -# for i in {0..9}; do -# echo "processing $i bench weak" -# mpirun -n $((1 + 2 * $i)) ./build/examples/mp/sparse_benchmark ./dest/ $(((1 + 2 * $i) * 100000)) 2000 0 -# done - -# for i in {0..9}; do -# echo "processing $i bench strong" -# mpirun -n $((1 + 2 * $i)) ./build/examples/mp/sparse_benchmark ./dest/ 100000 10000 0 -# done - -# for i in {0..9}; do -# echo "processing $i $entry" -# mpirun -n $((1 + 2 * $i)) ./build/examples/mp/sparse_benchmark ./dest/ $entry -# done \ No newline at end of file From f2c2fbee29e8fa98aa593eba3cd8201cf4e1af8c Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Fri, 8 Nov 2024 12:00:41 +0100 Subject: [PATCH 36/68] Fix and improve tests --- benchmarks/gbench/mp/gemv.cpp | 12 --- examples/mp/CMakeLists.txt | 11 ++- examples/mp/sparse_matrix.cpp | 33 ++----- examples/mp/sparse_matrix_matrix_mul.cpp | 11 +-- include/dr/detail/communicator.hpp | 7 +- include/dr/mp/algorithms/matrix/gemv.hpp | 47 ++++++---- .../dr/mp/containers/broadcasted_vector.hpp | 4 + .../matrix_formats/csr_eq_distribution.hpp | 4 +- .../matrix_formats/csr_row_distribution.hpp | 8 +- test/gtest/mp/CMakeLists.txt | 2 + test/gtest/mp/gemv.cpp | 92 +++++++++++++++++++ test/gtest/mp/sparse_matrix.cpp | 35 +++++++ 12 files changed, 191 insertions(+), 75 deletions(-) create mode 100644 test/gtest/mp/gemv.cpp create mode 100644 test/gtest/mp/sparse_matrix.cpp diff --git a/benchmarks/gbench/mp/gemv.cpp b/benchmarks/gbench/mp/gemv.cpp index eb1910a85c..c9b4a063b6 100644 --- a/benchmarks/gbench/mp/gemv.cpp +++ b/benchmarks/gbench/mp/gemv.cpp @@ -51,7 +51,6 @@ int 
main(int argc, char **argv) { auto n = std::stoul(argv[2]); auto up = std::stoul(argv[3]); auto down = std::stoul(argv[4]); - // local_data = dr::generate_random_csr({n, m}, density, 42); local_data = dr::generate_band_csr(n, up, down); filenamestream << "mp_band_" << computeSize << "_" << n << "_" << up + down << "_" << local_data.size(); fmt::print("finished loading\n"); @@ -182,7 +181,6 @@ mp::distributed_sparse_matrix< DR_BENCHMARK(GemvEq_DR); static void GemvRow_DR(benchmark::State &state) { - // fft requires usm shared allocation auto local_data = getMatrix(); @@ -229,21 +227,11 @@ static void Gemv_Reference(benchmark::State &state) { double* elems = new double[band_shape[0] * width]; auto input = sycl::malloc_device(band_shape[1] * width, q); auto output = sycl::malloc_device(band_shape[0] * width, q); - // for (int i = 0; i < band_shape[0]; i++) { - // fmt::print("{} {}\n", i, local_data.rowptr_data()[i]); - // } q.memcpy(val_ptr, local_data.values_data(), nnz_count * sizeof(double)).wait(); q.memcpy(col_ptr, local_data.colind_data(), nnz_count * sizeof(long)).wait(); q.memcpy(row_ptr, local_data.rowptr_data(), (band_shape[0] + 1) * sizeof(long)).wait(); q.fill(output, 0, band_shape[0] * width); - // std::copy(policy, local_data.values_data(), local_data.values_data() + nnz_count, val_ptr); - // std::copy(policy, local_data.colind_data(), local_data.colind_data() + nnz_count, col_ptr); - // std::copy(policy, local_data.rowptr_data(), local_data.rowptr_data() + band_shape[0], row_ptr); - std::copy(policy, b.begin(), b.end(), input); - // for (int i = 0; i < band_shape[0]; i++) { - // fmt::print("{} {}\n", i, local_data.rowptr_data()[i + 1] - local_data.rowptr_data()[i]); - // } auto wg = 32; while (width * band_shape[0] * wg > INT_MAX) { diff --git a/examples/mp/CMakeLists.txt b/examples/mp/CMakeLists.txt index f9e76928ae..54e9db9bd0 100644 --- a/examples/mp/CMakeLists.txt +++ b/examples/mp/CMakeLists.txt @@ -22,13 +22,18 @@ function(add_mp_example 
example_name) add_mp_ctest(TEST_NAME ${example_name} NAME ${example_name} NPROC 2) endfunction() +function(add_mp_example_no_test example_name) + add_executable(${example_name} ${example_name}.cpp) + target_link_libraries(${example_name} cxxopts DR::mpi) +endfunction() + add_mp_example(stencil-1d) add_mp_example(stencil-1d-array) add_mp_example(stencil-1d-pointer) add_mp_example(hello_world) -add_mp_example(sparse_matrix) -add_mp_example(sparse_benchmark) -add_mp_example(sparse_matrix_matrix_mul) +add_mp_example_no_test(sparse_matrix) +add_mp_example_no_test(sparse_benchmark) +add_mp_example_no_test(sparse_matrix_matrix_mul) if(OpenMP_FOUND) add_executable(vector-add-ref vector-add-ref.cpp) diff --git a/examples/mp/sparse_matrix.cpp b/examples/mp/sparse_matrix.cpp index 123ea9d2d7..ac22b53008 100644 --- a/examples/mp/sparse_matrix.cpp +++ b/examples/mp/sparse_matrix.cpp @@ -23,9 +23,12 @@ int main(int argc, char **argv) { dr::views::csr_matrix_view local_data; auto root = 0; - // if (root == dr::mp::default_comm().rank()) { - local_data = dr::read_csr(fname); - // } + if (root == dr::mp::default_comm().rank()) { + std::size_t m = 1000; + std::size_t k = 10; + local_data = dr::generate_random_csr({m, k}, 0.1f); + // local_data = dr::read_csr(fname); + } { mp::distributed_sparse_matrix< double, long, dr::mp::MpiBackend, @@ -35,30 +38,6 @@ int main(int argc, char **argv) { double, long, dr::mp::MpiBackend, dr::mp::csr_row_distribution> m_row(local_data, root); - fmt::print("{}\n", m.size()); - // for (int i = 0; i < dr::mp::default_comm().size(); i++) { - // if (dr::mp::default_comm().rank() == i) { - // auto csr_iter = local_data.begin(); - // int j = 0; - // // fmt::print("{}\n", i); - // for (auto [index, val]: m) { - // auto [m, n] = index; - - // auto [index_csr, val_csr] = *csr_iter; - // auto [m_csr, n_csr] = index_csr; - // auto check = m == m_csr && n_csr == n && val == val_csr; - // if (!check) { - // fmt::print("{} {} {} {} {} {} {}\n", j, m, m_csr, n, 
n_csr, val, - // val_csr); - // } - // // assert(check); - // csr_iter++; - // j++; - // } - // } - // m.fence(); - // } - std::vector res(m.shape().first); std::vector res_row(m.shape().first); std::vector a(m.shape().second); diff --git a/examples/mp/sparse_matrix_matrix_mul.cpp b/examples/mp/sparse_matrix_matrix_mul.cpp index 3000f171fc..90f3854cea 100644 --- a/examples/mp/sparse_matrix_matrix_mul.cpp +++ b/examples/mp/sparse_matrix_matrix_mul.cpp @@ -23,13 +23,9 @@ int main(int argc, char **argv) { dr::views::csr_matrix_view local_data; auto root = 0; - // auto n = 50000; - // std::size_t up = n / 10; - // std::size_t down = n / 10; - // local_data = dr::generate_band_csr(n, up, down); - // if (root == dr::mp::default_comm().rank()) { + if (root == dr::mp::default_comm().rank()) { local_data = dr::read_csr(fname); - // } + } { mp::distributed_sparse_matrix< double, long, dr::mp::MpiBackend, @@ -68,9 +64,6 @@ int main(int argc, char **argv) { fmt::print("eq canary {}\n\n", duration * 1000); } } - // if (root == dr::mp::default_comm().rank()) { - // fmt::print("eq gemv time total {}\n", total_time * 1000 / N); - // } m.fence(); total_time = 0; gemv(0, res_row, m_row, allocated_a); diff --git a/include/dr/detail/communicator.hpp b/include/dr/detail/communicator.hpp index b9ec0af9f5..a830ba09cb 100644 --- a/include/dr/detail/communicator.hpp +++ b/include/dr/detail/communicator.hpp @@ -62,11 +62,16 @@ class communicator { } template - void gather(const T *src, T *dst, std::size_t count, std::size_t root) const { + void gather_typed(const T *src, T *dst, std::size_t count, std::size_t root) const { MPI_Gather_c(src, count * sizeof(T), MPI_BYTE, dst, count * sizeof(T), MPI_BYTE, root, mpi_comm_); } + void gather(const void *src, void *dst, std::size_t count, + std::size_t root) const { + MPI_Gather_c(src, count, MPI_BYTE, dst, count, MPI_BYTE, root, mpi_comm_); + } + template void gather(const T &src, std::span dst, std::size_t root) const { assert(rng::size(dst) >= 
size_); diff --git a/include/dr/mp/algorithms/matrix/gemv.hpp b/include/dr/mp/algorithms/matrix/gemv.hpp index a9efb38934..a709bfeaa6 100644 --- a/include/dr/mp/algorithms/matrix/gemv.hpp +++ b/include/dr/mp/algorithms/matrix/gemv.hpp @@ -23,24 +23,7 @@ void gemv(int root, C &res, assert(a.shape().first == res.size()); assert(a.shape().second == b.size()); } - // copy b to all machines - // auto communicator = default_comm(); - // __detail::allocator alloc; - // auto broadcasted_b = alloc.allocate(a.shape().second); - // if (communicator.rank() == root) { - // rng::copy(b.begin(), b.end(), broadcasted_b); - // } - - // communicator.bcast(broadcasted_b, a.shape().second * sizeof(T), root); a.local_gemv_and_collect(root, res, b.broadcasted_data(), 1); - - // alloc.deallocate(broadcasted_b, a.shape().second); - // a.fence(); - // if (default_comm().rank() == root) { - // for (int i = 0; i < a.shape().first; i++) { - // fmt::print("Result {} {}\n", i, res[i]); - // } - // } } template C, typename Alloc, @@ -54,4 +37,34 @@ void gemv(int root, C &res, a.local_gemv_and_collect(root, res, b.broadcasted_data(), b.width()); } +template C, typename Alloc, + typename Backend, typename MatDistr> + requires(vector_multiplicable) +void gemv(C &res, distributed_sparse_matrix &a, broadcasted_vector b) { + std::vector workspace(res.size()); + gemv(0, workspace, a, b); + auto tmp = new T[res.size()]; + if (default_comm().rank() == 0) { + std::copy(workspace.begin(), workspace.end(), tmp); + } + default_comm().bcast(tmp, sizeof(T) * res.size(), 0); + std::copy(tmp, tmp + res.size(), res.begin()); + delete[] tmp; +} + +template C, typename Alloc, + typename Backend, typename MatDistr> + requires(vector_multiplicable) +void gemv(C &res, distributed_sparse_matrix &a, broadcasted_slim_matrix b) { + std::vector workspace(res.size()); + gemv(0, workspace, a, b); + auto tmp = new T[res.size()]; + if (default_comm().rank() == 0) { + std::copy(workspace.begin(), workspace.end(), tmp); + } + 
default_comm().bcast(tmp, sizeof(T) * res.size(), 0); + std::copy(tmp, tmp + res.size(), res.begin()); + delete[] tmp; +} + } // namespace dr::mp diff --git a/include/dr/mp/containers/broadcasted_vector.hpp b/include/dr/mp/containers/broadcasted_vector.hpp index abe9c09a12..a06ddf2e92 100644 --- a/include/dr/mp/containers/broadcasted_vector.hpp +++ b/include/dr/mp/containers/broadcasted_vector.hpp @@ -47,6 +47,10 @@ class broadcasted_vector { auto size() { return _data_size; } + + auto begin() const { return _data; } + auto end() const { return begin() + _data_size; } + private: T* _data = nullptr; std::size_t _data_size = 0; diff --git a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp index abf239926d..d8d7c4c0f0 100644 --- a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp @@ -186,7 +186,7 @@ class csr_eq_distribution { } auto gathered_res = alloc.allocate(max_row_size_ * vals_width); communicator.gatherv(partial_res, counts, offsets, gathered_res, root); - // communicator.gather(partial_res, gathered_res, max_row_size_ * vals_width, root); + // communicator.gather_typed(partial_res, gathered_res, max_row_size_ * vals_width, root); T* gathered_res_host; if (use_sycl()) { @@ -222,7 +222,7 @@ class csr_eq_distribution { delete[] offsets; alloc.deallocate(gathered_res, max_row_size_ * vals_width); } else { - // communicator.gather(partial_res, static_cast(nullptr), max_row_size_ * vals_width, + // communicator.gather_typed(partial_res, static_cast(nullptr), max_row_size_ * vals_width, // root); communicator.gatherv(partial_res, counts, nullptr, nullptr, root); } diff --git a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp index 6525db7780..c6cb46955b 100644 --- 
a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp @@ -164,16 +164,16 @@ class csr_row_distribution { if (communicator.rank() == root) { auto scratch = alloc.allocate(segment_size_ * communicator.size() * vals_width); - communicator.gather(partial_res, scratch, segment_size_ * vals_width, root); + communicator.gather_typed(partial_res, scratch, segment_size_ * vals_width, root); T* temp = nullptr; if (use_sycl()) { temp = new T[res.size()]; } for (auto j = 0; j < communicator.size(); j++) { - if (j * segment_size_ >= shape_.second) { + if (j * segment_size_ >= shape_.first) { break; } - auto comm_segment_size = std::min(segment_size_, shape_.second - j * segment_size_); + auto comm_segment_size = std::min(segment_size_, shape_.first - j * segment_size_); for (auto i = 0; i < vals_width; i++) { auto piece_start = scratch + j * vals_width * segment_size_ + i * segment_size_; @@ -198,7 +198,7 @@ class csr_row_distribution { // } alloc.deallocate(scratch, segment_size_ * communicator.size()* vals_width); } else { - communicator.gather(partial_res, static_cast(nullptr), segment_size_ * vals_width, + communicator.gather_typed(partial_res, static_cast(nullptr), segment_size_ * vals_width, root); } } diff --git a/test/gtest/mp/CMakeLists.txt b/test/gtest/mp/CMakeLists.txt index 32f26d120a..f4f65e4776 100644 --- a/test/gtest/mp/CMakeLists.txt +++ b/test/gtest/mp/CMakeLists.txt @@ -36,6 +36,7 @@ add_executable( communicator.cpp copy.cpp distributed_vector.cpp + gemv.cpp halo.cpp mdstar.cpp mpsort.cpp @@ -43,6 +44,7 @@ add_executable( stencil.cpp segments.cpp slide_view.cpp + sparse_matrix.cpp wave_kernel.cpp) add_executable( diff --git a/test/gtest/mp/gemv.cpp b/test/gtest/mp/gemv.cpp new file mode 100644 index 0000000000..2ca7de5ed4 --- /dev/null +++ b/test/gtest/mp/gemv.cpp @@ -0,0 +1,92 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + 
+#include "xp-tests.hpp" +auto testMatrixGemv(std::size_t m, std::size_t k, auto &a) { + std::vector base_b(k, 1.f); + std::vector c(m, 0.f); + + dr::mp::broadcasted_vector allocated_b; + allocated_b.broadcast_data(k, 0, base_b, dr::mp::default_comm()); + + std::vector res(m, 0.0f); + dr::mp::gemv(c, a, allocated_b); + + std::vector c_ref(m, 0.f); + + for (auto &&[index, v] : a) { + auto &&[i, k] = index; + + c_ref[i] += v; + } + + EXPECT_TRUE(fp_equal(c_ref, c)) + << fmt::format("Reference:\n {}\nActual:\n {}\n", c_ref, c); +} + +TEST(SparseMatrix, GemvRow) { + std::size_t m = 100; + std::size_t k = 100; + auto csr = dr::generate_random_csr({m, k}, 0.1f); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_row_distribution> + a(csr, 0); + testMatrixGemv(m, k, a); +} + +TEST(SparseMatrix, GemvEq) { + std::size_t m = 100; + std::size_t k = 100; + auto csr = dr::generate_random_csr({m, k}, 0.1f); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_eq_distribution> + a(csr, 0); + testMatrixGemv(m, k, a); +} + +TEST(SparseMatrix, GemvRowNotSquare) { + std::size_t m = 1000; + std::size_t k = 10; + auto csr = dr::generate_random_csr({m, k}, 0.1f); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_row_distribution> + a(csr, 0); + testMatrixGemv(m, k, a); +} + +TEST(SparseMatrix, GemvEqNotSquare) { + std::size_t m = 1000; + std::size_t k = 10; + auto csr = dr::generate_random_csr({m, k}, 0.1f); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_eq_distribution> + a(csr, 0); + testMatrixGemv(m, k, a); +} + +TEST(SparseMatrix, GemvRowNotSquareDifferentAxis) { + std::size_t m = 10; + std::size_t k = 1000; + auto csr = dr::generate_random_csr({m, k}, 0.1f); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_row_distribution> + a(csr, 0); + testMatrixGemv(m, k, 
a); +} + +TEST(SparseMatrix, GemvEqNotSquareDifferentAxis) { + std::size_t m = 10; + std::size_t k = 1000; + auto csr = dr::generate_random_csr({m, k}, 0.1f); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_eq_distribution> + a(csr, 0); + testMatrixGemv(m, k, a); +} \ No newline at end of file diff --git a/test/gtest/mp/sparse_matrix.cpp b/test/gtest/mp/sparse_matrix.cpp new file mode 100644 index 0000000000..cd6af833c4 --- /dev/null +++ b/test/gtest/mp/sparse_matrix.cpp @@ -0,0 +1,35 @@ +#include "xp-tests.hpp" +auto testMatrixIter(auto& src, auto &matrix) { + EXPECT_TRUE(src.size() == matrix.size()); + auto iterCsr = src.begin(); + auto iterMatrix = matrix.begin(); + std::map, double> entries; + for (auto (index, val): iterCsr) { + entries[index] = val; + } + for (auto (index, val): iterMatrix) { + EXPECT_TRUE(val == entries[index]); + } +} + +TEST(SparseMatrix, IterRow) { + std::size_t m = 100; + std::size_t k = 100; + auto csr = dr::generate_random_csr({m, k}, 0.1f); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_row_distribution> + a(csr, 0); + testMatrixIter(csr, a); +} + +TEST(SparseMatrix, IterEq) { + std::size_t m = 100; + std::size_t k = 100; + auto csr = dr::generate_random_csr({m, k}, 0.1f); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_eq_distribution> + a(csr, 0); + testMatrixIter(csr, a); +} From 3a00d614b5c17c38b6225d50b30a85add33707a2 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Fri, 8 Nov 2024 14:11:54 +0100 Subject: [PATCH 37/68] Add tests for sparse gemm in mp --- examples/mp/sparse_matrix_matrix_mul.cpp | 10 +- .../matrix_formats/csr_eq_distribution.hpp | 3 +- test/gtest/mp/gemv.cpp | 97 ++++++++++++++++++- test/gtest/mp/sparse_matrix.cpp | 49 ++++++++-- 4 files changed, 147 insertions(+), 12 deletions(-) diff --git a/examples/mp/sparse_matrix_matrix_mul.cpp 
b/examples/mp/sparse_matrix_matrix_mul.cpp index 90f3854cea..9d6fd1ebb0 100644 --- a/examples/mp/sparse_matrix_matrix_mul.cpp +++ b/examples/mp/sparse_matrix_matrix_mul.cpp @@ -24,7 +24,10 @@ int main(int argc, char **argv) { dr::views::csr_matrix_view local_data; auto root = 0; if (root == dr::mp::default_comm().rank()) { - local_data = dr::read_csr(fname); + std::size_t m = 10; + std::size_t k = 1000; + local_data = dr::generate_random_csr({m, k}, 0.1f); + // local_data = dr::read_csr(fname); } { mp::distributed_sparse_matrix< @@ -37,7 +40,7 @@ int main(int argc, char **argv) { m_row(local_data, root); fmt::print("{}\n", m.size()); - auto width = 6; + auto width = 3; std::vector res(m.shape().first * width); std::vector res_row(m.shape().first * width); std::vector base_a(m.shape().second * width); @@ -85,12 +88,13 @@ int main(int argc, char **argv) { std::vector ref(m.shape().first * width); auto res_col_len = m.shape().first; + auto in_len = m.shape().second; if (dr::mp::default_comm().rank() == 0) { for (auto a : local_data) { auto [index, val] = a; auto [m, n] = index; for (int i = 0; i < width; i++) { - ref[m + i * res_col_len] += base_a[n + i * res_col_len] * val; + ref[m + i * res_col_len] += base_a[n + i * in_len] * val; } } for (int i = 0; i < m.shape().first * width; i++) { diff --git a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp index d8d7c4c0f0..817023ea4f 100644 --- a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp @@ -200,6 +200,7 @@ class csr_eq_distribution { // auto begin = std::chrono::high_resolution_clock::now(); + for (auto k = 0; k < vals_width; k++) { auto current_offset = 0; for (auto i = 0; i < communicator.size(); i++) { @@ -207,7 +208,7 @@ class csr_eq_distribution { auto last_row = row_offsets_[i] + row_sizes_[i]; auto row_size = row_sizes_[i]; for (auto j = 
first_row; j < last_row; j++) { - res[j + k * shape_[1]] += gathered_res_host[vals_width * current_offset + k * row_size + j - first_row]; + res[j + k * shape_[0]] += gathered_res_host[vals_width * current_offset + k * row_size + j - first_row]; } current_offset += row_sizes_[i]; } diff --git a/test/gtest/mp/gemv.cpp b/test/gtest/mp/gemv.cpp index 2ca7de5ed4..3afb96bbe7 100644 --- a/test/gtest/mp/gemv.cpp +++ b/test/gtest/mp/gemv.cpp @@ -10,7 +10,6 @@ auto testMatrixGemv(std::size_t m, std::size_t k, auto &a) { dr::mp::broadcasted_vector allocated_b; allocated_b.broadcast_data(k, 0, base_b, dr::mp::default_comm()); - std::vector res(m, 0.0f); dr::mp::gemv(c, a, allocated_b); std::vector c_ref(m, 0.f); @@ -25,6 +24,33 @@ auto testMatrixGemv(std::size_t m, std::size_t k, auto &a) { << fmt::format("Reference:\n {}\nActual:\n {}\n", c_ref, c); } +auto testMatrixGemm(std::size_t m, std::size_t n, auto &a, std::size_t width) { + std::vector base_b(n * width); + std::vector c(m * width, 0.f); + + for (auto i = 0; i < n * width; i++) { + base_b[i] = i; + } + + dr::mp::broadcasted_slim_matrix allocated_b; + allocated_b.broadcast_data(n, width, 0, base_b, dr::mp::default_comm()); + + dr::mp::gemv(c, a, allocated_b); + + std::vector c_ref(m * width, 0.f); + + for (auto &&[index, v] : a) { + auto &&[i, k] = index; + + for (auto j = 0; j < width; j++) { + c_ref[i + j * m] += v * base_b[k + j * n]; + } + } + + EXPECT_TRUE(fp_equal(c_ref, c)) + << fmt::format("Reference:\n {}\nActual:\n {}\n", c_ref, c); +} + TEST(SparseMatrix, GemvRow) { std::size_t m = 100; std::size_t k = 100; @@ -89,4 +115,71 @@ TEST(SparseMatrix, GemvEqNotSquareDifferentAxis) { dr::mp::csr_eq_distribution> a(csr, 0); testMatrixGemv(m, k, a); -} \ No newline at end of file +} + + +TEST(SparseMatrix, GemmRow) { + std::size_t m = 100; + std::size_t k = 100; + auto csr = dr::generate_random_csr({m, k}, 0.1f); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + 
dr::mp::csr_row_distribution> + a(csr, 0); + testMatrixGemm(m, k, a, 20); +} + +TEST(SparseMatrix, GemmEq) { + std::size_t m = 100; + std::size_t k = 100; + auto csr = dr::generate_random_csr({m, k}, 0.1f); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_eq_distribution> + a(csr, 0); + testMatrixGemm(m, k, a, 20); +} + +TEST(SparseMatrix, GemmRowNotSquare) { + std::size_t m = 1000; + std::size_t k = 10; + auto csr = dr::generate_random_csr({m, k}, 0.1f); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_row_distribution> + a(csr, 0); + testMatrixGemm(m, k, a, 20); +} + +TEST(SparseMatrix, GemmEqNotSquare) { + std::size_t m = 1000; + std::size_t k = 10; + auto csr = dr::generate_random_csr({m, k}, 0.1f); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_eq_distribution> + a(csr, 0); + testMatrixGemm(m, k, a, 20); +} + +TEST(SparseMatrix, GemmRowNotSquareDifferentAxis) { + std::size_t m = 10; + std::size_t k = 1000; + auto csr = dr::generate_random_csr({m, k}, 0.1f); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_row_distribution> + a(csr, 0); + testMatrixGemm(m, k, a, 20); +} + +TEST(SparseMatrix, GemmEqNotSquareDifferentAxis) { + std::size_t m = 10; + std::size_t k = 1000; + auto csr = dr::generate_random_csr({m, k}, 0.1f); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_eq_distribution> + a(csr, 0); + testMatrixGemm(m, k, a, 20); +} diff --git a/test/gtest/mp/sparse_matrix.cpp b/test/gtest/mp/sparse_matrix.cpp index cd6af833c4..92a8517db1 100644 --- a/test/gtest/mp/sparse_matrix.cpp +++ b/test/gtest/mp/sparse_matrix.cpp @@ -1,17 +1,54 @@ #include "xp-tests.hpp" auto testMatrixIter(auto& src, auto &matrix) { EXPECT_TRUE(src.size() == matrix.size()); - auto iterCsr = src.begin(); - auto iterMatrix = matrix.begin(); std::map, double> 
entries; - for (auto (index, val): iterCsr) { - entries[index] = val; + for (auto [index, val]: src) { + entries[{index.first, index.second}] = val; } - for (auto (index, val): iterMatrix) { - EXPECT_TRUE(val == entries[index]); + for (auto [index, val]: matrix) { + EXPECT_TRUE((val == entries[{index.first, index.second}])); } } +TEST(SparseMatrix, staticAssertEq) { + std::size_t m = 100; + std::size_t k = 100; + using Dist = dr::mp::csr_eq_distribution; + static_assert(dr::mp::matrix_distibution); + static_assert(dr::mp::vector_multiplicable); + auto csr = dr::generate_random_csr({m, k}, 0.1f); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend> + a(csr, 0); + static_assert(std::forward_iterator); + static_assert(std::forward_iterator); + static_assert(std::forward_iterator); + static_assert(std::forward_iterator); + using Matrix = decltype(a); + static_assert(rng::forward_range); + static_assert(dr::distributed_range); +} + +TEST(SparseMatrix, staticAssertRow) { + std::size_t m = 100; + std::size_t k = 100; + using Dist = dr::mp::csr_row_distribution; + static_assert(dr::mp::matrix_distibution); + static_assert(dr::mp::vector_multiplicable); + auto csr = dr::generate_random_csr({m, k}, 0.1f); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend> + a(csr, 0); + static_assert(std::forward_iterator); + static_assert(std::forward_iterator); + static_assert(std::forward_iterator); + static_assert(std::forward_iterator); + using Matrix = decltype(a); + static_assert(rng::forward_range); + static_assert(dr::distributed_range); +} + + TEST(SparseMatrix, IterRow) { std::size_t m = 100; std::size_t k = 100; From 2ec4b219e515b78f4a3a5a0f893dafee5f63692e Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Fri, 8 Nov 2024 16:21:23 +0100 Subject: [PATCH 38/68] Reformat changes in mp matrix #2 --- benchmarks/gbench/mp/gemv.cpp | 278 +++++++++--------- examples/mp/sparse_benchmark.cpp | 161 +++++----- 
examples/mp/sparse_matrix.cpp | 14 +- examples/mp/sparse_matrix_matrix_mul.cpp | 22 +- include/dr/detail/communicator.hpp | 5 +- include/dr/detail/generate_csr.hpp | 50 ++-- include/dr/detail/matrix_io.hpp | 33 ++- include/dr/mp.hpp | 2 + include/dr/mp/algorithms/matrix/gemv.hpp | 17 +- .../mp/containers/broadcasted_slim_matrix.hpp | 131 ++++----- .../dr/mp/containers/broadcasted_vector.hpp | 82 +++--- .../containers/distributed_sparse_matrix.hpp | 11 +- .../dr/mp/containers/distributed_vector.hpp | 21 +- .../matrix_formats/csr_eq_distribution.hpp | 160 ++++------ .../matrix_formats/csr_row_distribution.hpp | 157 ++++------ test/gtest/mp/gemv.cpp | 99 +++---- test/gtest/mp/sparse_matrix.cpp | 55 ++-- test/gtest/sp/gemv.cpp | 6 +- test/gtest/sp/sparse.cpp | 4 +- 19 files changed, 628 insertions(+), 680 deletions(-) diff --git a/benchmarks/gbench/mp/gemv.cpp b/benchmarks/gbench/mp/gemv.cpp index c9b4a063b6..514e21e5ac 100644 --- a/benchmarks/gbench/mp/gemv.cpp +++ b/benchmarks/gbench/mp/gemv.cpp @@ -1,12 +1,15 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause #include "mpi.h" #include "dr/mp.hpp" -#include -#include #include -#include +#include #include +#include +#include #ifdef STANDALONE_BENCHMARK @@ -31,12 +34,15 @@ int main(int argc, char **argv) { MPI_Comm_size(comm, &comm_size); if (argc != 3 && argc != 5) { - fmt::print("usage: ./sparse_benchmark [test outcome dir] [matrix market file], or ./sparse_benchmark [test outcome dir] [number of rows] [number of columns] [number of lower bands] [number of upper bands]\n"); + fmt::print( + "usage: ./sparse_benchmark [test outcome dir] [matrix market file], or " + "./sparse_benchmark [test outcome dir] [number of rows] [number of " + "columns] [number of lower bands] [number of upper bands]\n"); return 1; } - + #ifdef SYCL_LANGUAGE_VERSION - sycl::queue q = dr::mp::select_queue(); + sycl::queue q = dr::mp::select_queue(); mp::init(q); #else mp::init(); @@ -47,86 +53,88 @@ 
int main(int argc, char **argv) { auto computeSize = dr::mp::default_comm().size(); if (root == dr::mp::default_comm().rank()) { if (argc == 5) { - fmt::print("started loading\n"); - auto n = std::stoul(argv[2]); - auto up = std::stoul(argv[3]); - auto down = std::stoul(argv[4]); - local_data = dr::generate_band_csr(n, up, down); - filenamestream << "mp_band_" << computeSize << "_" << n << "_" << up + down << "_" << local_data.size(); - fmt::print("finished loading\n"); - } - else { - fmt::print("started loading\n"); - std::string fname(argv[2]); - std::filesystem::path p(argv[2]); - local_data = dr::read_csr(fname); - filenamestream << "mp_" << p.stem().string() << "_" << computeSize << "_" << local_data.size(); - fmt::print("finished loading\n"); + fmt::print("started loading\n"); + auto n = std::stoul(argv[2]); + auto up = std::stoul(argv[3]); + auto down = std::stoul(argv[4]); + local_data = dr::generate_band_csr(n, up, down); + filenamestream << "mp_band_" << computeSize << "_" << n << "_" + << up + down << "_" << local_data.size(); + fmt::print("finished loading\n"); + } else { + fmt::print("started loading\n"); + std::string fname(argv[2]); + std::filesystem::path p(argv[2]); + local_data = dr::read_csr(fname); + filenamestream << "mp_" << p.stem().string() << "_" << computeSize << "_" + << local_data.size(); + fmt::print("finished loading\n"); } } std::string resname; -mp::distributed_sparse_matrix< - double, long, dr::mp::MpiBackend, - dr::mp::csr_eq_distribution> - m_eq(local_data, root); -mp::distributed_sparse_matrix< - double, long, dr::mp::MpiBackend, - dr::mp::csr_row_distribution> - m_row(local_data, root); + mp::distributed_sparse_matrix< + double, long, dr::mp::MpiBackend, + dr::mp::csr_eq_distribution> + m_eq(local_data, root); + mp::distributed_sparse_matrix< + double, long, dr::mp::MpiBackend, + dr::mp::csr_row_distribution> + m_row(local_data, root); fmt::print("finished distribution\n"); - std::vector eq_duration; - std::vector row_duration; 
- - auto N = 10; - std::vector b; - b.reserve(m_row.shape().second); - std::vector res(m_row.shape().first); - for (auto i = 0; i < m_row.shape().second; i++) { - b.push_back(i); - } + std::vector eq_duration; + std::vector row_duration; - dr::mp::broadcasted_vector allocated_b; - allocated_b.broadcast_data(m_row.shape().second, 0, b, dr::mp::default_comm()); + auto N = 10; + std::vector b; + b.reserve(m_row.shape().second); + std::vector res(m_row.shape().first); + for (auto i = 0; i < m_row.shape().second; i++) { + b.push_back(i); + } - fmt::print("started initial gemv distribution\n"); - gemv(0, res, m_eq, allocated_b); // it is here to prepare sycl for work + dr::mp::broadcasted_vector allocated_b; + allocated_b.broadcast_data(m_row.shape().second, 0, b, + dr::mp::default_comm()); - fmt::print("finished initial gemv distribution\n"); - for (auto i = 0; i < N; i++) { - auto begin = std::chrono::high_resolution_clock::now(); - gemv(0, res, m_eq, allocated_b); - auto end = std::chrono::high_resolution_clock::now(); - double duration = std::chrono::duration(end - begin).count() * 1000; - eq_duration.push_back(duration); - } - - gemv(0, res, m_row, allocated_b); // it is here to prepare sycl for work - for (auto i = 0; i < N; i++) { - auto begin = std::chrono::high_resolution_clock::now(); - gemv(0, res, m_row, allocated_b); - auto end = std::chrono::high_resolution_clock::now(); - double duration = std::chrono::duration(end - begin).count() * 1000; - row_duration.push_back(duration); - } + fmt::print("started initial gemv distribution\n"); + gemv(0, res, m_eq, allocated_b); // it is here to prepare sycl for work - if (root == dr::mp::default_comm().rank()) { - std::string tmp; - filenamestream >> tmp; - std::filesystem::path p(argv[1]); - p += tmp; - p += ".csv"; - std::ofstream write_stream(p.string()); - write_stream << eq_duration.front(); - for (auto i = 1; i < N; i++) { - write_stream << "," << eq_duration[i]; - } - write_stream << "\n"; - write_stream << 
row_duration.front(); - for (auto i = 1; i < N; i++) { - write_stream << "," << row_duration[i]; - } - write_stream << "\n"; + fmt::print("finished initial gemv distribution\n"); + for (auto i = 0; i < N; i++) { + auto begin = std::chrono::high_resolution_clock::now(); + gemv(0, res, m_eq, allocated_b); + auto end = std::chrono::high_resolution_clock::now(); + double duration = std::chrono::duration(end - begin).count() * 1000; + eq_duration.push_back(duration); + } + + gemv(0, res, m_row, allocated_b); // it is here to prepare sycl for work + for (auto i = 0; i < N; i++) { + auto begin = std::chrono::high_resolution_clock::now(); + gemv(0, res, m_row, allocated_b); + auto end = std::chrono::high_resolution_clock::now(); + double duration = std::chrono::duration(end - begin).count() * 1000; + row_duration.push_back(duration); + } + + if (root == dr::mp::default_comm().rank()) { + std::string tmp; + filenamestream >> tmp; + std::filesystem::path p(argv[1]); + p += tmp; + p += ".csv"; + std::ofstream write_stream(p.string()); + write_stream << eq_duration.front(); + for (auto i = 1; i < N; i++) { + write_stream << "," << eq_duration[i]; } + write_stream << "\n"; + write_stream << row_duration.front(); + for (auto i = 1; i < N; i++) { + write_stream << "," << row_duration[i]; + } + write_stream << "\n"; + } allocated_b.destroy_data(); mp::finalize(); } @@ -134,10 +142,10 @@ mp::distributed_sparse_matrix< #else namespace { - std::size_t getWidth() { - return 8;//default_vector_size / 100000; - } +std::size_t getWidth() { + return 8; // default_vector_size / 100000; } +} // namespace static auto getMatrix() { std::size_t n = std::sqrt(default_vector_size / 100000) * 50000; // std::size_t n = default_vector_size / 2; @@ -145,28 +153,29 @@ static auto getMatrix() { std::size_t down = n / 50; // assert(dr::mp::use_sycl()); // assert(dr::mp::sycl_mem_kind() == sycl::usm::alloc::device); - return dr::generate_band_csr(n, up, down); + return dr::generate_band_csr(n, up, 
down); - // return dr::read_csr("/home/komarmik/examples/soc-LiveJournal1.mtx"); - // return dr::read_csr("/home/komarmik/examples/mycielskian18.mtx"); - // return dr::read_csr("/home/komarmik/examples/mawi_201512020030.mtx"); + // return dr::read_csr("/home/komarmik/examples/soc-LiveJournal1.mtx"); return + // dr::read_csr("/home/komarmik/examples/mycielskian18.mtx"); + // return dr::read_csr("/home/komarmik/examples/mawi_201512020030.mtx"); } static void GemvEq_DR(benchmark::State &state) { auto local_data = getMatrix(); - -mp::distributed_sparse_matrix< - double, long, dr::mp::MpiBackend, - dr::mp::csr_eq_distribution> - m(local_data, 0); + mp::distributed_sparse_matrix< + double, long, dr::mp::MpiBackend, + dr::mp::csr_eq_distribution> + m(local_data, 0); auto n = m.shape()[1]; auto width = getWidth(); std::vector base_a(n * width); for (int j = 0; j < width; j++) { - for (int i = 0; i < n; i++) { - base_a[i + j * n] = i*j + 1; - } + for (int i = 0; i < n; i++) { + base_a[i + j * n] = i * j + 1; + } } dr::mp::broadcasted_slim_matrix allocated_a; allocated_a.broadcast_data(n, width, 0, base_a, dr::mp::default_comm()); @@ -183,18 +192,17 @@ DR_BENCHMARK(GemvEq_DR); static void GemvRow_DR(benchmark::State &state) { auto local_data = getMatrix(); - mp::distributed_sparse_matrix< - double, long, dr::mp::MpiBackend, - dr::mp::csr_row_distribution> - m(local_data, 0); + double, long, dr::mp::MpiBackend, + dr::mp::csr_row_distribution> + m(local_data, 0); auto n = m.shape()[1]; auto width = getWidth(); std::vector base_a(n * width); for (int j = 0; j < width; j++) { - for (int i = 0; i < n; i++) { - base_a[i + j * n] = i*j + 1; - } + for (int i = 0; i < n; i++) { + base_a[i + j * n] = i * j + 1; + } } dr::mp::broadcasted_slim_matrix allocated_a; allocated_a.broadcast_data(n, width, 0, base_a, dr::mp::default_comm()); @@ -208,8 +216,6 @@ static void GemvRow_DR(benchmark::State &state) { DR_BENCHMARK(GemvRow_DR); - - static void Gemv_Reference(benchmark::State &state) { 
auto local_data = getMatrix(); auto nnz_count = local_data.size(); @@ -222,14 +228,17 @@ static void Gemv_Reference(benchmark::State &state) { std::vector b; auto width = getWidth(); for (auto i = 0; i < band_shape[1] * width; i++) { - b.push_back(i); + b.push_back(i); } - double* elems = new double[band_shape[0] * width]; + double *elems = new double[band_shape[0] * width]; auto input = sycl::malloc_device(band_shape[1] * width, q); auto output = sycl::malloc_device(band_shape[0] * width, q); - q.memcpy(val_ptr, local_data.values_data(), nnz_count * sizeof(double)).wait(); + q.memcpy(val_ptr, local_data.values_data(), nnz_count * sizeof(double)) + .wait(); q.memcpy(col_ptr, local_data.colind_data(), nnz_count * sizeof(long)).wait(); - q.memcpy(row_ptr, local_data.rowptr_data(), (band_shape[0] + 1) * sizeof(long)).wait(); + q.memcpy(row_ptr, local_data.rowptr_data(), + (band_shape[0] + 1) * sizeof(long)) + .wait(); q.fill(output, 0, band_shape[0] * width); std::copy(policy, b.begin(), b.end(), input); @@ -238,33 +247,36 @@ static void Gemv_Reference(benchmark::State &state) { wg /= 2; } assert(wg > 0); - + for (auto _ : state) { if (dr::mp::use_sycl()) { - dr::mp::sycl_queue().submit([&](auto &&h) { - h.parallel_for(sycl::nd_range<1>(width * band_shape[0] * wg, wg), [=](auto item) { - auto input_j = item.get_group(0) / band_shape[0]; - auto idx = item.get_group(0) % band_shape[0]; - auto local_id = item.get_local_id(); - auto group_size = item.get_local_range(0); - double sum = 0; - auto start = row_ptr[idx]; - auto end = row_ptr[idx + 1]; - for (auto i = start + local_id; i < end; i += group_size) { - auto colNum = col_ptr[i]; - auto vectorVal = input[colNum + input_j * band_shape[1]]; - auto matrixVal = val_ptr[i]; - sum += matrixVal * vectorVal; - } - sycl::atomic_ref - c_ref(output[idx + band_shape[0] * input_j]); - c_ref += sum; - }); - }).wait(); + dr::mp::sycl_queue() + .submit([&](auto &&h) { + h.parallel_for( + sycl::nd_range<1>(width * band_shape[0] * wg, 
wg), + [=](auto item) { + auto input_j = item.get_group(0) / band_shape[0]; + auto idx = item.get_group(0) % band_shape[0]; + auto local_id = item.get_local_id(); + auto group_size = item.get_local_range(0); + double sum = 0; + auto start = row_ptr[idx]; + auto end = row_ptr[idx + 1]; + for (auto i = start + local_id; i < end; i += group_size) { + auto colNum = col_ptr[i]; + auto vectorVal = input[colNum + input_j * band_shape[1]]; + auto matrixVal = val_ptr[i]; + sum += matrixVal * vectorVal; + } + sycl::atomic_ref + c_ref(output[idx + band_shape[0] * input_j]); + c_ref += sum; + }); + }) + .wait(); q.memcpy(elems, output, band_shape[0] * sizeof(double) * width).wait(); - } - else { + } else { std::fill(elems, elems + band_shape[0] * width, 0); auto local_rows = local_data.rowptr_data(); auto row_i = 0; @@ -293,16 +305,14 @@ static void Gemv_Reference(benchmark::State &state) { sycl::free(output, q); } -static void GemvEq_Reference(benchmark::State &state) { - Gemv_Reference(state); -} +static void GemvEq_Reference(benchmark::State &state) { Gemv_Reference(state); } static void GemvRow_Reference(benchmark::State &state) { - Gemv_Reference(state); + Gemv_Reference(state); } DR_BENCHMARK(GemvEq_Reference); DR_BENCHMARK(GemvRow_Reference); -#endif \ No newline at end of file +#endif diff --git a/examples/mp/sparse_benchmark.cpp b/examples/mp/sparse_benchmark.cpp index 2e49873619..b53c0ee477 100644 --- a/examples/mp/sparse_benchmark.cpp +++ b/examples/mp/sparse_benchmark.cpp @@ -3,11 +3,11 @@ // SPDX-License-Identifier: BSD-3-Clause #include -#include -#include #include -#include +#include #include +#include +#include namespace mp = dr::mp; @@ -23,12 +23,14 @@ int main(int argc, char **argv) { MPI_Comm_size(comm, &comm_size); if (argc != 3 && argc != 5) { - fmt::print("usage: ./sparse_benchmark [test outcome dir] [matrix market file], or ./sparse_benchmark [test outcome dir] [number of rows] [number of columns] [density]\n"); + fmt::print("usage: ./sparse_benchmark 
[test outcome dir] [matrix market " + "file], or ./sparse_benchmark [test outcome dir] [number of " + "rows] [number of columns] [density]\n"); return 1; } - + #ifdef SYCL_LANGUAGE_VERSION - sycl::queue q = dr::mp::select_queue(); + sycl::queue q = dr::mp::select_queue(); mp::init(q); #else mp::init(); @@ -39,87 +41,90 @@ int main(int argc, char **argv) { auto computeSize = dr::mp::default_comm().size(); if (root == dr::mp::default_comm().rank()) { if (argc == 5) { - fmt::print("started loading\n"); - auto n = std::stoul(argv[2]); - auto up = std::stoul(argv[3]); - auto down = std::stoul(argv[4]); - // local_data = dr::generate_random_csr({n, m}, density, 42); - local_data = dr::generate_band_csr(n, up, down); - filenamestream << "mp_band_" << computeSize << "_" << n << "_" << up + down << "_" << local_data.size(); - fmt::print("finished loading\n"); - } - else { - fmt::print("started loading\n"); - std::string fname(argv[2]); - std::filesystem::path p(argv[2]); - local_data = dr::read_csr(fname); - filenamestream << "mp_" << p.stem().string() << "_" << computeSize << "_" << local_data.size(); - fmt::print("finished loading\n"); + fmt::print("started loading\n"); + auto n = std::stoul(argv[2]); + auto up = std::stoul(argv[3]); + auto down = std::stoul(argv[4]); + // local_data = dr::generate_random_csr({n, m}, density, + // 42); + local_data = dr::generate_band_csr(n, up, down); + filenamestream << "mp_band_" << computeSize << "_" << n << "_" + << up + down << "_" << local_data.size(); + fmt::print("finished loading\n"); + } else { + fmt::print("started loading\n"); + std::string fname(argv[2]); + std::filesystem::path p(argv[2]); + local_data = dr::read_csr(fname); + filenamestream << "mp_" << p.stem().string() << "_" << computeSize << "_" + << local_data.size(); + fmt::print("finished loading\n"); } } std::string resname; -mp::distributed_sparse_matrix< - double, long, dr::mp::MpiBackend, - dr::mp::csr_eq_distribution> - m_eq(local_data, root); 
-mp::distributed_sparse_matrix< - double, long, dr::mp::MpiBackend, - dr::mp::csr_row_distribution> - m_row(local_data, root); + mp::distributed_sparse_matrix< + double, long, dr::mp::MpiBackend, + dr::mp::csr_eq_distribution> + m_eq(local_data, root); + mp::distributed_sparse_matrix< + double, long, dr::mp::MpiBackend, + dr::mp::csr_row_distribution> + m_row(local_data, root); fmt::print("finished distribution\n"); - std::vector eq_duration; - std::vector row_duration; + std::vector eq_duration; + std::vector row_duration; - auto N = 10; - std::vector b; - b.reserve(m_row.shape().second); - std::vector res(m_row.shape().first); - for (auto i = 0; i < m_row.shape().second; i++) { - b.push_back(i); - } + auto N = 10; + std::vector b; + b.reserve(m_row.shape().second); + std::vector res(m_row.shape().first); + for (auto i = 0; i < m_row.shape().second; i++) { + b.push_back(i); + } - dr::mp::broadcasted_vector allocated_b; - allocated_b.broadcast_data(m_row.shape().second, 0, b, dr::mp::default_comm()); + dr::mp::broadcasted_vector allocated_b; + allocated_b.broadcast_data(m_row.shape().second, 0, b, + dr::mp::default_comm()); - fmt::print("started initial gemv distribution\n"); - gemv(0, res, m_eq, allocated_b); // it is here to prepare sycl for work + fmt::print("started initial gemv distribution\n"); + gemv(0, res, m_eq, allocated_b); // it is here to prepare sycl for work - fmt::print("finished initial gemv distribution\n"); - for (auto i = 0; i < N; i++) { - auto begin = std::chrono::high_resolution_clock::now(); - gemv(0, res, m_eq, allocated_b); - auto end = std::chrono::high_resolution_clock::now(); - double duration = std::chrono::duration(end - begin).count() * 1000; - eq_duration.push_back(duration); - } - - gemv(0, res, m_row, allocated_b); // it is here to prepare sycl for work - for (auto i = 0; i < N; i++) { - auto begin = std::chrono::high_resolution_clock::now(); - gemv(0, res, m_row, allocated_b); - auto end = 
std::chrono::high_resolution_clock::now(); - double duration = std::chrono::duration(end - begin).count() * 1000; - row_duration.push_back(duration); - } + fmt::print("finished initial gemv distribution\n"); + for (auto i = 0; i < N; i++) { + auto begin = std::chrono::high_resolution_clock::now(); + gemv(0, res, m_eq, allocated_b); + auto end = std::chrono::high_resolution_clock::now(); + double duration = std::chrono::duration(end - begin).count() * 1000; + eq_duration.push_back(duration); + } - if (root == dr::mp::default_comm().rank()) { - std::string tmp; - filenamestream >> tmp; - std::filesystem::path p(argv[1]); - p += tmp; - p += ".csv"; - std::ofstream write_stream(p.string()); - write_stream << eq_duration.front(); - for (auto i = 1; i < N; i++) { - write_stream << "," << eq_duration[i]; - } - write_stream << "\n"; - write_stream << row_duration.front(); - for (auto i = 1; i < N; i++) { - write_stream << "," << row_duration[i]; - } - write_stream << "\n"; + gemv(0, res, m_row, allocated_b); // it is here to prepare sycl for work + for (auto i = 0; i < N; i++) { + auto begin = std::chrono::high_resolution_clock::now(); + gemv(0, res, m_row, allocated_b); + auto end = std::chrono::high_resolution_clock::now(); + double duration = std::chrono::duration(end - begin).count() * 1000; + row_duration.push_back(duration); + } + + if (root == dr::mp::default_comm().rank()) { + std::string tmp; + filenamestream >> tmp; + std::filesystem::path p(argv[1]); + p += tmp; + p += ".csv"; + std::ofstream write_stream(p.string()); + write_stream << eq_duration.front(); + for (auto i = 1; i < N; i++) { + write_stream << "," << eq_duration[i]; } + write_stream << "\n"; + write_stream << row_duration.front(); + for (auto i = 1; i < N; i++) { + write_stream << "," << row_duration[i]; + } + write_stream << "\n"; + } allocated_b.destroy_data(); mp::finalize(); -} \ No newline at end of file +} diff --git a/examples/mp/sparse_matrix.cpp b/examples/mp/sparse_matrix.cpp index 
ac22b53008..ae1511280c 100644 --- a/examples/mp/sparse_matrix.cpp +++ b/examples/mp/sparse_matrix.cpp @@ -24,10 +24,7 @@ int main(int argc, char **argv) { dr::views::csr_matrix_view local_data; auto root = 0; if (root == dr::mp::default_comm().rank()) { - std::size_t m = 1000; - std::size_t k = 10; - local_data = dr::generate_random_csr({m, k}, 0.1f); - // local_data = dr::read_csr(fname); + local_data = dr::read_csr(fname); } { mp::distributed_sparse_matrix< @@ -44,10 +41,10 @@ int main(int argc, char **argv) { for (int i = 0; i < a.size(); i++) { a[i] = i; } - dr::mp::broadcasted_vector allocated_a; - allocated_a.broadcast_data(m_row.shape().second, 0, a, dr::mp::default_comm()); + allocated_a.broadcast_data(m_row.shape().second, 0, a, + dr::mp::default_comm()); m.fence(); double total_time = 0; auto N = 1; @@ -98,11 +95,12 @@ int main(int argc, char **argv) { } for (int i = 0; i < m.shape().first; i++) { if (res_row[i] != ref[i]) { - fmt::print("mismatching outcome row {} {} {}\n", i, res_row[i], ref[i]); + fmt::print("mismatching outcome row {} {} {}\n", i, res_row[i], + ref[i]); } } } - allocated_a.destroy_data(); + allocated_a.destroy_data(); } if (root == dr::mp::default_comm().rank()) { diff --git a/examples/mp/sparse_matrix_matrix_mul.cpp b/examples/mp/sparse_matrix_matrix_mul.cpp index 9d6fd1ebb0..4fd7c139c8 100644 --- a/examples/mp/sparse_matrix_matrix_mul.cpp +++ b/examples/mp/sparse_matrix_matrix_mul.cpp @@ -24,10 +24,7 @@ int main(int argc, char **argv) { dr::views::csr_matrix_view local_data; auto root = 0; if (root == dr::mp::default_comm().rank()) { - std::size_t m = 10; - std::size_t k = 1000; - local_data = dr::generate_random_csr({m, k}, 0.1f); - // local_data = dr::read_csr(fname); + local_data = dr::read_csr(fname); } { mp::distributed_sparse_matrix< @@ -45,14 +42,14 @@ int main(int argc, char **argv) { std::vector res_row(m.shape().first * width); std::vector base_a(m.shape().second * width); for (int j = 0; j < width; j++) { - for (int i = 
0; i < m.shape().second; i++) { - base_a[i + j * m.shape().second] = i*j + 1; - } + for (int i = 0; i < m.shape().second; i++) { + base_a[i + j * m.shape().second] = i * j + 1; + } } - dr::mp::broadcasted_slim_matrix allocated_a; - allocated_a.broadcast_data(m_row.shape().second, width, 0, base_a, dr::mp::default_comm()); + allocated_a.broadcast_data(m_row.shape().second, width, 0, base_a, + dr::mp::default_comm()); m.fence(); double total_time = 0; auto N = 1; @@ -94,7 +91,7 @@ int main(int argc, char **argv) { auto [index, val] = a; auto [m, n] = index; for (int i = 0; i < width; i++) { - ref[m + i * res_col_len] += base_a[n + i * in_len] * val; + ref[m + i * res_col_len] += base_a[n + i * in_len] * val; } } for (int i = 0; i < m.shape().first * width; i++) { @@ -104,11 +101,12 @@ int main(int argc, char **argv) { } for (int i = 0; i < m.shape().first * width; i++) { if (res_row[i] != ref[i]) { - fmt::print("mismatching outcome row {} {} {}\n", i, res_row[i], ref[i]); + fmt::print("mismatching outcome row {} {} {}\n", i, res_row[i], + ref[i]); } } } - allocated_a.destroy_data(); + allocated_a.destroy_data(); } if (root == dr::mp::default_comm().rank()) { diff --git a/include/dr/detail/communicator.hpp b/include/dr/detail/communicator.hpp index a830ba09cb..42bbe9c1a8 100644 --- a/include/dr/detail/communicator.hpp +++ b/include/dr/detail/communicator.hpp @@ -62,7 +62,8 @@ class communicator { } template - void gather_typed(const T *src, T *dst, std::size_t count, std::size_t root) const { + void gather_typed(const T *src, T *dst, std::size_t count, + std::size_t root) const { MPI_Gather_c(src, count * sizeof(T), MPI_BYTE, dst, count * sizeof(T), MPI_BYTE, root, mpi_comm_); } @@ -114,7 +115,7 @@ class communicator { void gatherv(const void *src, long long *counts, long *offsets, void *dst, std::size_t root) const { MPI_Gatherv_c(src, counts[rank()], MPI_BYTE, dst, counts, offsets, MPI_BYTE, - root, mpi_comm_); + root, mpi_comm_); } // pointer with explicit tag diff 
--git a/include/dr/detail/generate_csr.hpp b/include/dr/detail/generate_csr.hpp index 116d3380a9..2917028b6b 100644 --- a/include/dr/detail/generate_csr.hpp +++ b/include/dr/detail/generate_csr.hpp @@ -6,9 +6,9 @@ #include #include -#include -#include #include +#include +#include namespace dr { @@ -26,15 +26,14 @@ template using uniform_distribution_t = typename uniform_distribution::type; struct pair_hash { - template - inline std::size_t operator()(const std::pair & v) const { - return v.first*31+v.second; - } + template + inline std::size_t operator()(const std::pair &v) const { + return v.first * 31 + v.second; + } }; } // namespace - template auto generate_random_csr(dr::index shape, double density = 0.01, unsigned int seed = 0) { @@ -42,7 +41,7 @@ auto generate_random_csr(dr::index shape, double density = 0.01, assert(density >= 0.0 && density < 1.0); std::unordered_set, pair_hash> tuples{}; - std::vector, T>> entries; + std::vector, T>> entries; std::size_t nnz = density * shape[0] * shape[1]; entries.reserve(nnz); @@ -102,8 +101,10 @@ auto generate_random_csr(dr::index shape, double density = 0.01, template auto generate_band_csr(I size, std::size_t up_band = 3, - std::size_t down_band = 3) { - std::size_t nnz = (1 + up_band + down_band) * size - (up_band * (up_band + 1) / 2) - (down_band * (down_band + 1) / 2); + std::size_t down_band = 3) { + std::size_t nnz = (1 + up_band + down_band) * size - + (up_band * (up_band + 1) / 2) - + (down_band * (down_band + 1) / 2); T *values = new T[nnz]; I *rowptr = new I[size + 1]; @@ -114,32 +115,35 @@ auto generate_band_csr(I size, std::size_t up_band = 3, std::size_t r = 0; std::size_t c = 0; for (auto i = 0; i < size; i++) { - for (auto j = std::max(static_cast(i) - static_cast(down_band), static_cast(0)); j < i ; j++) { - values[c] = 1; - colind[c] = static_cast(j); - c++; + for (auto j = std::max(static_cast(i) - + static_cast(down_band), + static_cast(0)); + j < i; j++) { + values[c] = 1; + colind[c] = 
static_cast(j); + c++; } values[c] = 1; colind[c] = i; c++; - for (auto j = i + 1; j <= i + up_band ; j++) { - if (j >= size) { - continue; - } - values[c] = 1; - colind[c] = j; - c++; + for (auto j = i + 1; j <= i + up_band; j++) { + if (j >= size) { + continue; + } + values[c] = 1; + colind[c] = j; + c++; } rowptr[r + 1] = c; r++; - } for (; r < size; r++) { rowptr[r + 1] = nnz; } - return dr::views::csr_matrix_view(values, rowptr, colind, {size, size}, nnz, 0); + return dr::views::csr_matrix_view(values, rowptr, colind, {size, size}, + nnz, 0); } } // namespace dr diff --git a/include/dr/detail/matrix_io.hpp b/include/dr/detail/matrix_io.hpp index b86909c18f..1a845008b1 100644 --- a/include/dr/detail/matrix_io.hpp +++ b/include/dr/detail/matrix_io.hpp @@ -44,20 +44,20 @@ auto convert_to_csr(Tuples &&tuples, dr::index<> shape, std::size_t nnz, std::size_t r = 0; std::size_t c = 0; for (auto iter = tuples.begin(); iter != tuples.end(); ++iter) { - auto &&[index, value] = *iter; + auto &&[index, value] = *iter; - auto &&[i, j] = index; + auto &&[i, j] = index; - values[c] = value; - colind[c] = j; + values[c] = value; + colind[c] = j; - while (r < i) { - assert(r + 1 <= shape[0]); - // throw std::runtime_error("csr_matrix_impl_: given invalid matrix"); - rowptr[r + 1] = c; - r++; - } - c++; + while (r < i) { + assert(r + 1 <= shape[0]); + // throw std::runtime_error("csr_matrix_impl_: given invalid matrix"); + rowptr[r + 1] = c; + r++; + } + c++; assert(c <= nnz); // throw std::runtime_error("csr_matrix_impl_: given invalid matrix"); @@ -68,12 +68,12 @@ auto convert_to_csr(Tuples &&tuples, dr::index<> shape, std::size_t nnz, } return dr::views::csr_matrix_view(values, rowptr, colind, - dr::index(shape[0], shape[1]), nnz, 0); + dr::index(shape[0], shape[1]), nnz, 0); } template -auto convert_csr_base_to_csr(Tuples &&csr_matrix, dr::index<> shape, std::size_t nnz, - Allocator &&allocator) { +auto convert_csr_base_to_csr(Tuples &&csr_matrix, dr::index<> shape, + 
std::size_t nnz, Allocator &&allocator) { auto &&[v, j] = *csr_matrix.begin()->begin(); using T = std::remove_reference_t; @@ -118,7 +118,7 @@ auto convert_csr_base_to_csr(Tuples &&csr_matrix, dr::index<> shape, std::size_t /// a coo_matrix data structure with its contents. template inline csr_matrix_base read_csr_matrix_base(std::string file_path, - bool one_indexed = true) { + bool one_indexed = true) { using size_type = std::size_t; std::ifstream f; @@ -249,7 +249,8 @@ auto read_csr(std::string file_path, bool one_indexed = true) { auto m = __detail::read_csr_matrix_base(file_path, one_indexed); auto shape = m.shape(); auto nnz = m.size(); - auto t = __detail::convert_csr_base_to_csr(m, shape, nnz, std::allocator{}); + auto t = + __detail::convert_csr_base_to_csr(m, shape, nnz, std::allocator{}); return t; } diff --git a/include/dr/mp.hpp b/include/dr/mp.hpp index 52c0fb8e5a..35c98eb8f7 100644 --- a/include/dr/mp.hpp +++ b/include/dr/mp.hpp @@ -87,3 +87,5 @@ #include #include #include +#include +#include diff --git a/include/dr/mp/algorithms/matrix/gemv.hpp b/include/dr/mp/algorithms/matrix/gemv.hpp index a709bfeaa6..14c61b41d2 100644 --- a/include/dr/mp/algorithms/matrix/gemv.hpp +++ b/include/dr/mp/algorithms/matrix/gemv.hpp @@ -4,13 +4,12 @@ #pragma once #include +#include +#include #include #include #include #include -#include -#include - namespace dr::mp { @@ -18,7 +17,8 @@ template C, typename Alloc, typename Backend, typename MatDistr> requires(vector_multiplicable) void gemv(int root, C &res, - distributed_sparse_matrix &a, broadcasted_vector b) { + distributed_sparse_matrix &a, + broadcasted_vector b) { if (default_comm().rank() == root) { assert(a.shape().first == res.size()); assert(a.shape().second == b.size()); @@ -30,7 +30,8 @@ template C, typename Alloc, typename Backend, typename MatDistr> requires(vector_multiplicable) void gemv(int root, C &res, - distributed_sparse_matrix &a, broadcasted_slim_matrix b) { + distributed_sparse_matrix &a, + 
broadcasted_slim_matrix b) { if (default_comm().rank() == root) { assert(a.shape().first * b.width() == res.size()); } @@ -40,7 +41,8 @@ void gemv(int root, C &res, template C, typename Alloc, typename Backend, typename MatDistr> requires(vector_multiplicable) -void gemv(C &res, distributed_sparse_matrix &a, broadcasted_vector b) { +void gemv(C &res, distributed_sparse_matrix &a, + broadcasted_vector b) { std::vector workspace(res.size()); gemv(0, workspace, a, b); auto tmp = new T[res.size()]; @@ -55,7 +57,8 @@ void gemv(C &res, distributed_sparse_matrix &a, broadca template C, typename Alloc, typename Backend, typename MatDistr> requires(vector_multiplicable) -void gemv(C &res, distributed_sparse_matrix &a, broadcasted_slim_matrix b) { +void gemv(C &res, distributed_sparse_matrix &a, + broadcasted_slim_matrix b) { std::vector workspace(res.size()); gemv(0, workspace, a, b); auto tmp = new T[res.size()]; diff --git a/include/dr/mp/containers/broadcasted_slim_matrix.hpp b/include/dr/mp/containers/broadcasted_slim_matrix.hpp index 66a636ad7c..e59ec10391 100644 --- a/include/dr/mp/containers/broadcasted_slim_matrix.hpp +++ b/include/dr/mp/containers/broadcasted_slim_matrix.hpp @@ -6,83 +6,78 @@ namespace dr::mp { - template > class broadcasted_slim_matrix { - public: - broadcasted_slim_matrix() = default; +public: + broadcasted_slim_matrix() = default; - void broadcast_data(std::size_t height, std::size_t width, std::size_t root, T** root_data, dr::communicator comm) { - if (_data != nullptr) { - destroy_data(); - } - _data_size = height * width; - _height = height; - _width = width; - _data = alloc.allocate(_data_size); - if (comm.rank() == root) { - for (auto i = 0; i < width; i++) { - if (use_sycl()) { - __detail::sycl_copy(root_data[i], root_data[i] + height, _data + height * i); - } - else { - rng::copy(root_data[i], root_data[i] + height, _data + height * i); - } - } - } - comm.bcast(_data, sizeof(T) * _data_size, root); + void broadcast_data(std::size_t 
height, std::size_t width, std::size_t root, + T **root_data, dr::communicator comm) { + if (_data != nullptr) { + destroy_data(); } - - template - void broadcast_data(std::size_t height, std::size_t width, std::size_t root, R root_data, dr::communicator comm) { - if (_data != nullptr) { - destroy_data(); - } - _data_size = height * width; - _height = height; - _width = width; - _data = alloc.allocate(_data_size); - if (comm.rank() == root) { - if (use_sycl()) { - __detail::sycl_copy(std::to_address(root_data.begin()), std::to_address(root_data.end()), _data); - } - else { - rng::copy(root_data.begin(), root_data.end(), _data); - } + _data_size = height * width; + _height = height; + _width = width; + _data = alloc.allocate(_data_size); + if (comm.rank() == root) { + for (auto i = 0; i < width; i++) { + if (use_sycl()) { + __detail::sycl_copy(root_data[i], root_data[i] + height, + _data + height * i); + } else { + rng::copy(root_data[i], root_data[i] + height, _data + height * i); } - auto position = 0; - auto reminder = sizeof(T) * _data_size; - while (reminder > INT_MAX) { - comm.bcast(((uint8_t*)_data) + position, INT_MAX, root); - position += INT_MAX; - reminder -= INT_MAX; - } - comm.bcast(((uint8_t*)_data) + position, reminder, root); - - } - - void destroy_data() { - alloc.deallocate(_data, _data_size); - _data_size = 0; - _data = nullptr; + } } + comm.bcast(_data, sizeof(T) * _data_size, root); + } - T* operator[](std::size_t index) { - return _data + _height * index; + template + void broadcast_data(std::size_t height, std::size_t width, std::size_t root, + R root_data, dr::communicator comm) { + if (_data != nullptr) { + destroy_data(); } - - T* broadcasted_data() { - return _data; + _data_size = height * width; + _height = height; + _width = width; + _data = alloc.allocate(_data_size); + if (comm.rank() == root) { + if (use_sycl()) { + __detail::sycl_copy(std::to_address(root_data.begin()), + std::to_address(root_data.end()), _data); + } else { + 
rng::copy(root_data.begin(), root_data.end(), _data); + } } - auto width() { - return _width; + auto position = 0; + auto reminder = sizeof(T) * _data_size; + while (reminder > INT_MAX) { + comm.bcast(((uint8_t *)_data) + position, INT_MAX, root); + position += INT_MAX; + reminder -= INT_MAX; } - private: - T* _data = nullptr; - std::size_t _data_size = 0; - std::size_t _width = 0; - std::size_t _height = 0; + comm.bcast(((uint8_t *)_data) + position, reminder, root); + } + + void destroy_data() { + alloc.deallocate(_data, _data_size); + _data_size = 0; + _data = nullptr; + } + + T *operator[](std::size_t index) { return _data + _height * index; } + + T *broadcasted_data() { return _data; } + auto width() { return _width; } + +private: + T *_data = nullptr; + std::size_t _data_size = 0; + std::size_t _width = 0; + std::size_t _height = 0; - Allocator alloc; + Allocator alloc; }; -} \ No newline at end of file +} // namespace dr::mp diff --git a/include/dr/mp/containers/broadcasted_vector.hpp b/include/dr/mp/containers/broadcasted_vector.hpp index a06ddf2e92..6058067563 100644 --- a/include/dr/mp/containers/broadcasted_vector.hpp +++ b/include/dr/mp/containers/broadcasted_vector.hpp @@ -6,54 +6,48 @@ namespace dr::mp { - template > class broadcasted_vector { - public: - broadcasted_vector() = default; - - template - void broadcast_data(std::size_t data_size, std::size_t root, R root_data, dr::communicator comm) { - if (_data != nullptr) { - destroy_data(); - } - _data_size = data_size; - _data = alloc.allocate(_data_size); - if (comm.rank() == root) { - if (use_sycl()) { - __detail::sycl_copy(std::to_address(root_data.begin()), std::to_address(root_data.end()), _data); - } - else { - rng::copy(root_data.begin(), root_data.end(), _data); - } - } - comm.bcast(_data, sizeof(T) * _data_size, root); - } - - void destroy_data() { - alloc.deallocate(_data, _data_size); - _data_size = 0; - _data = nullptr; - } +public: + broadcasted_vector() = default; - T& 
operator[](std::size_t index) { - return _data[index]; + template + void broadcast_data(std::size_t data_size, std::size_t root, R root_data, + dr::communicator comm) { + if (_data != nullptr) { + destroy_data(); } - - T* broadcasted_data() { - return _data; + _data_size = data_size; + _data = alloc.allocate(_data_size); + if (comm.rank() == root) { + if (use_sycl()) { + __detail::sycl_copy(std::to_address(root_data.begin()), + std::to_address(root_data.end()), _data); + } else { + rng::copy(root_data.begin(), root_data.end(), _data); + } } + comm.bcast(_data, sizeof(T) * _data_size, root); + } - auto size() { - return _data_size; - } - - auto begin() const { return _data; } - auto end() const { return begin() + _data_size; } - - private: - T* _data = nullptr; - std::size_t _data_size = 0; - Allocator alloc; + void destroy_data() { + alloc.deallocate(_data, _data_size); + _data_size = 0; + _data = nullptr; + } + + T &operator[](std::size_t index) { return _data[index]; } + + T *broadcasted_data() { return _data; } + + auto size() { return _data_size; } + + auto begin() const { return _data; } + auto end() const { return begin() + _data_size; } + +private: + T *_data = nullptr; + std::size_t _data_size = 0; + Allocator alloc; }; -} \ No newline at end of file +} // namespace dr::mp diff --git a/include/dr/mp/containers/distributed_sparse_matrix.hpp b/include/dr/mp/containers/distributed_sparse_matrix.hpp index ffc208924d..f03a5d09e6 100644 --- a/include/dr/mp/containers/distributed_sparse_matrix.hpp +++ b/include/dr/mp/containers/distributed_sparse_matrix.hpp @@ -4,7 +4,6 @@ #pragma once #include #include -#include #include namespace dr::mp { @@ -22,9 +21,10 @@ concept matrix_distibution = requires(T t, std::vector res, int *input) { }; template -concept vector_multiplicable = requires(T t, std::vector res, T::elem_type *input) { - t.local_gemv_and_collect(int(), res, input, 1); -}; +concept vector_multiplicable = + requires(T t, std::vector res, T::elem_type 
*input) { + t.local_gemv_and_collect(int(), res, input, 1); + }; template > @@ -152,7 +152,8 @@ class distributed_sparse_matrix { template requires(vector_multiplicable) - auto local_gemv_and_collect(std::size_t root, C &res, T* vals, std::size_t val_width) const { + auto local_gemv_and_collect(std::size_t root, C &res, T *vals, + std::size_t val_width) const { distribution_.local_gemv_and_collect(root, res, vals, val_width); } diff --git a/include/dr/mp/containers/distributed_vector.hpp b/include/dr/mp/containers/distributed_vector.hpp index 19b91b3510..d63c4c084f 100644 --- a/include/dr/mp/containers/distributed_vector.hpp +++ b/include/dr/mp/containers/distributed_vector.hpp @@ -64,18 +64,17 @@ class MpiBackend { #if (MPI_VERSION >= 4) || \ (defined(I_MPI_NUMVERSION) && (I_MPI_NUMVERSION > 20211200000)) if (mp::use_sycl()) { - // 32-bit API inside for sycl based buffers + // 32-bit API inside for sycl based buffers for (std::size_t remainder = datalen, off = 0UL; remainder > 0;) { - std::size_t s = std::min(remainder, (std::size_t)INT_MAX); - DRLOG("{}:{} win_.put {} bytes at off {}, dst offset {}", - default_comm().rank(), __LINE__, s, off, offset + off); - win_.put((uint8_t *)src + off, s, segment_index, offset + off); - off += s; - remainder -= s; - } - } - else { - // 64-bit API inside + std::size_t s = std::min(remainder, (std::size_t)INT_MAX); + DRLOG("{}:{} win_.put {} bytes at off {}, dst offset {}", + default_comm().rank(), __LINE__, s, off, offset + off); + win_.put((uint8_t *)src + off, s, segment_index, offset + off); + off += s; + remainder -= s; + } + } else { + // 64-bit API inside win_.put(src, datalen, segment_index, offset); } #else diff --git a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp index 817023ea4f..132cdbc154 100644 --- a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp +++ 
b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp @@ -20,7 +20,6 @@ class csr_eq_distribution { csr_eq_distribution &operator=(const csr_eq_distribution &) = delete; csr_eq_distribution(csr_eq_distribution &&) { assert(false); } - /// Constructor csr_eq_distribution(dr::views::csr_matrix_view csr_view, distribution dist = distribution(), std::size_t root = 0) { @@ -33,8 +32,6 @@ class csr_eq_distribution { if (rows_data_ != nullptr) { rows_backend_.deallocate(rows_data_, row_size_ * sizeof(index_type)); } - - // delete halo_; TODO } } std::size_t get_id_in_segment(std::size_t offset) const { @@ -48,7 +45,8 @@ class csr_eq_distribution { auto shape() const { return shape_; } void fence() const { rows_backend_.fence(); } - template auto local_gemv(C &res, T* vals, std::size_t vals_width) const { + template + auto local_gemv(C &res, T *vals, std::size_t vals_width) const { auto rank = rows_backend_.getrank(); if (nnz_ <= segment_size_ * rank) { return; @@ -66,50 +64,46 @@ class csr_eq_distribution { std::min(nnz_ - rank * segment_size_, segment_size_); auto local_data = rows_data_; auto division = real_segment_size / 50; - auto one_computation_size = - (real_segment_size + division - 1) / division; + auto one_computation_size = (real_segment_size + division - 1) / division; auto row_size = row_size_; - // fmt::print("{} {} {}\n", division, real_segment_size / 100, max_row_size_ * 10); - // auto begin = std::chrono::high_resolution_clock::now(); - dr::__detail::parallel_for_workaround(dr::mp::sycl_queue(), sycl::range<1>{division}, - [=](auto idx) { - std::size_t lower_bound = one_computation_size * idx; - std::size_t upper_bound = - std::min(one_computation_size * (idx + 1), real_segment_size); - std::size_t position = lower_bound + offset; - std::size_t first_row = rng::distance( - local_data, std::upper_bound( - local_data, local_data + row_size, position) - - 1); - for (auto j = 0; j < vals_width; j++) { - auto row = first_row; - T sum = 0; - - for 
(auto i = lower_bound; i < upper_bound; i++) { - while (row + 1 < row_size && - local_data[row + 1] <= offset + i) { - sycl::atomic_ref - c_ref(res[row + j * res_col_len]); - c_ref += sum; - row++; - sum = 0; - } - auto colNum = localCols[i] + j * vals_len; - auto matrixVal = vals[colNum]; - auto vectorVal = localVals[i]; + dr::__detail::parallel_for_workaround( + dr::mp::sycl_queue(), sycl::range<1>{division}, + [=](auto idx) { + std::size_t lower_bound = one_computation_size * idx; + std::size_t upper_bound = + std::min(one_computation_size * (idx + 1), real_segment_size); + std::size_t position = lower_bound + offset; + std::size_t first_row = rng::distance( + local_data, + std::upper_bound(local_data, local_data + row_size, position) - + 1); + for (auto j = 0; j < vals_width; j++) { + auto row = first_row; + T sum = 0; - sum += matrixVal * vectorVal; + for (auto i = lower_bound; i < upper_bound; i++) { + while (row + 1 < row_size && + local_data[row + 1] <= offset + i) { + sycl::atomic_ref + c_ref(res[row + j * res_col_len]); + c_ref += sum; + row++; + sum = 0; } - sycl::atomic_ref - c_ref(res[row + j * res_col_len]); - c_ref += sum; + auto colNum = localCols[i] + j * vals_len; + auto matrixVal = vals[colNum]; + auto vectorVal = localVals[i]; + + sum += matrixVal * vectorVal; } - }).wait(); - // auto end = std::chrono::high_resolution_clock::now(); - // double duration = std::chrono::duration(end - begin).count() * 1000; - // fmt::print("timeDuration eq: {} {} {} {}\n", duration, size, real_segment_size * vals_width, rank); + sycl::atomic_ref + c_ref(res[row + j * res_col_len]); + c_ref += sum; + } + }) + .wait(); } else { auto row_i = -1; auto position = segment_size_ * rank; @@ -124,44 +118,31 @@ class csr_eq_distribution { current_row_position = rows_data_[row_i + 1]; } for (int j = 0; j < vals_width; j++) { - res[row_i + j * res_col_len] += local_vals[i] * vals[local_cols[i] + j * vals_len]; + res[row_i + j * res_col_len] += + local_vals[i] * 
vals[local_cols[i] + j * vals_len]; } } - - // fmt::print("offset, rank {} {}\n", row_offsets_[ - // rows_backend_.getrank()], rows_backend_.getrank()); for (int i = 0; i - // < size; i++) { - // fmt::print("ledata, rank, i {} {} {}\n", res[i], - // rows_backend_.getrank(), i); - // } } } template - auto local_gemv_and_collect(std::size_t root, C &res, T* vals, std::size_t vals_width) const { + auto local_gemv_and_collect(std::size_t root, C &res, T *vals, + std::size_t vals_width) const { assert(res.size() == shape_.first * vals_width); __detail::allocator alloc; - auto res_alloc = alloc.allocate( row_sizes_[default_comm().rank()] * vals_width); + auto res_alloc = + alloc.allocate(row_sizes_[default_comm().rank()] * vals_width); if (use_sycl()) { - sycl_queue().fill(res_alloc, 0, row_sizes_[default_comm().rank()] * vals_width).wait(); - } - else { - std::fill(res_alloc, res_alloc + row_sizes_[default_comm().rank()] * vals_width, 0); + sycl_queue() + .fill(res_alloc, 0, row_sizes_[default_comm().rank()] * vals_width) + .wait(); + } else { + std::fill(res_alloc, + res_alloc + row_sizes_[default_comm().rank()] * vals_width, 0); } - - // auto begin = std::chrono::high_resolution_clock::now(); - local_gemv(res_alloc, vals, vals_width); - // auto end = std::chrono::high_resolution_clock::now(); - // double duration = std::chrono::duration(end - begin).count(); - // auto size = std::min(segment_size_, shape_[0] - segment_size_ * default_comm().rank()); - // fmt::print("rows gemv time {} {} {}\n", duration * 1000, size, default_comm().rank()); - // begin = std::chrono::high_resolution_clock::now(); + local_gemv(res_alloc, vals, vals_width); gather_gemv_vector(root, res, res_alloc, vals_width); - // end = std::chrono::high_resolution_clock::now(); - // duration = std::chrono::duration(end - begin).count(); - // size = std::min(segment_size_, shape_[0] - segment_size_ * default_comm().rank()); - // fmt::print("rows gather time {} {} {}\n", duration * 1000, size, 
default_comm().rank()); fence(); alloc.deallocate(res_alloc, row_sizes_[default_comm().rank()] * vals_width); } @@ -170,37 +151,34 @@ class csr_eq_distribution { friend csr_eq_segment_iterator; template - void gather_gemv_vector(std::size_t root, C &res, A &partial_res, std::size_t vals_width) const { + void gather_gemv_vector(std::size_t root, C &res, A &partial_res, + std::size_t vals_width) const { auto communicator = default_comm(); __detail::allocator alloc; - long long* counts = new long long[communicator.size()]; + long long *counts = new long long[communicator.size()]; for (auto i = 0; i < communicator.size(); i++) { counts[i] = row_sizes_[i] * sizeof(T) * vals_width; } if (communicator.rank() == root) { - long* offsets = new long[communicator.size()]; + long *offsets = new long[communicator.size()]; offsets[0] = 0; for (auto i = 0; i < communicator.size() - 1; i++) { offsets[i + 1] = offsets[i] + counts[i]; } auto gathered_res = alloc.allocate(max_row_size_ * vals_width); communicator.gatherv(partial_res, counts, offsets, gathered_res, root); - // communicator.gather_typed(partial_res, gathered_res, max_row_size_ * vals_width, root); - T* gathered_res_host; - + T *gathered_res_host; + if (use_sycl()) { gathered_res_host = new T[max_row_size_ * vals_width]; - __detail::sycl_copy(gathered_res, gathered_res_host, max_row_size_ * vals_width); - } - else { + __detail::sycl_copy(gathered_res, gathered_res_host, + max_row_size_ * vals_width); + } else { gathered_res_host = gathered_res; } rng::fill(res, 0); - - // auto begin = std::chrono::high_resolution_clock::now(); - for (auto k = 0; k < vals_width; k++) { auto current_offset = 0; for (auto i = 0; i < communicator.size(); i++) { @@ -208,23 +186,20 @@ class csr_eq_distribution { auto last_row = row_offsets_[i] + row_sizes_[i]; auto row_size = row_sizes_[i]; for (auto j = first_row; j < last_row; j++) { - res[j + k * shape_[0]] += gathered_res_host[vals_width * current_offset + k * row_size + j - first_row]; + 
res[j + k * shape_[0]] += + gathered_res_host[vals_width * current_offset + k * row_size + + j - first_row]; } current_offset += row_sizes_[i]; } } - // auto end = std::chrono::high_resolution_clock::now(); - // double duration = std::chrono::duration(end - begin).count(); - // fmt::print("gather time {}\n", duration); if (use_sycl()) { delete[] gathered_res_host; } delete[] offsets; alloc.deallocate(gathered_res, max_row_size_ * vals_width); } else { - // communicator.gather_typed(partial_res, static_cast(nullptr), max_row_size_ * vals_width, - // root); communicator.gatherv(partial_res, counts, nullptr, nullptr, root); } delete[] counts; @@ -322,15 +297,6 @@ class csr_eq_distribution { std::min(segment_size_, nnz_ - i), segment_size_); } fence(); - // for (int i = 0; i < row_size_; i++) { - // fmt::print("row, i, rank {} {} {}\n", rows_data_[i], i, rank); - // } - // fence(); - // for (int i = 0; i < vals_data_->segments()[rank].size(); i++) { - // fmt::print("val, col, i, rank {} {} {} {}\n", - // vals_data_->segments()[rank][i], cols_data_->segments()[rank][i],i, - // rank); - // } } std::size_t segment_size_ = 0; diff --git a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp index c6cb46955b..8dd21a65e9 100644 --- a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp @@ -21,7 +21,6 @@ class csr_row_distribution { csr_row_distribution &operator=(const csr_row_distribution &) = delete; csr_row_distribution(csr_row_distribution &&) { assert(false); } - /// Constructor csr_row_distribution(dr::views::csr_matrix_view csr_view, distribution dist = distribution(), std::size_t root = 0) { @@ -35,8 +34,6 @@ class csr_row_distribution { vals_backend_.deallocate(vals_data_, vals_size_ * sizeof(index_type)); cols_backend_.deallocate(cols_data_, vals_size_ * sizeof(index_type)); } - - // delete halo_; TODO } } 
std::size_t get_id_in_segment(std::size_t offset) const { @@ -58,7 +55,8 @@ class csr_row_distribution { vals_backend_.fence(); cols_backend_.fence(); } - template auto local_gemv(C &res, T* vals, std::size_t vals_width) const { + template + auto local_gemv(C &res, T *vals, std::size_t vals_width) const { auto rank = cols_backend_.getrank(); if (shape_[0] <= segment_size_ * rank) return; @@ -74,43 +72,43 @@ class csr_row_distribution { auto res_col_len = segment_size_; std::size_t wg = 32; while (vals_width * size * wg > INT_MAX) { - // this check is necessary, because sycl does not permit ranges exceeding integer limit + // this check is necessary, because sycl does not permit ranges + // exceeding integer limit wg /= 2; } assert(wg > 0); - // auto begin = std::chrono::high_resolution_clock::now(); - dr::mp::sycl_queue().submit([&](auto &&h) { - h.parallel_for(sycl::nd_range<1>(vals_width * size * wg, wg), [=](auto item) { - auto input_j = item.get_group(0) / size; - auto idx = item.get_group(0) % size; - auto local_id = item.get_local_id(); - auto group_size = item.get_local_range(0); - std::size_t lower_bound = 0; - if (rows_data[idx] > offset) { - lower_bound = rows_data[idx] - offset; - } - std::size_t upper_bound = real_segment_size; - if (idx < size - 1) { - upper_bound = rows_data[idx + 1] - offset; - } - T sum = 0; - for (auto i = lower_bound + local_id; i < upper_bound; i += group_size) { - auto colNum = local_cols[i]; - auto matrixVal = vals[colNum + input_j * vals_len]; - auto vectorVal = local_vals[i]; - sum += matrixVal * vectorVal; - } - - sycl::atomic_ref - c_ref(res[idx + input_j * res_col_len]); - c_ref += sum; - }); + dr::mp::sycl_queue() + .submit([&](auto &&h) { + h.parallel_for( + sycl::nd_range<1>(vals_width * size * wg, wg), [=](auto item) { + auto input_j = item.get_group(0) / size; + auto idx = item.get_group(0) % size; + auto local_id = item.get_local_id(); + auto group_size = item.get_local_range(0); + std::size_t lower_bound = 0; + if 
(rows_data[idx] > offset) { + lower_bound = rows_data[idx] - offset; + } + std::size_t upper_bound = real_segment_size; + if (idx < size - 1) { + upper_bound = rows_data[idx + 1] - offset; + } + T sum = 0; + for (auto i = lower_bound + local_id; i < upper_bound; + i += group_size) { + auto colNum = local_cols[i]; + auto matrixVal = vals[colNum + input_j * vals_len]; + auto vectorVal = local_vals[i]; + sum += matrixVal * vectorVal; + } - }).wait(); - // auto end = std::chrono::high_resolution_clock::now(); - // double duration = std::chrono::duration(end - begin).count() * 1000; - // fmt::print("timeDuration b: {} {} {}\n", duration, size, real_segment_size * vals_width); + sycl::atomic_ref + c_ref(res[idx + input_j * res_col_len]); + c_ref += sum; + }); + }) + .wait(); } else { auto local_rows = dr::mp::local_segment(*rows_data_); auto val_count = val_sizes_[rank]; @@ -124,30 +122,26 @@ class csr_row_distribution { current_row_position = local_rows[row_i + 1]; } for (auto j = 0; j < vals_width; j++) { - res[row_i + j * segment_size_] += vals_data_[i] * vals[cols_data_[i] + j * vals_len]; + res[row_i + j * segment_size_] += + vals_data_[i] * vals[cols_data_[i] + j * vals_len]; } } } } template - auto local_gemv_and_collect(std::size_t root, C &res, T* &vals, std::size_t vals_width) const { + auto local_gemv_and_collect(std::size_t root, C &res, T *&vals, + std::size_t vals_width) const { assert(res.size() == shape_.first * vals_width); __detail::allocator alloc; auto res_alloc = alloc.allocate(segment_size_ * vals_width); if (use_sycl()) { sycl_queue().fill(res_alloc, 0, segment_size_ * vals_width).wait(); - } - else { + } else { std::fill(res_alloc, res_alloc + segment_size_ * vals_width, 0); } - - // auto begin = std::chrono::high_resolution_clock::now(); + local_gemv(res_alloc, vals, vals_width); - // auto end = std::chrono::high_resolution_clock::now(); - // double duration = std::chrono::duration(end - begin).count(); - // auto size = std::min(segment_size_, 
shape_[0] - segment_size_ * default_comm().rank()); - // fmt::print("rows gemv time {} {} {}\n", duration * 1000, size, default_comm().rank()); gather_gemv_vector(root, res, res_alloc, vals_width); fence(); @@ -158,14 +152,17 @@ class csr_row_distribution { friend csr_row_segment_iterator; template - void gather_gemv_vector(std::size_t root, C &res, A &partial_res, std::size_t vals_width) const { + void gather_gemv_vector(std::size_t root, C &res, A &partial_res, + std::size_t vals_width) const { auto communicator = default_comm(); __detail::allocator alloc; if (communicator.rank() == root) { - auto scratch = alloc.allocate(segment_size_ * communicator.size() * vals_width); - communicator.gather_typed(partial_res, scratch, segment_size_ * vals_width, root); - T* temp = nullptr; + auto scratch = + alloc.allocate(segment_size_ * communicator.size() * vals_width); + communicator.gather_typed(partial_res, scratch, + segment_size_ * vals_width, root); + T *temp = nullptr; if (use_sycl()) { temp = new T[res.size()]; } @@ -173,16 +170,20 @@ class csr_row_distribution { if (j * segment_size_ >= shape_.first) { break; } - auto comm_segment_size = std::min(segment_size_, shape_.first - j * segment_size_); + auto comm_segment_size = + std::min(segment_size_, shape_.first - j * segment_size_); for (auto i = 0; i < vals_width; i++) { - auto piece_start = scratch + j * vals_width * segment_size_ + i * segment_size_; - + auto piece_start = + scratch + j * vals_width * segment_size_ + i * segment_size_; + if (use_sycl()) { - __detail::sycl_copy(piece_start, temp + shape_.first * i + j * segment_size_, comm_segment_size); - } - else { - std::copy(piece_start, piece_start + comm_segment_size, res.begin() + shape_.first * i + j * segment_size_); + __detail::sycl_copy(piece_start, + temp + shape_.first * i + j * segment_size_, + comm_segment_size); + } else { + std::copy(piece_start, piece_start + comm_segment_size, + res.begin() + shape_.first * i + j * segment_size_); } } } @@ 
-190,16 +191,11 @@ class csr_row_distribution { std::copy(temp, temp + res.size(), res.begin()); delete[] temp; } - // for (auto i = 0; i < segment_size_ * communicator.size() * vals_width; i++) { - // fmt::print("{} {} {}\n", i, scratch[i], segment_size_); - // } - // for (auto i = 0; i < vals_width * shape_.first; i++) { - // fmt::print("{} {} {}\n", i, res[i], segment_size_); - // } - alloc.deallocate(scratch, segment_size_ * communicator.size()* vals_width); + alloc.deallocate(scratch, + segment_size_ * communicator.size() * vals_width); } else { - communicator.gather_typed(partial_res, static_cast(nullptr), segment_size_ * vals_width, - root); + communicator.gather_typed(partial_res, static_cast(nullptr), + segment_size_ * vals_width, root); } } void init(dr::views::csr_matrix_view csr_view, auto dist, @@ -262,8 +258,6 @@ class csr_row_distribution { } delete[] val_information; vals_size_ = std::max(val_sizes_[rank], static_cast(1)); - // fmt::print("dfsa {} {} {} {}\n", vals_size_, - // val_sizes_[rank],lower_limit, rank); cols_data_ = static_cast(cols_backend_.allocate(vals_size_ * sizeof(I))); @@ -287,34 +281,10 @@ class csr_row_distribution { std::size_t segment_index = 0; segment_size_ = rows_data_->segment_size(); for (std::size_t i = 0; i < default_comm().size(); i++) { - // TODO fix segment creation, to include proper sizes, basing on - // val_offsets; segments_.emplace_back( this, segment_index++, val_sizes_[i], std::max(val_sizes_[i], static_cast(1))); } - // if (rank == 0) { - // int ax = 0; - // for (auto x: val_offsets_) { - // fmt::print("{} {}\n", ax++, x); - // } - // for (int i = 0; i < 49; i++) { - // fmt::print("{} {}\n", i, get_segment_from_offset(i)); - // } - // } - // fmt::print(" {} {} {} {}\n",get_segment_from_offset(47), - // get_segment_from_offset(48), get_segment_from_offset(49), - // get_segment_from_offset(50)); for (int i = 0; i < vals_size_; i++) { - // fmt::print("col, val, i, rank {} {} {} {}\n", cols_data_[i], - // 
vals_data_[i], i, rank); - // } - // fence(); - // if (rank < rows_data_->segments().size()) { - // for (int i = 0; i < rows_data_->segments()[rank].size(); i++) { - // fmt::print("row, i, rank {} {} {}\n", - // rows_data_->segments()[rank][i], i, rank); - // } - // } fence(); } @@ -334,6 +304,5 @@ class csr_row_distribution { std::size_t nnz_; std::vector> segments_; std::shared_ptr> rows_data_; - }; } // namespace dr::mp diff --git a/test/gtest/mp/gemv.cpp b/test/gtest/mp/gemv.cpp index 3afb96bbe7..cf397ad8ed 100644 --- a/test/gtest/mp/gemv.cpp +++ b/test/gtest/mp/gemv.cpp @@ -42,7 +42,7 @@ auto testMatrixGemm(std::size_t m, std::size_t n, auto &a, std::size_t width) { for (auto &&[index, v] : a) { auto &&[i, k] = index; - for (auto j = 0; j < width; j++) { + for (auto j = 0; j < width; j++) { c_ref[i + j * m] += v * base_b[k + j * n]; } } @@ -55,10 +55,10 @@ TEST(SparseMatrix, GemvRow) { std::size_t m = 100; std::size_t k = 100; auto csr = dr::generate_random_csr({m, k}, 0.1f); - dr::mp::distributed_sparse_matrix< - float, unsigned long, dr::mp::MpiBackend, - dr::mp::csr_row_distribution> - a(csr, 0); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_row_distribution> + a(csr, 0); testMatrixGemv(m, k, a); } @@ -66,10 +66,10 @@ TEST(SparseMatrix, GemvEq) { std::size_t m = 100; std::size_t k = 100; auto csr = dr::generate_random_csr({m, k}, 0.1f); - dr::mp::distributed_sparse_matrix< - float, unsigned long, dr::mp::MpiBackend, - dr::mp::csr_eq_distribution> - a(csr, 0); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_eq_distribution> + a(csr, 0); testMatrixGemv(m, k, a); } @@ -77,10 +77,10 @@ TEST(SparseMatrix, GemvRowNotSquare) { std::size_t m = 1000; std::size_t k = 10; auto csr = dr::generate_random_csr({m, k}, 0.1f); - dr::mp::distributed_sparse_matrix< - float, unsigned long, dr::mp::MpiBackend, - dr::mp::csr_row_distribution> - a(csr, 0); + 
dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_row_distribution> + a(csr, 0); testMatrixGemv(m, k, a); } @@ -88,10 +88,10 @@ TEST(SparseMatrix, GemvEqNotSquare) { std::size_t m = 1000; std::size_t k = 10; auto csr = dr::generate_random_csr({m, k}, 0.1f); - dr::mp::distributed_sparse_matrix< - float, unsigned long, dr::mp::MpiBackend, - dr::mp::csr_eq_distribution> - a(csr, 0); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_eq_distribution> + a(csr, 0); testMatrixGemv(m, k, a); } @@ -99,10 +99,10 @@ TEST(SparseMatrix, GemvRowNotSquareDifferentAxis) { std::size_t m = 10; std::size_t k = 1000; auto csr = dr::generate_random_csr({m, k}, 0.1f); - dr::mp::distributed_sparse_matrix< - float, unsigned long, dr::mp::MpiBackend, - dr::mp::csr_row_distribution> - a(csr, 0); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_row_distribution> + a(csr, 0); testMatrixGemv(m, k, a); } @@ -110,22 +110,21 @@ TEST(SparseMatrix, GemvEqNotSquareDifferentAxis) { std::size_t m = 10; std::size_t k = 1000; auto csr = dr::generate_random_csr({m, k}, 0.1f); - dr::mp::distributed_sparse_matrix< - float, unsigned long, dr::mp::MpiBackend, - dr::mp::csr_eq_distribution> - a(csr, 0); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_eq_distribution> + a(csr, 0); testMatrixGemv(m, k, a); } - TEST(SparseMatrix, GemmRow) { std::size_t m = 100; std::size_t k = 100; auto csr = dr::generate_random_csr({m, k}, 0.1f); - dr::mp::distributed_sparse_matrix< - float, unsigned long, dr::mp::MpiBackend, - dr::mp::csr_row_distribution> - a(csr, 0); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_row_distribution> + a(csr, 0); testMatrixGemm(m, k, a, 20); } @@ -133,10 +132,10 @@ TEST(SparseMatrix, GemmEq) { std::size_t m = 100; std::size_t k = 100; auto csr = 
dr::generate_random_csr({m, k}, 0.1f); - dr::mp::distributed_sparse_matrix< - float, unsigned long, dr::mp::MpiBackend, - dr::mp::csr_eq_distribution> - a(csr, 0); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_eq_distribution> + a(csr, 0); testMatrixGemm(m, k, a, 20); } @@ -144,10 +143,10 @@ TEST(SparseMatrix, GemmRowNotSquare) { std::size_t m = 1000; std::size_t k = 10; auto csr = dr::generate_random_csr({m, k}, 0.1f); - dr::mp::distributed_sparse_matrix< - float, unsigned long, dr::mp::MpiBackend, - dr::mp::csr_row_distribution> - a(csr, 0); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_row_distribution> + a(csr, 0); testMatrixGemm(m, k, a, 20); } @@ -155,10 +154,10 @@ TEST(SparseMatrix, GemmEqNotSquare) { std::size_t m = 1000; std::size_t k = 10; auto csr = dr::generate_random_csr({m, k}, 0.1f); - dr::mp::distributed_sparse_matrix< - float, unsigned long, dr::mp::MpiBackend, - dr::mp::csr_eq_distribution> - a(csr, 0); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_eq_distribution> + a(csr, 0); testMatrixGemm(m, k, a, 20); } @@ -166,10 +165,10 @@ TEST(SparseMatrix, GemmRowNotSquareDifferentAxis) { std::size_t m = 10; std::size_t k = 1000; auto csr = dr::generate_random_csr({m, k}, 0.1f); - dr::mp::distributed_sparse_matrix< - float, unsigned long, dr::mp::MpiBackend, - dr::mp::csr_row_distribution> - a(csr, 0); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_row_distribution> + a(csr, 0); testMatrixGemm(m, k, a, 20); } @@ -177,9 +176,9 @@ TEST(SparseMatrix, GemmEqNotSquareDifferentAxis) { std::size_t m = 10; std::size_t k = 1000; auto csr = dr::generate_random_csr({m, k}, 0.1f); - dr::mp::distributed_sparse_matrix< - float, unsigned long, dr::mp::MpiBackend, - dr::mp::csr_eq_distribution> - a(csr, 0); + dr::mp::distributed_sparse_matrix< + float, unsigned long, 
dr::mp::MpiBackend, + dr::mp::csr_eq_distribution> + a(csr, 0); testMatrixGemm(m, k, a, 20); } diff --git a/test/gtest/mp/sparse_matrix.cpp b/test/gtest/mp/sparse_matrix.cpp index 92a8517db1..133739d730 100644 --- a/test/gtest/mp/sparse_matrix.cpp +++ b/test/gtest/mp/sparse_matrix.cpp @@ -1,25 +1,29 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + #include "xp-tests.hpp" -auto testMatrixIter(auto& src, auto &matrix) { - EXPECT_TRUE(src.size() == matrix.size()); - std::map, double> entries; - for (auto [index, val]: src) { - entries[{index.first, index.second}] = val; - } - for (auto [index, val]: matrix) { - EXPECT_TRUE((val == entries[{index.first, index.second}])); - } +auto testMatrixIter(auto &src, auto &matrix) { + EXPECT_TRUE(src.size() == matrix.size()); + std::map, double> entries; + for (auto [index, val] : src) { + entries[{index.first, index.second}] = val; + } + for (auto [index, val] : matrix) { + EXPECT_TRUE((val == entries[{index.first, index.second}])); + } } TEST(SparseMatrix, staticAssertEq) { std::size_t m = 100; std::size_t k = 100; - using Dist = dr::mp::csr_eq_distribution; + using Dist = + dr::mp::csr_eq_distribution; static_assert(dr::mp::matrix_distibution); static_assert(dr::mp::vector_multiplicable); auto csr = dr::generate_random_csr({m, k}, 0.1f); - dr::mp::distributed_sparse_matrix< - float, unsigned long, dr::mp::MpiBackend> - a(csr, 0); + dr::mp::distributed_sparse_matrix a( + csr, 0); static_assert(std::forward_iterator); static_assert(std::forward_iterator); static_assert(std::forward_iterator); @@ -32,13 +36,13 @@ TEST(SparseMatrix, staticAssertEq) { TEST(SparseMatrix, staticAssertRow) { std::size_t m = 100; std::size_t k = 100; - using Dist = dr::mp::csr_row_distribution; + using Dist = + dr::mp::csr_row_distribution; static_assert(dr::mp::matrix_distibution); static_assert(dr::mp::vector_multiplicable); auto csr = dr::generate_random_csr({m, k}, 0.1f); - 
dr::mp::distributed_sparse_matrix< - float, unsigned long, dr::mp::MpiBackend> - a(csr, 0); + dr::mp::distributed_sparse_matrix a( + csr, 0); static_assert(std::forward_iterator); static_assert(std::forward_iterator); static_assert(std::forward_iterator); @@ -48,15 +52,14 @@ TEST(SparseMatrix, staticAssertRow) { static_assert(dr::distributed_range); } - TEST(SparseMatrix, IterRow) { std::size_t m = 100; std::size_t k = 100; auto csr = dr::generate_random_csr({m, k}, 0.1f); - dr::mp::distributed_sparse_matrix< - float, unsigned long, dr::mp::MpiBackend, - dr::mp::csr_row_distribution> - a(csr, 0); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_row_distribution> + a(csr, 0); testMatrixIter(csr, a); } @@ -64,9 +67,9 @@ TEST(SparseMatrix, IterEq) { std::size_t m = 100; std::size_t k = 100; auto csr = dr::generate_random_csr({m, k}, 0.1f); - dr::mp::distributed_sparse_matrix< - float, unsigned long, dr::mp::MpiBackend, - dr::mp::csr_eq_distribution> - a(csr, 0); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_eq_distribution> + a(csr, 0); testMatrixIter(csr, a); } diff --git a/test/gtest/sp/gemv.cpp b/test/gtest/sp/gemv.cpp index 1b3e0bbf64..7a467f26c4 100644 --- a/test/gtest/sp/gemv.cpp +++ b/test/gtest/sp/gemv.cpp @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "xp-tests.hpp" #include "dr/detail/coo_matrix.hpp" +#include "xp-tests.hpp" TEST(SparseMatrix, Gemv) { std::size_t m = 100; std::size_t k = 100; @@ -42,7 +42,7 @@ TEST(SparseMatrix, EmptyGemv) { dr::__detail::coo_matrix base; auto csr = dr::__detail::convert_to_csr(base, {m, k}, base.size(), - std::allocator{}); + std::allocator{}); dr::sp::sparse_matrix a = dr::sp::create_distributed(csr, dr::sp::row_cyclic()); @@ -74,7 +74,7 @@ TEST(SparseMatrix, ZeroVector) { } auto csr = dr::__detail::convert_to_csr(base, {m, k}, base.size(), - std::allocator{}); + std::allocator{}); dr::sp::sparse_matrix a = 
dr::sp::create_distributed(csr, dr::sp::row_cyclic()); diff --git a/test/gtest/sp/sparse.cpp b/test/gtest/sp/sparse.cpp index b6fb93e7a5..bf0f1d7b17 100644 --- a/test/gtest/sp/sparse.cpp +++ b/test/gtest/sp/sparse.cpp @@ -17,7 +17,7 @@ TEST(SparseMatrix, IterationForward) { std::vector, T>> reference(base.size()); std::copy(base.begin(), base.end(), reference.begin()); auto csr = dr::__detail::convert_to_csr(base, {m, k}, base.size(), - std::allocator{}); + std::allocator{}); dr::sp::sparse_matrix a = dr::sp::create_distributed(csr, dr::sp::row_cyclic()); int i = 0; @@ -49,7 +49,7 @@ TEST(SparseMatrix, IterationReverse) { std::vector, T>> reference(base.size()); std::copy(base.begin(), base.end(), reference.begin()); auto csr = dr::__detail::convert_to_csr(base, {m, k}, base.size(), - std::allocator{}); + std::allocator{}); dr::sp::sparse_matrix a = dr::sp::create_distributed(csr, dr::sp::row_cyclic()); int i = base.size(); From 05f5c631f40d573a7c98dde44cd8643d9338e172 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Tue, 12 Nov 2024 06:49:07 -0800 Subject: [PATCH 39/68] Fix compilation on borealis --- include/dr/detail/index.hpp | 1 + test/gtest/sp/gemv.cpp | 16 ++++++++-------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/include/dr/detail/index.hpp b/include/dr/detail/index.hpp index 3c496e8a57..eb21398d06 100644 --- a/include/dr/detail/index.hpp +++ b/include/dr/detail/index.hpp @@ -8,6 +8,7 @@ #include #include #include +#include namespace dr { diff --git a/test/gtest/sp/gemv.cpp b/test/gtest/sp/gemv.cpp index 7a467f26c4..f573d6bc0e 100644 --- a/test/gtest/sp/gemv.cpp +++ b/test/gtest/sp/gemv.cpp @@ -5,8 +5,8 @@ #include "dr/detail/coo_matrix.hpp" #include "xp-tests.hpp" TEST(SparseMatrix, Gemv) { - std::size_t m = 100; - std::size_t k = 100; + long m = 100; + long k = 100; dr::sp::sparse_matrix a( {m, k}, 0.1f, @@ -94,8 +94,8 @@ TEST(SparseMatrix, ZeroVector) { } TEST(SparseMatrix, NotSquareMatrix) { - std::size_t m = 10; - std::size_t k = 
1000; + long m = 10; + long k = 1000; dr::sp::sparse_matrix a( {m, k}, 0.1f, @@ -124,8 +124,8 @@ TEST(SparseMatrix, NotSquareMatrix) { } TEST(SparseMatrix, NotSquareMatrixOtherAxis) { - std::size_t m = 1000; - std::size_t k = 10; + long m = 1000; + long k = 10; dr::sp::sparse_matrix a( {m, k}, 0.1f, @@ -154,8 +154,8 @@ TEST(SparseMatrix, NotSquareMatrixOtherAxis) { } TEST(SparseMatrix, VerySparseMatrix) { - std::size_t m = 100; - std::size_t k = 100; + long m = 100; + long k = 100; dr::sp::sparse_matrix a( {m, k}, 0.001f, From 06a6628111faed5771d77931231b75e4127ce2cc Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Wed, 13 Nov 2024 13:05:04 +0100 Subject: [PATCH 40/68] fix compilation --- include/dr/detail/coo_matrix.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/dr/detail/coo_matrix.hpp b/include/dr/detail/coo_matrix.hpp index 1891510f04..1cf8e4bc83 100644 --- a/include/dr/detail/coo_matrix.hpp +++ b/include/dr/detail/coo_matrix.hpp @@ -110,14 +110,14 @@ class coo_matrix { } iterator find(key_type key) noexcept { - return std::find_if(begin(), end(), [&](auto &&v) { + return std::ranges::find_if(begin(), end(), [&](auto &&v) { auto &&[i, v_] = v; return i == key; }); } const_iterator find(key_type key) const noexcept { - return std::find_if(begin(), end(), [&](auto &&v) { + return std::ranges::find_if(begin(), end(), [&](auto &&v) { auto &&[i, v_] = v; return i == key; }); From aa706f7c3710dc52c38388f1270ea431d5ab7439 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Wed, 13 Nov 2024 15:50:25 +0100 Subject: [PATCH 41/68] Fix issues with very small and very big matrices --- include/dr/mp/containers/broadcasted_slim_matrix.hpp | 4 ++-- .../dr/mp/containers/matrix_formats/csr_eq_distribution.hpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/dr/mp/containers/broadcasted_slim_matrix.hpp b/include/dr/mp/containers/broadcasted_slim_matrix.hpp index e59ec10391..ed16695ae4 100644 --- 
a/include/dr/mp/containers/broadcasted_slim_matrix.hpp +++ b/include/dr/mp/containers/broadcasted_slim_matrix.hpp @@ -51,8 +51,8 @@ class broadcasted_slim_matrix { rng::copy(root_data.begin(), root_data.end(), _data); } } - auto position = 0; - auto reminder = sizeof(T) * _data_size; + std::size_t position = 0; + std::size_t reminder = sizeof(T) * _data_size; while (reminder > INT_MAX) { comm.bcast(((uint8_t *)_data) + position, INT_MAX, root); position += INT_MAX; diff --git a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp index 132cdbc154..8e6a3202ab 100644 --- a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp @@ -63,7 +63,7 @@ class csr_eq_distribution { auto real_segment_size = std::min(nnz_ - rank * segment_size_, segment_size_); auto local_data = rows_data_; - auto division = real_segment_size / 50; + auto division = std::max(1ul, real_segment_size / 50); auto one_computation_size = (real_segment_size + division - 1) / division; auto row_size = row_size_; dr::__detail::parallel_for_workaround( From 8f1a2b7e8efb1f30fc720704041ba8990812883a Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Wed, 13 Nov 2024 16:08:44 +0100 Subject: [PATCH 42/68] Fix compilation on older OneDpl --- benchmarks/gbench/mp/fft3d.cpp | 6 +++++- benchmarks/gbench/sp/fft3d.cpp | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/benchmarks/gbench/mp/fft3d.cpp b/benchmarks/gbench/mp/fft3d.cpp index 5e30a57b44..4bda1e2d7b 100644 --- a/benchmarks/gbench/mp/fft3d.cpp +++ b/benchmarks/gbench/mp/fft3d.cpp @@ -5,7 +5,11 @@ #include "cxxopts.hpp" #include "fmt/core.h" #include "mpi.h" -#include "oneapi/mkl/dft.hpp" +#if (ONEDPL_VERSION_MAJOR >= 2025) + #include "oneapi/mkl/dft.hpp" +#else + #include "oneapi/mkl/dfti.hpp" +#endif #include #include "dr/mp.hpp" diff --git a/benchmarks/gbench/sp/fft3d.cpp 
b/benchmarks/gbench/sp/fft3d.cpp index 19d4f3aee4..61f9517528 100644 --- a/benchmarks/gbench/sp/fft3d.cpp +++ b/benchmarks/gbench/sp/fft3d.cpp @@ -3,7 +3,11 @@ // SPDX-License-Identifier: BSD-3-Clause #include "cxxopts.hpp" -#include "oneapi/mkl/dft.hpp" +#if (ONEDPL_VERSION_MAJOR >= 2025) + #include "oneapi/mkl/dft.hpp" +#else + #include "oneapi/mkl/dfti.hpp" +#endif #include #include #include From 28e023ecd0fb143e90cfc411f9071b15bcfe1381 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Wed, 13 Nov 2024 16:10:34 +0100 Subject: [PATCH 43/68] Fix style --- benchmarks/gbench/mp/fft3d.cpp | 4 ++-- benchmarks/gbench/sp/fft3d.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/gbench/mp/fft3d.cpp b/benchmarks/gbench/mp/fft3d.cpp index 4bda1e2d7b..8ae7d068c6 100644 --- a/benchmarks/gbench/mp/fft3d.cpp +++ b/benchmarks/gbench/mp/fft3d.cpp @@ -6,9 +6,9 @@ #include "fmt/core.h" #include "mpi.h" #if (ONEDPL_VERSION_MAJOR >= 2025) - #include "oneapi/mkl/dft.hpp" +#include "oneapi/mkl/dft.hpp" #else - #include "oneapi/mkl/dfti.hpp" +#include "oneapi/mkl/dfti.hpp" #endif #include diff --git a/benchmarks/gbench/sp/fft3d.cpp b/benchmarks/gbench/sp/fft3d.cpp index 61f9517528..ee1de59ae5 100644 --- a/benchmarks/gbench/sp/fft3d.cpp +++ b/benchmarks/gbench/sp/fft3d.cpp @@ -4,9 +4,9 @@ #include "cxxopts.hpp" #if (ONEDPL_VERSION_MAJOR >= 2025) - #include "oneapi/mkl/dft.hpp" +#include "oneapi/mkl/dft.hpp" #else - #include "oneapi/mkl/dfti.hpp" +#include "oneapi/mkl/dfti.hpp" #endif #include #include From 55185dc0ba0cae7152acf05dbbb74c8388eb6838 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Wed, 13 Nov 2024 16:45:12 +0100 Subject: [PATCH 44/68] Some fixes with verions --- benchmarks/gbench/mp/fft3d.cpp | 2 +- benchmarks/gbench/sp/fft3d.cpp | 2 +- include/dr/detail/index.hpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/gbench/mp/fft3d.cpp b/benchmarks/gbench/mp/fft3d.cpp index 8ae7d068c6..23dcbbdae3 100644 --- 
a/benchmarks/gbench/mp/fft3d.cpp +++ b/benchmarks/gbench/mp/fft3d.cpp @@ -5,7 +5,7 @@ #include "cxxopts.hpp" #include "fmt/core.h" #include "mpi.h" -#if (ONEDPL_VERSION_MAJOR >= 2025) +#if (__INTEL_LLVM_COMPILER >= 20250000) #include "oneapi/mkl/dft.hpp" #else #include "oneapi/mkl/dfti.hpp" diff --git a/benchmarks/gbench/sp/fft3d.cpp b/benchmarks/gbench/sp/fft3d.cpp index ee1de59ae5..cdf060c88c 100644 --- a/benchmarks/gbench/sp/fft3d.cpp +++ b/benchmarks/gbench/sp/fft3d.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: BSD-3-Clause #include "cxxopts.hpp" -#if (ONEDPL_VERSION_MAJOR >= 2025) +#if (__INTEL_LLVM_COMPILER >= 20250000) #include "oneapi/mkl/dft.hpp" #else #include "oneapi/mkl/dfti.hpp" diff --git a/include/dr/detail/index.hpp b/include/dr/detail/index.hpp index eb21398d06..f36e798363 100644 --- a/include/dr/detail/index.hpp +++ b/include/dr/detail/index.hpp @@ -5,10 +5,10 @@ #pragma once #include +#include #include #include #include -#include namespace dr { From b7704ea612aae77f8935d7ddc7cce02201eec7a8 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Wed, 20 Nov 2024 15:26:44 +0100 Subject: [PATCH 45/68] Add local to csr_eq_segment --- include/dr/mp/algorithms/reduce.hpp | 3 +- .../matrix_formats/csr_eq_segment.hpp | 30 +++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/include/dr/mp/algorithms/reduce.hpp b/include/dr/mp/algorithms/reduce.hpp index 839dc83a4a..166abfd44a 100644 --- a/include/dr/mp/algorithms/reduce.hpp +++ b/include/dr/mp/algorithms/reduce.hpp @@ -35,10 +35,11 @@ inline auto dpl_reduce(rng::forward_range auto &&r, auto &&binary_op) { sycl::known_identity_v, binary_op); } else { dr::drlog.debug(" peel 1st value\n"); + auto base = *rng::begin(r); return std::reduce(dpl_policy(), dr::__detail::direct_iterator(rng::begin(r) + 1), dr::__detail::direct_iterator(rng::end(r)), - sycl_get(*rng::begin(r)), binary_op); + sycl_get(base), binary_op); } } #else diff --git 
a/include/dr/mp/containers/matrix_formats/csr_eq_segment.hpp b/include/dr/mp/containers/matrix_formats/csr_eq_segment.hpp index f08c1dc4ef..527dc82ffb 100644 --- a/include/dr/mp/containers/matrix_formats/csr_eq_segment.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_eq_segment.hpp @@ -235,6 +235,36 @@ template class csr_eq_segment_iterator { return dr::__detail::drop_segments(dsm_->segments(), segment_index_, index_); } + + auto local() const { + const auto my_process_segment_index = dsm_->rows_backend_.getrank(); + + assert(my_process_segment_index == segment_index_); + // auto offset = dsm_->row_offsets_[segment_index_]; + // auto row_size = dsm_->row_size_; + auto segment_size = dsm_->vals_data_->segment_size(); + auto local_vals = dsm_->vals_data_->segments()[segment_index_].begin().local(); + auto local_vals_range = rng::subrange(local_vals, local_vals + segment_size); + auto local_cols = dsm_->cols_data_->segments()[segment_index_].begin().local(); + auto local_cols_range = rng::subrange(local_cols, local_cols + segment_size); + // auto local_rows = dsm_->rows_data_; + auto zipped_results = rng::views::zip(local_vals_range, local_cols_range); + auto enumerated_zipped = rng::views::enumerate(zipped_results); + auto transformer = [&](auto entry) { + auto [index, pair] = entry; + auto [val, column] = pair; + auto row = 0; //TODO fix calculating row - it results in segfault + // auto row = rng::distance( + // local_rows, + // std::upper_bound(local_rows, local_rows + row_size, offset + index) - + // 1); + dr::index index_obj(row, column); + value_type entry_obj(index_obj, val); + return entry_obj; + }; + auto transformed_res = rng::transform_view(enumerated_zipped, transformer); + return transformed_res.begin(); + } private: // all fields need to be initialized by default ctor so every default From 4acbad6eed1c55f01029cf2becf3ce9c98402037 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Fri, 22 Nov 2024 15:26:33 +0100 Subject: [PATCH 46/68] Add proper 
local method --- .../matrix_formats/csr_eq_distribution.hpp | 3 +- .../matrix_formats/csr_eq_segment.hpp | 12 ++++--- .../matrix_formats/csr_row_distribution.hpp | 3 +- .../matrix_formats/csr_row_segment.hpp | 33 +++++++++++++++++++ 4 files changed, 45 insertions(+), 6 deletions(-) diff --git a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp index 8e6a3202ab..aa969c773b 100644 --- a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp @@ -12,6 +12,7 @@ template class csr_eq_distribution { public: using value_type = dr::matrix_entry; + using segment_type = csr_eq_segment; using elem_type = T; using index_type = I; using difference_type = std::ptrdiff_t; @@ -311,7 +312,7 @@ class csr_eq_distribution { distribution distribution_; dr::index shape_; std::size_t nnz_; - std::vector> segments_; + std::vector segments_; std::shared_ptr> vals_data_; std::shared_ptr> cols_data_; }; diff --git a/include/dr/mp/containers/matrix_formats/csr_eq_segment.hpp b/include/dr/mp/containers/matrix_formats/csr_eq_segment.hpp index 527dc82ffb..f0aa7425e5 100644 --- a/include/dr/mp/containers/matrix_formats/csr_eq_segment.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_eq_segment.hpp @@ -240,7 +240,8 @@ template class csr_eq_segment_iterator { const auto my_process_segment_index = dsm_->rows_backend_.getrank(); assert(my_process_segment_index == segment_index_); - // auto offset = dsm_->row_offsets_[segment_index_]; + // const auto offset = dsm_->row_offsets_[segment_index_]; + // assert(offset == 0); // auto row_size = dsm_->row_size_; auto segment_size = dsm_->vals_data_->segment_size(); auto local_vals = dsm_->vals_data_->segments()[segment_index_].begin().local(); @@ -250,19 +251,22 @@ template class csr_eq_segment_iterator { // auto local_rows = dsm_->rows_data_; auto zipped_results = rng::views::zip(local_vals_range, 
local_cols_range); auto enumerated_zipped = rng::views::enumerate(zipped_results); - auto transformer = [&](auto entry) { + auto transformer = [=](auto entry) { + // assert(offset == 0); auto [index, pair] = entry; auto [val, column] = pair; auto row = 0; //TODO fix calculating row - it results in segfault + // problem originates from the fact that variables cannot be caputed properly by value // auto row = rng::distance( // local_rows, - // std::upper_bound(local_rows, local_rows + row_size, offset + index) - + // std::upper_bound(local_rows, local_rows + row_size, offset) - // 1); dr::index index_obj(row, column); value_type entry_obj(index_obj, val); return entry_obj; }; - auto transformed_res = rng::transform_view(enumerated_zipped, transformer); + auto transformed_res = rng::views::transform(enumerated_zipped, transformer); + // static_assert(std::is_same::value); return transformed_res.begin(); } diff --git a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp index 8dd21a65e9..4095bde5c8 100644 --- a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp @@ -13,6 +13,7 @@ template class csr_row_distribution { public: using value_type = dr::matrix_entry; + using segment_type = csr_row_segment; using elem_type = T; using index_type = I; using difference_type = std::ptrdiff_t; @@ -302,7 +303,7 @@ class csr_row_distribution { distribution distribution_; dr::index shape_; std::size_t nnz_; - std::vector> segments_; + std::vector segments_; std::shared_ptr> rows_data_; }; } // namespace dr::mp diff --git a/include/dr/mp/containers/matrix_formats/csr_row_segment.hpp b/include/dr/mp/containers/matrix_formats/csr_row_segment.hpp index f706898667..5cb109a806 100644 --- a/include/dr/mp/containers/matrix_formats/csr_row_segment.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_row_segment.hpp @@ -223,6 
+223,39 @@ template class csr_row_segment_iterator { assert(dsm_ != nullptr); return dr::__detail::drop_segments(dsm_->segments(), segment_index_, index_); + } + + auto local() const { + const auto my_process_segment_index = dsm_->vals_backend_.getrank(); + + assert(my_process_segment_index == segment_index_); + std::size_t offset = dsm_->segment_size_ * segment_index_; + assert(offset == 0); + // auto row_size = dsm_->segment_size_; + auto vals_size = dsm_->vals_size_; + auto local_vals = dsm_->vals_data_; + auto local_vals_range = rng::subrange(local_vals, local_vals + vals_size); + auto local_cols = dsm_->cols_data_; + auto local_cols_range = rng::subrange(local_cols, local_cols + vals_size); + // auto local_rows = dsm_->rows_data_->segments()[segment_index_].begin().local(); + auto zipped_results = rng::views::zip(local_vals_range, local_cols_range); + auto enumerated_zipped = rng::views::enumerate(zipped_results); + auto transformer = [=](auto entry) { + assert(offset == 0); + auto [index, pair] = entry; + auto [val, column] = pair; + auto row = 0; //TODO fix calculating row - it results in segfault + // problem originates from the fact that variables cannot be caputed properly by value + // auto row = rng::distance( + // local_rows, + // std::upper_bound(local_rows, local_rows + row_size, offset) - + // 1); + dr::index index_obj(row, column); + value_type entry_obj(index_obj, val); + return entry_obj; + }; + auto transformed_res = rng::views::transform(enumerated_zipped, transformer); + return transformed_res.begin(); } private: From 1f84ba7b8adf92696e317740f19bd497754cd9a2 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Mon, 25 Nov 2024 12:15:28 +0100 Subject: [PATCH 47/68] Add problem to review --- examples/mp/CMakeLists.txt | 1 + examples/mp/local_issue.cpp | 41 +++++++++ .../matrix_formats/csr_row_segment.hpp | 91 ++++++++++++++----- include/dr/views/transform.hpp | 2 +- 4 files changed, 109 insertions(+), 26 deletions(-) create mode 100644 
examples/mp/local_issue.cpp diff --git a/examples/mp/CMakeLists.txt b/examples/mp/CMakeLists.txt index 54e9db9bd0..1af7dbc45d 100644 --- a/examples/mp/CMakeLists.txt +++ b/examples/mp/CMakeLists.txt @@ -34,6 +34,7 @@ add_mp_example(hello_world) add_mp_example_no_test(sparse_matrix) add_mp_example_no_test(sparse_benchmark) add_mp_example_no_test(sparse_matrix_matrix_mul) +add_mp_example_no_test(local_issue) if(OpenMP_FOUND) add_executable(vector-add-ref vector-add-ref.cpp) diff --git a/examples/mp/local_issue.cpp b/examples/mp/local_issue.cpp new file mode 100644 index 0000000000..c6c601ce05 --- /dev/null +++ b/examples/mp/local_issue.cpp @@ -0,0 +1,41 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#include +#include + +namespace mp = dr::mp; + +int main(int argc, char **argv) { + +#ifdef SYCL_LANGUAGE_VERSION + mp::init(sycl::default_selector_v); +#else + mp::init(); +#endif + + dr::views::csr_matrix_view local_data; + auto root = 0; + if (root == dr::mp::default_comm().rank()) { + local_data = dr::generate_band_csr(100, 2, 2); + } + { + mp::distributed_sparse_matrix< + double, long, dr::mp::MpiBackend, + dr::mp::csr_row_distribution> + m_row(local_data, root); + auto b = m_row.segments()[0].begin().local(); + auto [ind, val] = *b; + auto [n, ma] = ind; + fmt::print("some res 2 {} {} {}\n", val, n, ma); + + } + + if (root == dr::mp::default_comm().rank()) { + dr::__detail::destroy_csr_matrix_view(local_data, std::allocator{}); + } + mp::finalize(); + + return 0; +} diff --git a/include/dr/mp/containers/matrix_formats/csr_row_segment.hpp b/include/dr/mp/containers/matrix_formats/csr_row_segment.hpp index 5cb109a806..b2d5715d70 100644 --- a/include/dr/mp/containers/matrix_formats/csr_row_segment.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_row_segment.hpp @@ -4,8 +4,49 @@ #pragma once + +int some_id_base =0; namespace dr::mp { +namespace __detail { + template + class transform_fn_1 { + public: + using 
value_type = V; + using index_type = T; + transform_fn_1(std::size_t offset, std::size_t row_size, T* row_ptr): + offset_(offset), row_size_(row_size), row_ptr_(row_ptr) { + assert(offset_ == 0); + myid = some_id_base++; + fmt::print("created {}\n", myid); + } + ~transform_fn_1() { + destroyed = true; + fmt::print("destroyed {}\n", myid); + } + template + auto operator()(P entry) const { + fmt::print("called {}\n", myid); + assert(offset_ == 0); + assert(!destroyed); + auto [index, pair] = entry; + auto [val, column] = pair; + auto row = rng::distance( + row_ptr_, + std::upper_bound(row_ptr_, row_ptr_ + row_size_, offset_) - + 1); + dr::index index_obj(row, column); + value_type entry_obj(index_obj, val); + return entry_obj; + } + private: + int myid = 0; + bool destroyed = false; + std::size_t offset_; + std::size_t row_size_; + T* row_ptr_; + }; +} template class csr_row_segment_iterator; template class csr_row_segment_reference { @@ -55,6 +96,10 @@ template class csr_row_segment_iterator { dsm_ = dsm; segment_index_ = segment_index; index_ = index; + if (dsm_->vals_backend_.getrank() == segment_index_) { + elem_view_ = get_elem_view(dsm_, segment_index); + base_iter = elem_view_.begin(); + } } auto operator<=>(const csr_row_segment_iterator &other) const noexcept { @@ -227,40 +272,36 @@ template class csr_row_segment_iterator { auto local() const { const auto my_process_segment_index = dsm_->vals_backend_.getrank(); - assert(my_process_segment_index == segment_index_); - std::size_t offset = dsm_->segment_size_ * segment_index_; - assert(offset == 0); - // auto row_size = dsm_->segment_size_; - auto vals_size = dsm_->vals_size_; - auto local_vals = dsm_->vals_data_; + auto [a, b] = *base_iter; + auto [c, d] = a; + fmt::print("aqwsedrftgyhuji {} {} {}\n", b, c, d); + return base_iter; + } + +private: + + static auto get_elem_view(DSM *dsm, std::size_t segment_index) { + std::size_t offset = dsm->segment_size_ * segment_index; + auto row_size = dsm->segment_size_; 
+ auto vals_size = dsm->vals_size_; + auto local_vals = dsm->vals_data_; auto local_vals_range = rng::subrange(local_vals, local_vals + vals_size); - auto local_cols = dsm_->cols_data_; + auto local_cols = dsm->cols_data_; auto local_cols_range = rng::subrange(local_cols, local_cols + vals_size); - // auto local_rows = dsm_->rows_data_->segments()[segment_index_].begin().local(); + auto local_rows = dsm->rows_data_->segments()[segment_index].begin().local(); auto zipped_results = rng::views::zip(local_vals_range, local_cols_range); auto enumerated_zipped = rng::views::enumerate(zipped_results); - auto transformer = [=](auto entry) { - assert(offset == 0); - auto [index, pair] = entry; - auto [val, column] = pair; - auto row = 0; //TODO fix calculating row - it results in segfault - // problem originates from the fact that variables cannot be caputed properly by value - // auto row = rng::distance( - // local_rows, - // std::upper_bound(local_rows, local_rows + row_size, offset) - - // 1); - dr::index index_obj(row, column); - value_type entry_obj(index_obj, val); - return entry_obj; - }; - auto transformed_res = rng::views::transform(enumerated_zipped, transformer); - return transformed_res.begin(); + auto transformer = __detail::transform_fn_1(offset, row_size, local_rows); + return rng::views::transform(enumerated_zipped, transformer); } -private: // all fields need to be initialized by default ctor so every default // constructed iter is equal to any other default constructed iter + using view_type = decltype(get_elem_view(std::declval(), 0)); + using iter_type = rng::iterator_t; + view_type elem_view_; + iter_type base_iter; DSM *dsm_ = nullptr; std::size_t segment_index_ = 0; std::size_t index_ = 0; diff --git a/include/dr/views/transform.hpp b/include/dr/views/transform.hpp index b99a07ad32..a025a89e01 100644 --- a/include/dr/views/transform.hpp +++ b/include/dr/views/transform.hpp @@ -109,7 +109,7 @@ class transform_iterator { 
requires(dr::ranges::__detail::has_local) { auto iter = dr::ranges::__detail::local(iter_); - return transform_iterator(iter, fn_); + return transform_iterator(std::move(iter), fn_); } private: From ba20ee33e4fae421cc2ba162c8286fb25c1d2796 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Mon, 25 Nov 2024 15:13:50 +0100 Subject: [PATCH 48/68] Moved local view to distribution --- examples/mp/local_issue.cpp | 9 ++- .../matrix_formats/csr_row_distribution.hpp | 69 +++++++++++++++++++ .../matrix_formats/csr_row_segment.hpp | 69 +------------------ 3 files changed, 77 insertions(+), 70 deletions(-) diff --git a/examples/mp/local_issue.cpp b/examples/mp/local_issue.cpp index c6c601ce05..806c3567ee 100644 --- a/examples/mp/local_issue.cpp +++ b/examples/mp/local_issue.cpp @@ -18,7 +18,7 @@ int main(int argc, char **argv) { dr::views::csr_matrix_view local_data; auto root = 0; if (root == dr::mp::default_comm().rank()) { - local_data = dr::generate_band_csr(100, 2, 2); + local_data = dr::generate_band_csr(10, 0, 1); } { mp::distributed_sparse_matrix< @@ -30,8 +30,13 @@ int main(int argc, char **argv) { auto [n, ma] = ind; fmt::print("some res 2 {} {} {}\n", val, n, ma); + auto mapper = [] (auto elem) { auto [a, b] = elem; auto [c, d] = a; return d;}; + auto summer = [](auto x, auto y) { return x + y;}; + auto z2 = dr::transform_view(m_row, mapper); + auto red2 = dr::mp::reduce(z2, 0, summer); + fmt::print("reduced row {} {}\n", red2, m_row.size()); } - + if (root == dr::mp::default_comm().rank()) { dr::__detail::destroy_csr_matrix_view(local_data, std::allocator{}); } diff --git a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp index 4095bde5c8..19cc9bfa5e 100644 --- a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp @@ -8,6 +8,42 @@ #include namespace dr::mp { +namespace __detail { + template + class 
transform_fn_1 { + public: + using value_type = V; + using index_type = T; + transform_fn_1(std::size_t offset, std::size_t row_size, T* row_ptr): + offset_(offset), row_size_(row_size), row_ptr_(row_ptr) { + assert(offset_ == 0); + } + + ~transform_fn_1() { + destroyed = true; + } + template + auto operator()(P entry) const { + assert(offset_ == 0); + assert(!destroyed); + auto [index, pair] = entry; + auto [val, column] = pair; + auto row = 0; + // auto row = rng::distance( + // row_ptr_, + // std::upper_bound(row_ptr_, row_ptr_ + row_size_, offset_ + index) - + // 1); + dr::index index_obj(row, column); + value_type entry_obj(index_obj, val); + return entry_obj; + } + private: + bool destroyed = false; + std::size_t offset_; + std::size_t row_size_; + T* row_ptr_; + }; +} template class csr_row_distribution { @@ -287,8 +323,41 @@ class csr_row_distribution { std::max(val_sizes_[i], static_cast(1))); } fence(); + local_view = get_elem_view(vals_size_, cols_data_, vals_data_, rows_data_, rank); } + static auto get_elem_view(std::size_t vals_size, + index_type *local_cols, + elem_type *local_vals, + std::shared_ptr> rows_data, + std::size_t rank) { + auto row_size = rows_data->segment_size(); + std::size_t offset = row_size * rank; + auto local_vals_range = rng::subrange(local_vals, local_vals + vals_size); + auto local_cols_range = rng::subrange(local_cols, local_cols + vals_size); + // auto local_rows = rows_data->segments()[rank].begin().local(); + auto zipped_results = rng::views::zip(local_vals_range, local_cols_range); + auto enumerated_zipped = rng::views::enumerate(zipped_results); + auto transformer = [=](auto entry){ + assert(offset == 0); + auto [index, pair] = entry; + auto [val, column] = pair; + auto row = 0; + // auto row = rng::distance( + // local_rows, + // std::upper_bound(local_rows, local_rows + row_size, offset_ + index) - + // 1); + dr::index index_obj(row, column); + value_type entry_obj(index_obj, val); + return entry_obj; + }; + 
//__detail::transform_fn_1(offset, row_size, local_rows); + return rng::views::transform(enumerated_zipped, transformer); + } + + using view_type = decltype(get_elem_view(0, nullptr, nullptr, std::shared_ptr>(nullptr),0)); + + view_type local_view; std::size_t segment_size_ = 0; std::size_t vals_size_ = 0; std::vector val_offsets_; diff --git a/include/dr/mp/containers/matrix_formats/csr_row_segment.hpp b/include/dr/mp/containers/matrix_formats/csr_row_segment.hpp index b2d5715d70..be7554529d 100644 --- a/include/dr/mp/containers/matrix_formats/csr_row_segment.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_row_segment.hpp @@ -5,48 +5,7 @@ #pragma once -int some_id_base =0; namespace dr::mp { -namespace __detail { - template - class transform_fn_1 { - public: - using value_type = V; - using index_type = T; - transform_fn_1(std::size_t offset, std::size_t row_size, T* row_ptr): - offset_(offset), row_size_(row_size), row_ptr_(row_ptr) { - assert(offset_ == 0); - myid = some_id_base++; - fmt::print("created {}\n", myid); - } - - ~transform_fn_1() { - destroyed = true; - fmt::print("destroyed {}\n", myid); - } - template - auto operator()(P entry) const { - fmt::print("called {}\n", myid); - assert(offset_ == 0); - assert(!destroyed); - auto [index, pair] = entry; - auto [val, column] = pair; - auto row = rng::distance( - row_ptr_, - std::upper_bound(row_ptr_, row_ptr_ + row_size_, offset_) - - 1); - dr::index index_obj(row, column); - value_type entry_obj(index_obj, val); - return entry_obj; - } - private: - int myid = 0; - bool destroyed = false; - std::size_t offset_; - std::size_t row_size_; - T* row_ptr_; - }; -} template class csr_row_segment_iterator; template class csr_row_segment_reference { @@ -96,10 +55,6 @@ template class csr_row_segment_iterator { dsm_ = dsm; segment_index_ = segment_index; index_ = index; - if (dsm_->vals_backend_.getrank() == segment_index_) { - elem_view_ = get_elem_view(dsm_, segment_index); - base_iter = elem_view_.begin(); - } 
} auto operator<=>(const csr_row_segment_iterator &other) const noexcept { @@ -273,35 +228,13 @@ template class csr_row_segment_iterator { auto local() const { const auto my_process_segment_index = dsm_->vals_backend_.getrank(); assert(my_process_segment_index == segment_index_); - auto [a, b] = *base_iter; - auto [c, d] = a; - fmt::print("aqwsedrftgyhuji {} {} {}\n", b, c, d); - return base_iter; + return dsm_->local_view.begin(); } private: - static auto get_elem_view(DSM *dsm, std::size_t segment_index) { - std::size_t offset = dsm->segment_size_ * segment_index; - auto row_size = dsm->segment_size_; - auto vals_size = dsm->vals_size_; - auto local_vals = dsm->vals_data_; - auto local_vals_range = rng::subrange(local_vals, local_vals + vals_size); - auto local_cols = dsm->cols_data_; - auto local_cols_range = rng::subrange(local_cols, local_cols + vals_size); - auto local_rows = dsm->rows_data_->segments()[segment_index].begin().local(); - auto zipped_results = rng::views::zip(local_vals_range, local_cols_range); - auto enumerated_zipped = rng::views::enumerate(zipped_results); - auto transformer = __detail::transform_fn_1(offset, row_size, local_rows); - return rng::views::transform(enumerated_zipped, transformer); - } - // all fields need to be initialized by default ctor so every default // constructed iter is equal to any other default constructed iter - using view_type = decltype(get_elem_view(std::declval(), 0)); - using iter_type = rng::iterator_t; - view_type elem_view_; - iter_type base_iter; DSM *dsm_ = nullptr; std::size_t segment_index_ = 0; std::size_t index_ = 0; From bad5606ba8fd8a1b74d691a30d531fda9eb028a3 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Mon, 25 Nov 2024 16:42:49 +0100 Subject: [PATCH 49/68] Add new example of not working code --- examples/mp/local_issue.cpp | 10 ++++++++-- .../matrix_formats/csr_row_distribution.hpp | 12 ++++++------ 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/examples/mp/local_issue.cpp 
b/examples/mp/local_issue.cpp index 806c3567ee..b6ead23aa5 100644 --- a/examples/mp/local_issue.cpp +++ b/examples/mp/local_issue.cpp @@ -30,9 +30,15 @@ int main(int argc, char **argv) { auto [n, ma] = ind; fmt::print("some res 2 {} {} {}\n", val, n, ma); - auto mapper = [] (auto elem) { auto [a, b] = elem; auto [c, d] = a; return d;}; - auto summer = [](auto x, auto y) { return x + y;}; + auto mapper = [] (auto elem) { auto [a, b] = elem; auto [c, d] = a; return c;}; auto z2 = dr::transform_view(m_row, mapper); + for (auto x: local_segments(z2)) { + for (auto z : x) + { + fmt::print("some res {}\n", z); + } + } + auto summer = [](auto x, auto y) { return x + y;}; auto red2 = dr::mp::reduce(z2, 0, summer); fmt::print("reduced row {} {}\n", red2, m_row.size()); } diff --git a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp index 19cc9bfa5e..44acb639c5 100644 --- a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp @@ -335,18 +335,18 @@ class csr_row_distribution { std::size_t offset = row_size * rank; auto local_vals_range = rng::subrange(local_vals, local_vals + vals_size); auto local_cols_range = rng::subrange(local_cols, local_cols + vals_size); - // auto local_rows = rows_data->segments()[rank].begin().local(); + auto local_rows = rows_data->segments()[rank].begin().local(); auto zipped_results = rng::views::zip(local_vals_range, local_cols_range); auto enumerated_zipped = rng::views::enumerate(zipped_results); auto transformer = [=](auto entry){ assert(offset == 0); auto [index, pair] = entry; auto [val, column] = pair; - auto row = 0; - // auto row = rng::distance( - // local_rows, - // std::upper_bound(local_rows, local_rows + row_size, offset_ + index) - - // 1); + // auto row = 0; + auto row = rng::distance( + local_rows, + std::upper_bound(local_rows, local_rows + row_size, offset + index) 
- + 1); dr::index index_obj(row, column); value_type entry_obj(index_obj, val); return entry_obj; From 8e7f1feb088c28788044186e644eff81f8cbbba5 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Wed, 27 Nov 2024 09:38:03 +0100 Subject: [PATCH 50/68] Fix issue with lambda copy --- examples/mp/local_issue.cpp | 33 +++- include/dr/detail/multiply_view.hpp | 146 ++++++++++++++++++ .../matrix_formats/csr_row_distribution.hpp | 79 ++++------ .../matrix_formats/csr_row_segment.hpp | 2 +- include/dr/views/transform.hpp | 2 +- 5 files changed, 205 insertions(+), 57 deletions(-) create mode 100644 include/dr/detail/multiply_view.hpp diff --git a/examples/mp/local_issue.cpp b/examples/mp/local_issue.cpp index b6ead23aa5..c4619c1ed7 100644 --- a/examples/mp/local_issue.cpp +++ b/examples/mp/local_issue.cpp @@ -25,19 +25,40 @@ int main(int argc, char **argv) { double, long, dr::mp::MpiBackend, dr::mp::csr_row_distribution> m_row(local_data, root); - auto b = m_row.segments()[0].begin().local(); - auto [ind, val] = *b; - auto [n, ma] = ind; - fmt::print("some res 2 {} {} {}\n", val, n, ma); auto mapper = [] (auto elem) { auto [a, b] = elem; auto [c, d] = a; return c;}; auto z2 = dr::transform_view(m_row, mapper); - for (auto x: local_segments(z2)) { + for (auto x: local_segments(m_row)) { for (auto z : x) { - fmt::print("some res {}\n", z); + auto [a, b] = z; + auto [c, d] = a; + fmt::print("some res {} {} {}\n", b, c, d); } } + + auto q = dr::mp::sycl_queue(); + auto sum1 = sycl::malloc_shared(1, q); + auto sum2 = sycl::malloc_shared(1, q); + auto sum3 = sycl::malloc_shared(1, q); + auto local_iter = local_segments(m_row); + for (auto x: local_iter) { + q.submit([=](auto &&h) { + h.parallel_for(sycl::nd_range<1>(1,1), [=](auto item) { + for (auto z : x) + { + auto [a, b] = z; + auto [c, d] = a; + sum1[0] = sum1[0] + b; + sum2[0] = sum2[0] + c; + sum3[0] = sum3[0] + d; + } + + }); + }).wait(); + fmt::print("iter vals {} {} {}\n", sum1[0], sum2[0], sum3[0]); + } + auto summer = 
[](auto x, auto y) { return x + y;}; auto red2 = dr::mp::reduce(z2, 0, summer); fmt::print("reduced row {} {}\n", red2, m_row.size()); diff --git a/include/dr/detail/multiply_view.hpp b/include/dr/detail/multiply_view.hpp new file mode 100644 index 0000000000..174f7c2c24 --- /dev/null +++ b/include/dr/detail/multiply_view.hpp @@ -0,0 +1,146 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +#include +#include +#include + +#include +#include + +namespace dr::__detail { + +template +class multiply_iterator { +public: + using value_type = std::iter_value_t; + using difference_type = long long; + using iterator = multiply_iterator; + using reference = value_type; + + using pointer = iterator; + + using iterator_category = std::random_access_iterator_tag; + + multiply_iterator(Iter iter, std::size_t len, long long pos) noexcept : iter_(iter), len_(len), pos_(pos) {} + multiply_iterator() noexcept = default; + ~multiply_iterator() noexcept = default; + multiply_iterator(const multiply_iterator &) noexcept = default; + multiply_iterator &operator=(const multiply_iterator &) noexcept = default; + + bool operator==(const multiply_iterator &other) const noexcept { + return iter_ == other.iter_ && pos_ == other.pos_ && len_ == other.len_; + } + + bool operator!=(const multiply_iterator &other) const noexcept { + return iter_ != other.iter_ || pos_ != other.pos_ || len_ != other.len_; + } + + iterator operator+(difference_type offset) const noexcept { + return iterator(iter_, len_, pos_ + offset); + } + + iterator operator-(difference_type offset) const noexcept { + return iterator(iter_, len_, pos_ + offset); + } + + difference_type operator-(iterator other) const noexcept { + return pos_ - other.pos_; + } + + bool operator<(iterator other) const noexcept { return pos_ < other.pos_; } + + bool operator>(iterator other) const noexcept { return pos_ > other.pos_; } + + bool operator<=(iterator other) const noexcept 
{ + return pos_ <= other.pos_; + } + + bool operator>=(iterator other) const noexcept { + return pos_ >= other.pos_; + } + + iterator &operator++() noexcept { + ++pos_; + return *this; + } + + iterator operator++(int) noexcept { + iterator other = *this; + ++(*this); + return other; + } + + iterator &operator--() noexcept { + --pos_; + return *this; + } + + iterator operator--(int) noexcept { + iterator other = *this; + --(*this); + return other; + } + + iterator &operator+=(difference_type offset) noexcept { + pos_ += offset; + return *this; + } + + iterator &operator-=(difference_type offset) noexcept { + pos_ -= offset; + return *this; + } + + reference operator*() const noexcept { return *(iter_ + (pos_ % len_)); } + + reference operator[](difference_type offset) const noexcept { + return *(*this + offset); + } + + friend iterator operator+(difference_type n, iterator iter) { + return iter.pos_ + n; + } + + auto local() const + requires(dr::ranges::__detail::has_local) + { + auto iter = dr::ranges::__detail::local(iter_); + return multiply_iterator(std::move(iter), len_, pos_); + } + +private: + Iter iter_; + std::size_t len_; + long long pos_; +}; + +template + requires(rng::sized_range) +class multiply_view : public rng::view_interface> { +public: + template + multiply_view(R &&r, std::size_t n) + : base_(rng::views::all(std::forward(r))), n_(n) {} + + auto begin() const { return multiply_iterator(rng::begin(base_), base_.size(), 0); } + + auto end() const { return multiply_iterator(rng::begin(base_), base_.size(), n_ * base_.size()); } + + auto size() const + { + return rng::size(base_); + } + +private: + V base_; + std::size_t n_; +}; + +template +multiply_view(R &&r, std::size_t n) -> multiply_view>; + +} // namespace dr diff --git a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp index 44acb639c5..6116d95acd 100644 --- 
a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp @@ -5,48 +5,13 @@ #include #include #include +#include #include namespace dr::mp { -namespace __detail { - template - class transform_fn_1 { - public: - using value_type = V; - using index_type = T; - transform_fn_1(std::size_t offset, std::size_t row_size, T* row_ptr): - offset_(offset), row_size_(row_size), row_ptr_(row_ptr) { - assert(offset_ == 0); - } - - ~transform_fn_1() { - destroyed = true; - } - template - auto operator()(P entry) const { - assert(offset_ == 0); - assert(!destroyed); - auto [index, pair] = entry; - auto [val, column] = pair; - auto row = 0; - // auto row = rng::distance( - // row_ptr_, - // std::upper_bound(row_ptr_, row_ptr_ + row_size_, offset_ + index) - - // 1); - dr::index index_obj(row, column); - value_type entry_obj(index_obj, val); - return entry_obj; - } - private: - bool destroyed = false; - std::size_t offset_; - std::size_t row_size_; - T* row_ptr_; - }; -} - template class csr_row_distribution { + using view_tuple = std::tuple; public: using value_type = dr::matrix_entry; using segment_type = csr_row_segment; @@ -70,6 +35,7 @@ class csr_row_distribution { if (vals_data_ != nullptr) { vals_backend_.deallocate(vals_data_, vals_size_ * sizeof(index_type)); cols_backend_.deallocate(cols_data_, vals_size_ * sizeof(index_type)); + alloc.deallocate(view_helper_const, 1); } } } @@ -323,26 +289,39 @@ class csr_row_distribution { std::max(val_sizes_[i], static_cast(1))); } fence(); - local_view = get_elem_view(vals_size_, cols_data_, vals_data_, rows_data_, rank); + auto local_rows = rows_data_->segments()[rank].begin().local(); + auto my_tuple = std::make_tuple(rows_data_->segment_size(), segment_size_ * rank, local_rows); + view_helper_const = alloc.allocate(1); + + view_helper_const[0] = my_tuple; + + local_view = std::make_shared(get_elem_view(vals_size_, view_helper_const, cols_data_, 
vals_data_, rank)); } - static auto get_elem_view(std::size_t vals_size, + static auto get_elem_view( + std::size_t vals_size, + view_tuple* helper_tuple, index_type *local_cols, elem_type *local_vals, - std::shared_ptr> rows_data, std::size_t rank) { - auto row_size = rows_data->segment_size(); - std::size_t offset = row_size * rank; auto local_vals_range = rng::subrange(local_vals, local_vals + vals_size); auto local_cols_range = rng::subrange(local_cols, local_cols + vals_size); - auto local_rows = rows_data->segments()[rank].begin().local(); auto zipped_results = rng::views::zip(local_vals_range, local_cols_range); auto enumerated_zipped = rng::views::enumerate(zipped_results); - auto transformer = [=](auto entry){ + // we need to use multiply_view here, + // because lambda is not properly copied to sycl environment + // when we use variable capture + auto multiply_range = dr::__detail::multiply_view(rng::subrange(helper_tuple, helper_tuple + 1), vals_size); + auto enumerted_with_data = rng::views::zip(enumerated_zipped, multiply_range); + + auto transformer = [=](auto x) { + auto [entry, tuple] = x; + auto [row_size, offset, local_rows] = tuple; assert(offset == 0); + assert(local_rows[0] == 0); + assert(row_size == 10); auto [index, pair] = entry; auto [val, column] = pair; - // auto row = 0; auto row = rng::distance( local_rows, std::upper_bound(local_rows, local_rows + row_size, offset + index) - @@ -351,13 +330,15 @@ class csr_row_distribution { value_type entry_obj(index_obj, val); return entry_obj; }; - //__detail::transform_fn_1(offset, row_size, local_rows); - return rng::views::transform(enumerated_zipped, transformer); + return rng::transform_view(enumerted_with_data, std::move(transformer)); } - using view_type = decltype(get_elem_view(0, nullptr, nullptr, std::shared_ptr>(nullptr),0)); + using view_type = decltype(get_elem_view(0, nullptr, nullptr, nullptr,0)); + + dr::mp::__detail::allocator alloc; + view_tuple* view_helper_const; + 
std::shared_ptr local_view; - view_type local_view; std::size_t segment_size_ = 0; std::size_t vals_size_ = 0; std::vector val_offsets_; diff --git a/include/dr/mp/containers/matrix_formats/csr_row_segment.hpp b/include/dr/mp/containers/matrix_formats/csr_row_segment.hpp index be7554529d..22b8da5737 100644 --- a/include/dr/mp/containers/matrix_formats/csr_row_segment.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_row_segment.hpp @@ -228,7 +228,7 @@ template class csr_row_segment_iterator { auto local() const { const auto my_process_segment_index = dsm_->vals_backend_.getrank(); assert(my_process_segment_index == segment_index_); - return dsm_->local_view.begin(); + return dsm_->local_view->begin(); } private: diff --git a/include/dr/views/transform.hpp b/include/dr/views/transform.hpp index a025a89e01..9c6598c69e 100644 --- a/include/dr/views/transform.hpp +++ b/include/dr/views/transform.hpp @@ -53,7 +53,7 @@ class transform_iterator { bool operator<(iterator other) const noexcept { return iter_ < other.iter_; } - bool operator>(iterator other) const noexcept { return iter_ > iter_; } + bool operator>(iterator other) const noexcept { return iter_ > other.iter_; } bool operator<=(iterator other) const noexcept { return iter_ <= other.iter_; From 3503271729f6148e95f47cea5033fd7dcb8d131e Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Wed, 27 Nov 2024 11:05:14 +0100 Subject: [PATCH 51/68] Make local work with shared memory --- .../matrix_formats/csr_row_distribution.hpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp index 6116d95acd..4e9ad94626 100644 --- a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp @@ -11,7 +11,7 @@ namespace dr::mp { template class csr_row_distribution { - using view_tuple = 
std::tuple; + using view_tuple = std::tuple; public: using value_type = dr::matrix_entry; using segment_type = csr_row_segment; @@ -290,7 +290,9 @@ class csr_row_distribution { } fence(); auto local_rows = rows_data_->segments()[rank].begin().local(); - auto my_tuple = std::make_tuple(rows_data_->segment_size(), segment_size_ * rank, local_rows); + auto offset = val_offsets_[rank]; + auto real_row_size = std::min(rows_data_->segment_size(), shape_.first - rows_data_->segment_size() * rank); + auto my_tuple = std::make_tuple(real_row_size, segment_size_ * rank, offset, local_rows); view_helper_const = alloc.allocate(1); view_helper_const[0] = my_tuple; @@ -316,16 +318,13 @@ class csr_row_distribution { auto transformer = [=](auto x) { auto [entry, tuple] = x; - auto [row_size, offset, local_rows] = tuple; - assert(offset == 0); - assert(local_rows[0] == 0); - assert(row_size == 10); + auto [row_size, row_offset, offset, local_rows] = tuple; auto [index, pair] = entry; auto [val, column] = pair; auto row = rng::distance( local_rows, std::upper_bound(local_rows, local_rows + row_size, offset + index) - - 1); + 1) + row_offset; dr::index index_obj(row, column); value_type entry_obj(index_obj, val); return entry_obj; From 44a6e78c253324885047482c15108dedbec3db7c Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Fri, 29 Nov 2024 12:25:43 +0100 Subject: [PATCH 52/68] Fix device memory when using local in row distribution --- include/dr/mp/algorithms/reduce.hpp | 3 +-- .../matrix_formats/csr_row_distribution.hpp | 7 ++++++- include/dr/mp/sycl_support.hpp | 14 ++++++++++++++ 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/include/dr/mp/algorithms/reduce.hpp b/include/dr/mp/algorithms/reduce.hpp index 166abfd44a..21d5fcbbff 100644 --- a/include/dr/mp/algorithms/reduce.hpp +++ b/include/dr/mp/algorithms/reduce.hpp @@ -35,11 +35,10 @@ inline auto dpl_reduce(rng::forward_range auto &&r, auto &&binary_op) { sycl::known_identity_v, binary_op); } else { 
dr::drlog.debug(" peel 1st value\n"); - auto base = *rng::begin(r); return std::reduce(dpl_policy(), dr::__detail::direct_iterator(rng::begin(r) + 1), dr::__detail::direct_iterator(rng::end(r)), - sycl_get(base), binary_op); + sycl_get_deref(rng::begin(r)), binary_op); } } #else diff --git a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp index 4e9ad94626..5160674a5b 100644 --- a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp @@ -295,7 +295,12 @@ class csr_row_distribution { auto my_tuple = std::make_tuple(real_row_size, segment_size_ * rank, offset, local_rows); view_helper_const = alloc.allocate(1); - view_helper_const[0] = my_tuple; + + if (use_sycl()) { + sycl_queue().memcpy(view_helper_const, &my_tuple, sizeof(view_tuple)).wait(); + } else { + view_helper_const[0] = my_tuple; + } local_view = std::make_shared(get_elem_view(vals_size_, view_helper_const, cols_data_, vals_data_, rank)); } diff --git a/include/dr/mp/sycl_support.hpp b/include/dr/mp/sycl_support.hpp index 791999fe5c..768b3ae795 100644 --- a/include/dr/mp/sycl_support.hpp +++ b/include/dr/mp/sycl_support.hpp @@ -17,6 +17,20 @@ sycl::queue &sycl_queue(); namespace dr::mp::__detail { +//sometimes we only want to dereference iterator inside SYCL +template auto sycl_get_deref(T v) { + using deref_type = decltype(*v); + deref_type temp; + { + sycl::buffer buff(&temp, 1); + sycl_queue().submit([&](auto &&h) { + sycl::accessor access(buff, h, sycl::write_only, sycl::no_init); + h.single_task([=](auto i) { access[0] = *v;}); + }).wait(); + } + return temp; +} + template T sycl_get(T &v) { T temp; sycl_queue().memcpy(&temp, &v, sizeof(v)).wait(); From 2bf503e4e3fe1f8fdae636fbbd460d9b2501725e Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Mon, 2 Dec 2024 09:36:05 +0100 Subject: [PATCH 53/68] Fix local in eq distribution --- 
examples/mp/local_issue.cpp | 73 ------------------- .../matrix_formats/csr_eq_distribution.hpp | 57 +++++++++++++++ .../matrix_formats/csr_eq_segment.hpp | 30 +------- include/dr/mp/sycl_support.hpp | 2 +- test/gtest/mp/reduce.cpp | 27 +++---- 5 files changed, 73 insertions(+), 116 deletions(-) delete mode 100644 examples/mp/local_issue.cpp diff --git a/examples/mp/local_issue.cpp b/examples/mp/local_issue.cpp deleted file mode 100644 index c4619c1ed7..0000000000 --- a/examples/mp/local_issue.cpp +++ /dev/null @@ -1,73 +0,0 @@ -// SPDX-FileCopyrightText: Intel Corporation -// -// SPDX-License-Identifier: BSD-3-Clause - -#include -#include - -namespace mp = dr::mp; - -int main(int argc, char **argv) { - -#ifdef SYCL_LANGUAGE_VERSION - mp::init(sycl::default_selector_v); -#else - mp::init(); -#endif - - dr::views::csr_matrix_view local_data; - auto root = 0; - if (root == dr::mp::default_comm().rank()) { - local_data = dr::generate_band_csr(10, 0, 1); - } - { - mp::distributed_sparse_matrix< - double, long, dr::mp::MpiBackend, - dr::mp::csr_row_distribution> - m_row(local_data, root); - - auto mapper = [] (auto elem) { auto [a, b] = elem; auto [c, d] = a; return c;}; - auto z2 = dr::transform_view(m_row, mapper); - for (auto x: local_segments(m_row)) { - for (auto z : x) - { - auto [a, b] = z; - auto [c, d] = a; - fmt::print("some res {} {} {}\n", b, c, d); - } - } - - auto q = dr::mp::sycl_queue(); - auto sum1 = sycl::malloc_shared(1, q); - auto sum2 = sycl::malloc_shared(1, q); - auto sum3 = sycl::malloc_shared(1, q); - auto local_iter = local_segments(m_row); - for (auto x: local_iter) { - q.submit([=](auto &&h) { - h.parallel_for(sycl::nd_range<1>(1,1), [=](auto item) { - for (auto z : x) - { - auto [a, b] = z; - auto [c, d] = a; - sum1[0] = sum1[0] + b; - sum2[0] = sum2[0] + c; - sum3[0] = sum3[0] + d; - } - - }); - }).wait(); - fmt::print("iter vals {} {} {}\n", sum1[0], sum2[0], sum3[0]); - } - - auto summer = [](auto x, auto y) { return x + y;}; - auto red2 
= dr::mp::reduce(z2, 0, summer); - fmt::print("reduced row {} {}\n", red2, m_row.size()); - } - - if (root == dr::mp::default_comm().rank()) { - dr::__detail::destroy_csr_matrix_view(local_data, std::allocator{}); - } - mp::finalize(); - - return 0; -} diff --git a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp index aa969c773b..c9df25f437 100644 --- a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp @@ -5,11 +5,13 @@ #include #include #include +#include namespace dr::mp { template class csr_eq_distribution { + using view_tuple = std::tuple; public: using value_type = dr::matrix_entry; using segment_type = csr_eq_segment; @@ -32,6 +34,7 @@ class csr_eq_distribution { fence(); if (rows_data_ != nullptr) { rows_backend_.deallocate(rows_data_, row_size_ * sizeof(index_type)); + tuple_alloc.deallocate(view_helper_const, 1); } } } @@ -298,8 +301,62 @@ class csr_eq_distribution { std::min(segment_size_, nnz_ - i), segment_size_); } fence(); + auto local_rows = rows_data_; + auto real_val_size = std::min(vals_data_->segment_size(), nnz_ - vals_data_->segment_size() * rank); + auto my_tuple = std::make_tuple(row_size_, row_offsets_[rank], segment_size_ * rank, local_rows); + view_helper_const = tuple_alloc.allocate(1); + + + if (use_sycl()) { + sycl_queue().memcpy(view_helper_const, &my_tuple, sizeof(view_tuple)).wait(); + } else { + view_helper_const[0] = my_tuple; + } + + auto local_cols = cols_data_->segments()[rank].begin().local(); + auto local_vals = vals_data_->segments()[rank].begin().local(); + local_view = std::make_shared(get_elem_view(real_val_size, view_helper_const, local_cols, local_vals, rank)); } + + static auto get_elem_view( + std::size_t vals_size, + view_tuple* helper_tuple, + index_type *local_cols, + elem_type *local_vals, + std::size_t rank) { + auto local_vals_range = 
rng::subrange(local_vals, local_vals + vals_size); + auto local_cols_range = rng::subrange(local_cols, local_cols + vals_size); + auto zipped_results = rng::views::zip(local_vals_range, local_cols_range); + auto enumerated_zipped = rng::views::enumerate(zipped_results); + // we need to use multiply_view here, + // because lambda is not properly copied to sycl environment + // when we use variable capture + auto multiply_range = dr::__detail::multiply_view(rng::subrange(helper_tuple, helper_tuple + 1), vals_size); + auto enumerted_with_data = rng::views::zip(enumerated_zipped, multiply_range); + + auto transformer = [=](auto x) { + auto [entry, tuple] = x; + auto [row_size, row_offset, offset, local_rows] = tuple; + auto [index, pair] = entry; + auto [val, column] = pair; + auto row = rng::distance( + local_rows, + std::upper_bound(local_rows, local_rows + row_size, offset + index) - + 1) + row_offset; + dr::index index_obj(row, column); + value_type entry_obj(index_obj, val); + return entry_obj; + }; + return rng::transform_view(enumerted_with_data, std::move(transformer)); + } + + using view_type = decltype(get_elem_view(0, nullptr, nullptr, nullptr,0)); + + dr::mp::__detail::allocator tuple_alloc; + view_tuple* view_helper_const; + std::shared_ptr local_view; + std::size_t segment_size_ = 0; std::size_t row_size_ = 0; std::size_t max_row_size_ = 0; diff --git a/include/dr/mp/containers/matrix_formats/csr_eq_segment.hpp b/include/dr/mp/containers/matrix_formats/csr_eq_segment.hpp index f0aa7425e5..ebe9d79a3e 100644 --- a/include/dr/mp/containers/matrix_formats/csr_eq_segment.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_eq_segment.hpp @@ -238,36 +238,8 @@ template class csr_eq_segment_iterator { auto local() const { const auto my_process_segment_index = dsm_->rows_backend_.getrank(); - assert(my_process_segment_index == segment_index_); - // const auto offset = dsm_->row_offsets_[segment_index_]; - // assert(offset == 0); - // auto row_size = 
dsm_->row_size_; - auto segment_size = dsm_->vals_data_->segment_size(); - auto local_vals = dsm_->vals_data_->segments()[segment_index_].begin().local(); - auto local_vals_range = rng::subrange(local_vals, local_vals + segment_size); - auto local_cols = dsm_->cols_data_->segments()[segment_index_].begin().local(); - auto local_cols_range = rng::subrange(local_cols, local_cols + segment_size); - // auto local_rows = dsm_->rows_data_; - auto zipped_results = rng::views::zip(local_vals_range, local_cols_range); - auto enumerated_zipped = rng::views::enumerate(zipped_results); - auto transformer = [=](auto entry) { - // assert(offset == 0); - auto [index, pair] = entry; - auto [val, column] = pair; - auto row = 0; //TODO fix calculating row - it results in segfault - // problem originates from the fact that variables cannot be caputed properly by value - // auto row = rng::distance( - // local_rows, - // std::upper_bound(local_rows, local_rows + row_size, offset) - - // 1); - dr::index index_obj(row, column); - value_type entry_obj(index_obj, val); - return entry_obj; - }; - auto transformed_res = rng::views::transform(enumerated_zipped, transformer); - // static_assert(std::is_same::value); - return transformed_res.begin(); + return dsm_->local_view->begin(); } private: diff --git a/include/dr/mp/sycl_support.hpp b/include/dr/mp/sycl_support.hpp index 768b3ae795..be0275bdfe 100644 --- a/include/dr/mp/sycl_support.hpp +++ b/include/dr/mp/sycl_support.hpp @@ -19,7 +19,7 @@ namespace dr::mp::__detail { //sometimes we only want to dereference iterator inside SYCL template auto sycl_get_deref(T v) { - using deref_type = decltype(*v); + using deref_type = std::remove_reference::type; deref_type temp; { sycl::buffer buff(&temp, 1); diff --git a/test/gtest/mp/reduce.cpp b/test/gtest/mp/reduce.cpp index e663188bbd..b7577c3fee 100644 --- a/test/gtest/mp/reduce.cpp +++ b/test/gtest/mp/reduce.cpp @@ -38,16 +38,17 @@ TYPED_TEST(ReduceMP, RootIterators) { } } -// Example of code 
that should be compiling, but does not, described in issue -// DRA-192 TYPED_TEST(ReduceMP, NotCompiling) { -// dr::mp::distributed_vector r1(10); - -// auto add = [](auto &&elem) { -// return elem + 1; -// }; - -// auto added = dr::mp::views::transform(r1, add); -// auto min = [](double x, double y) { return std::min(x, y); }; -// auto result = dr::mp::reduce(root, added, 1, min); -// EXPECT_EQ(result, 1); -// } +TYPED_TEST(ReduceMP, TransformReduce) { + Ops1 ops(10); + + auto add = [](auto &&elem) { + return elem + 1; + }; + + auto added = dr::mp::views::transform(ops.dist_vec, add); + auto min = [](double x, double y) { return std::min(x, y); }; + auto result = dr::mp::reduce(root, added, 1, min); + if (comm_rank == root) { + EXPECT_EQ(result, 1); + } +} From dc89bc8324735ee976a79910f0a54f2d0065e7e7 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Mon, 2 Dec 2024 09:38:01 +0100 Subject: [PATCH 54/68] Fix formatting --- examples/mp/CMakeLists.txt | 1 - include/dr/detail/multiply_view.hpp | 32 +++++------ .../matrix_formats/csr_eq_distribution.hpp | 55 ++++++++++--------- .../matrix_formats/csr_eq_segment.hpp | 2 +- .../matrix_formats/csr_row_distribution.hpp | 55 +++++++++++-------- .../matrix_formats/csr_row_segment.hpp | 4 +- include/dr/mp/sycl_support.hpp | 12 ++-- test/gtest/mp/reduce.cpp | 4 +- 8 files changed, 86 insertions(+), 79 deletions(-) diff --git a/examples/mp/CMakeLists.txt b/examples/mp/CMakeLists.txt index 1af7dbc45d..54e9db9bd0 100644 --- a/examples/mp/CMakeLists.txt +++ b/examples/mp/CMakeLists.txt @@ -34,7 +34,6 @@ add_mp_example(hello_world) add_mp_example_no_test(sparse_matrix) add_mp_example_no_test(sparse_benchmark) add_mp_example_no_test(sparse_matrix_matrix_mul) -add_mp_example_no_test(local_issue) if(OpenMP_FOUND) add_executable(vector-add-ref vector-add-ref.cpp) diff --git a/include/dr/detail/multiply_view.hpp b/include/dr/detail/multiply_view.hpp index 174f7c2c24..28d124ba50 100644 --- a/include/dr/detail/multiply_view.hpp +++ 
b/include/dr/detail/multiply_view.hpp @@ -13,8 +13,7 @@ namespace dr::__detail { -template -class multiply_iterator { +template class multiply_iterator { public: using value_type = std::iter_value_t; using difference_type = long long; @@ -25,7 +24,8 @@ class multiply_iterator { using iterator_category = std::random_access_iterator_tag; - multiply_iterator(Iter iter, std::size_t len, long long pos) noexcept : iter_(iter), len_(len), pos_(pos) {} + multiply_iterator(Iter iter, std::size_t len, long long pos) noexcept + : iter_(iter), len_(len), pos_(pos) {} multiply_iterator() noexcept = default; ~multiply_iterator() noexcept = default; multiply_iterator(const multiply_iterator &) noexcept = default; @@ -55,13 +55,9 @@ class multiply_iterator { bool operator>(iterator other) const noexcept { return pos_ > other.pos_; } - bool operator<=(iterator other) const noexcept { - return pos_ <= other.pos_; - } + bool operator<=(iterator other) const noexcept { return pos_ <= other.pos_; } - bool operator>=(iterator other) const noexcept { - return pos_ >= other.pos_; - } + bool operator>=(iterator other) const noexcept { return pos_ >= other.pos_; } iterator &operator++() noexcept { ++pos_; @@ -119,22 +115,24 @@ class multiply_iterator { }; template - requires(rng::sized_range) + requires(rng::sized_range) class multiply_view : public rng::view_interface> { public: template multiply_view(R &&r, std::size_t n) : base_(rng::views::all(std::forward(r))), n_(n) {} - auto begin() const { return multiply_iterator(rng::begin(base_), base_.size(), 0); } - - auto end() const { return multiply_iterator(rng::begin(base_), base_.size(), n_ * base_.size()); } + auto begin() const { + return multiply_iterator(rng::begin(base_), base_.size(), 0); + } - auto size() const - { - return rng::size(base_); + auto end() const { + return multiply_iterator(rng::begin(base_), base_.size(), + n_ * base_.size()); } + auto size() const { return rng::size(base_); } + private: V base_; std::size_t n_; @@ 
-143,4 +141,4 @@ class multiply_view : public rng::view_interface> { template multiply_view(R &&r, std::size_t n) -> multiply_view>; -} // namespace dr +} // namespace dr::__detail diff --git a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp index c9df25f437..afc4cb5ff5 100644 --- a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp @@ -3,15 +3,16 @@ // SPDX-License-Identifier: BSD-3-Clause #pragma once #include +#include #include #include -#include namespace dr::mp { template class csr_eq_distribution { - using view_tuple = std::tuple; + using view_tuple = std::tuple; + public: using value_type = dr::matrix_entry; using segment_type = csr_eq_segment; @@ -302,48 +303,52 @@ class csr_eq_distribution { } fence(); auto local_rows = rows_data_; - auto real_val_size = std::min(vals_data_->segment_size(), nnz_ - vals_data_->segment_size() * rank); - auto my_tuple = std::make_tuple(row_size_, row_offsets_[rank], segment_size_ * rank, local_rows); + auto real_val_size = std::min(vals_data_->segment_size(), + nnz_ - vals_data_->segment_size() * rank); + auto my_tuple = std::make_tuple(row_size_, row_offsets_[rank], + segment_size_ * rank, local_rows); view_helper_const = tuple_alloc.allocate(1); - if (use_sycl()) { - sycl_queue().memcpy(view_helper_const, &my_tuple, sizeof(view_tuple)).wait(); + sycl_queue() + .memcpy(view_helper_const, &my_tuple, sizeof(view_tuple)) + .wait(); } else { view_helper_const[0] = my_tuple; } auto local_cols = cols_data_->segments()[rank].begin().local(); auto local_vals = vals_data_->segments()[rank].begin().local(); - local_view = std::make_shared(get_elem_view(real_val_size, view_helper_const, local_cols, local_vals, rank)); + local_view = std::make_shared(get_elem_view( + real_val_size, view_helper_const, local_cols, local_vals, rank)); } - - static auto get_elem_view( - 
std::size_t vals_size, - view_tuple* helper_tuple, - index_type *local_cols, - elem_type *local_vals, - std::size_t rank) { + static auto get_elem_view(std::size_t vals_size, view_tuple *helper_tuple, + index_type *local_cols, elem_type *local_vals, + std::size_t rank) { auto local_vals_range = rng::subrange(local_vals, local_vals + vals_size); auto local_cols_range = rng::subrange(local_cols, local_cols + vals_size); auto zipped_results = rng::views::zip(local_vals_range, local_cols_range); auto enumerated_zipped = rng::views::enumerate(zipped_results); - // we need to use multiply_view here, + // we need to use multiply_view here, // because lambda is not properly copied to sycl environment - // when we use variable capture - auto multiply_range = dr::__detail::multiply_view(rng::subrange(helper_tuple, helper_tuple + 1), vals_size); - auto enumerted_with_data = rng::views::zip(enumerated_zipped, multiply_range); - + // when we use variable capture + auto multiply_range = dr::__detail::multiply_view( + rng::subrange(helper_tuple, helper_tuple + 1), vals_size); + auto enumerted_with_data = + rng::views::zip(enumerated_zipped, multiply_range); + auto transformer = [=](auto x) { auto [entry, tuple] = x; auto [row_size, row_offset, offset, local_rows] = tuple; auto [index, pair] = entry; auto [val, column] = pair; - auto row = rng::distance( - local_rows, - std::upper_bound(local_rows, local_rows + row_size, offset + index) - - 1) + row_offset; + auto row = + rng::distance(local_rows, + std::upper_bound(local_rows, local_rows + row_size, + offset + index) - + 1) + + row_offset; dr::index index_obj(row, column); value_type entry_obj(index_obj, val); return entry_obj; @@ -351,10 +356,10 @@ class csr_eq_distribution { return rng::transform_view(enumerted_with_data, std::move(transformer)); } - using view_type = decltype(get_elem_view(0, nullptr, nullptr, nullptr,0)); + using view_type = decltype(get_elem_view(0, nullptr, nullptr, nullptr, 0)); dr::mp::__detail::allocator 
tuple_alloc; - view_tuple* view_helper_const; + view_tuple *view_helper_const; std::shared_ptr local_view; std::size_t segment_size_ = 0; diff --git a/include/dr/mp/containers/matrix_formats/csr_eq_segment.hpp b/include/dr/mp/containers/matrix_formats/csr_eq_segment.hpp index ebe9d79a3e..175fcd4d69 100644 --- a/include/dr/mp/containers/matrix_formats/csr_eq_segment.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_eq_segment.hpp @@ -235,7 +235,7 @@ template class csr_eq_segment_iterator { return dr::__detail::drop_segments(dsm_->segments(), segment_index_, index_); } - + auto local() const { const auto my_process_segment_index = dsm_->rows_backend_.getrank(); assert(my_process_segment_index == segment_index_); diff --git a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp index 5160674a5b..d179f80a8d 100644 --- a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp @@ -3,15 +3,16 @@ // SPDX-License-Identifier: BSD-3-Clause #pragma once #include +#include #include #include -#include #include namespace dr::mp { template class csr_row_distribution { - using view_tuple = std::tuple; + using view_tuple = std::tuple; + public: using value_type = dr::matrix_entry; using segment_type = csr_row_segment; @@ -291,45 +292,51 @@ class csr_row_distribution { fence(); auto local_rows = rows_data_->segments()[rank].begin().local(); auto offset = val_offsets_[rank]; - auto real_row_size = std::min(rows_data_->segment_size(), shape_.first - rows_data_->segment_size() * rank); - auto my_tuple = std::make_tuple(real_row_size, segment_size_ * rank, offset, local_rows); + auto real_row_size = + std::min(rows_data_->segment_size(), + shape_.first - rows_data_->segment_size() * rank); + auto my_tuple = std::make_tuple(real_row_size, segment_size_ * rank, offset, + local_rows); view_helper_const = alloc.allocate(1); - if 
(use_sycl()) { - sycl_queue().memcpy(view_helper_const, &my_tuple, sizeof(view_tuple)).wait(); + sycl_queue() + .memcpy(view_helper_const, &my_tuple, sizeof(view_tuple)) + .wait(); } else { view_helper_const[0] = my_tuple; } - local_view = std::make_shared(get_elem_view(vals_size_, view_helper_const, cols_data_, vals_data_, rank)); + local_view = std::make_shared(get_elem_view( + vals_size_, view_helper_const, cols_data_, vals_data_, rank)); } - static auto get_elem_view( - std::size_t vals_size, - view_tuple* helper_tuple, - index_type *local_cols, - elem_type *local_vals, - std::size_t rank) { + static auto get_elem_view(std::size_t vals_size, view_tuple *helper_tuple, + index_type *local_cols, elem_type *local_vals, + std::size_t rank) { auto local_vals_range = rng::subrange(local_vals, local_vals + vals_size); auto local_cols_range = rng::subrange(local_cols, local_cols + vals_size); auto zipped_results = rng::views::zip(local_vals_range, local_cols_range); auto enumerated_zipped = rng::views::enumerate(zipped_results); - // we need to use multiply_view here, + // we need to use multiply_view here, // because lambda is not properly copied to sycl environment - // when we use variable capture - auto multiply_range = dr::__detail::multiply_view(rng::subrange(helper_tuple, helper_tuple + 1), vals_size); - auto enumerted_with_data = rng::views::zip(enumerated_zipped, multiply_range); - + // when we use variable capture + auto multiply_range = dr::__detail::multiply_view( + rng::subrange(helper_tuple, helper_tuple + 1), vals_size); + auto enumerted_with_data = + rng::views::zip(enumerated_zipped, multiply_range); + auto transformer = [=](auto x) { auto [entry, tuple] = x; auto [row_size, row_offset, offset, local_rows] = tuple; auto [index, pair] = entry; auto [val, column] = pair; - auto row = rng::distance( - local_rows, - std::upper_bound(local_rows, local_rows + row_size, offset + index) - - 1) + row_offset; + auto row = + rng::distance(local_rows, + 
std::upper_bound(local_rows, local_rows + row_size, + offset + index) - + 1) + + row_offset; dr::index index_obj(row, column); value_type entry_obj(index_obj, val); return entry_obj; @@ -337,10 +344,10 @@ class csr_row_distribution { return rng::transform_view(enumerted_with_data, std::move(transformer)); } - using view_type = decltype(get_elem_view(0, nullptr, nullptr, nullptr,0)); + using view_type = decltype(get_elem_view(0, nullptr, nullptr, nullptr, 0)); dr::mp::__detail::allocator alloc; - view_tuple* view_helper_const; + view_tuple *view_helper_const; std::shared_ptr local_view; std::size_t segment_size_ = 0; diff --git a/include/dr/mp/containers/matrix_formats/csr_row_segment.hpp b/include/dr/mp/containers/matrix_formats/csr_row_segment.hpp index 22b8da5737..1f95986619 100644 --- a/include/dr/mp/containers/matrix_formats/csr_row_segment.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_row_segment.hpp @@ -4,7 +4,6 @@ #pragma once - namespace dr::mp { template class csr_row_segment_iterator; @@ -223,7 +222,7 @@ template class csr_row_segment_iterator { assert(dsm_ != nullptr); return dr::__detail::drop_segments(dsm_->segments(), segment_index_, index_); - } + } auto local() const { const auto my_process_segment_index = dsm_->vals_backend_.getrank(); @@ -232,7 +231,6 @@ template class csr_row_segment_iterator { } private: - // all fields need to be initialized by default ctor so every default // constructed iter is equal to any other default constructed iter DSM *dsm_ = nullptr; diff --git a/include/dr/mp/sycl_support.hpp b/include/dr/mp/sycl_support.hpp index be0275bdfe..33d34159a8 100644 --- a/include/dr/mp/sycl_support.hpp +++ b/include/dr/mp/sycl_support.hpp @@ -17,16 +17,18 @@ sycl::queue &sycl_queue(); namespace dr::mp::__detail { -//sometimes we only want to dereference iterator inside SYCL +// sometimes we only want to dereference iterator inside SYCL template auto sycl_get_deref(T v) { using deref_type = std::remove_reference::type; deref_type 
temp; { sycl::buffer buff(&temp, 1); - sycl_queue().submit([&](auto &&h) { - sycl::accessor access(buff, h, sycl::write_only, sycl::no_init); - h.single_task([=](auto i) { access[0] = *v;}); - }).wait(); + sycl_queue() + .submit([&](auto &&h) { + sycl::accessor access(buff, h, sycl::write_only, sycl::no_init); + h.single_task([=](auto i) { access[0] = *v; }); + }) + .wait(); } return temp; } diff --git a/test/gtest/mp/reduce.cpp b/test/gtest/mp/reduce.cpp index b7577c3fee..c7a00d7323 100644 --- a/test/gtest/mp/reduce.cpp +++ b/test/gtest/mp/reduce.cpp @@ -41,9 +41,7 @@ TYPED_TEST(ReduceMP, RootIterators) { TYPED_TEST(ReduceMP, TransformReduce) { Ops1 ops(10); - auto add = [](auto &&elem) { - return elem + 1; - }; + auto add = [](auto &&elem) { return elem + 1; }; auto added = dr::mp::views::transform(ops.dist_vec, add); auto min = [](double x, double y) { return std::min(x, y); }; From 7e7f2d21a86dd9c65ff6e9a204d9629083d64a59 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Mon, 2 Dec 2024 10:46:35 +0100 Subject: [PATCH 55/68] Reverse change in dr::transform_view --- include/dr/views/transform.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/dr/views/transform.hpp b/include/dr/views/transform.hpp index 9c6598c69e..a379f11f90 100644 --- a/include/dr/views/transform.hpp +++ b/include/dr/views/transform.hpp @@ -109,7 +109,7 @@ class transform_iterator { requires(dr::ranges::__detail::has_local) { auto iter = dr::ranges::__detail::local(iter_); - return transform_iterator(std::move(iter), fn_); + return transform_iterator(iter, fn_); } private: From dd1d6ed8733e59897d4fb1bf1aabdcee48041e96 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Mon, 2 Dec 2024 13:56:37 +0100 Subject: [PATCH 56/68] Fix benchmark when default vector size is small --- benchmarks/gbench/mp/gemv.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/gbench/mp/gemv.cpp b/benchmarks/gbench/mp/gemv.cpp index 514e21e5ac..9be4f3efa3 100644 --- 
a/benchmarks/gbench/mp/gemv.cpp +++ b/benchmarks/gbench/mp/gemv.cpp @@ -147,7 +147,7 @@ std::size_t getWidth() { } } // namespace static auto getMatrix() { - std::size_t n = std::sqrt(default_vector_size / 100000) * 50000; + std::size_t n = std::max(1., std::sqrt(default_vector_size / 100000)) * 50000; // std::size_t n = default_vector_size / 2; std::size_t up = n / 50; std::size_t down = n / 50; From e42cfa22eb31fc1ba1a92f7b2d82ebaee1fa73aa Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Tue, 3 Dec 2024 17:00:14 +0100 Subject: [PATCH 57/68] Fix issue when distributed vector is too small --- .../matrix_formats/csr_eq_distribution.hpp | 16 ++++++++++------ .../containers/matrix_formats/csr_eq_segment.hpp | 3 +++ .../matrix_formats/csr_row_distribution.hpp | 16 +++++++++++----- .../matrix_formats/csr_row_segment.hpp | 3 +++ 4 files changed, 27 insertions(+), 11 deletions(-) diff --git a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp index afc4cb5ff5..38f10688af 100644 --- a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp @@ -301,7 +301,6 @@ class csr_eq_distribution { segments_.emplace_back(this, segment_index++, std::min(segment_size_, nnz_ - i), segment_size_); } - fence(); auto local_rows = rows_data_; auto real_val_size = std::min(vals_data_->segment_size(), nnz_ - vals_data_->segment_size() * rank); @@ -317,10 +316,15 @@ class csr_eq_distribution { view_helper_const[0] = my_tuple; } - auto local_cols = cols_data_->segments()[rank].begin().local(); - auto local_vals = vals_data_->segments()[rank].begin().local(); - local_view = std::make_shared(get_elem_view( - real_val_size, view_helper_const, local_cols, local_vals, rank)); + auto local_cols = static_cast(nullptr); + auto local_vals = static_cast(nullptr); + if (cols_data_->segments().size() > rank) { + local_cols = 
cols_data_->segments()[rank].begin().local(); + local_vals = vals_data_->segments()[rank].begin().local(); + local_view = std::make_shared(get_elem_view( + real_val_size, view_helper_const, local_cols, local_vals, rank)); + } + fence(); } static auto get_elem_view(std::size_t vals_size, view_tuple *helper_tuple, @@ -360,7 +364,7 @@ class csr_eq_distribution { dr::mp::__detail::allocator tuple_alloc; view_tuple *view_helper_const; - std::shared_ptr local_view; + std::shared_ptr local_view = nullptr; std::size_t segment_size_ = 0; std::size_t row_size_ = 0; diff --git a/include/dr/mp/containers/matrix_formats/csr_eq_segment.hpp b/include/dr/mp/containers/matrix_formats/csr_eq_segment.hpp index 175fcd4d69..bc4382810e 100644 --- a/include/dr/mp/containers/matrix_formats/csr_eq_segment.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_eq_segment.hpp @@ -239,6 +239,9 @@ template class csr_eq_segment_iterator { auto local() const { const auto my_process_segment_index = dsm_->rows_backend_.getrank(); assert(my_process_segment_index == segment_index_); + if (dsm_->local_view == nullptr) { + return nullptr; + } return dsm_->local_view->begin(); } diff --git a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp index d179f80a8d..569ccd4b6d 100644 --- a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp @@ -289,8 +289,11 @@ class csr_row_distribution { this, segment_index++, val_sizes_[i], std::max(val_sizes_[i], static_cast(1))); } - fence(); - auto local_rows = rows_data_->segments()[rank].begin().local(); + + auto local_rows = static_cast(nullptr); + if (rows_data_->segments().size() > rank) { + local_rows = rows_data_->segments()[rank].begin().local(); + } auto offset = val_offsets_[rank]; auto real_row_size = std::min(rows_data_->segment_size(), @@ -307,8 +310,11 @@ class csr_row_distribution { 
view_helper_const[0] = my_tuple; } - local_view = std::make_shared(get_elem_view( - vals_size_, view_helper_const, cols_data_, vals_data_, rank)); + if (rows_data_->segments().size() > rank) { + local_view = std::make_shared(get_elem_view( + vals_size_, view_helper_const, cols_data_, vals_data_, rank)); + } + fence(); } static auto get_elem_view(std::size_t vals_size, view_tuple *helper_tuple, @@ -365,6 +371,6 @@ class csr_row_distribution { dr::index shape_; std::size_t nnz_; std::vector segments_; - std::shared_ptr> rows_data_; + std::shared_ptr> rows_data_ = nullptr; }; } // namespace dr::mp diff --git a/include/dr/mp/containers/matrix_formats/csr_row_segment.hpp b/include/dr/mp/containers/matrix_formats/csr_row_segment.hpp index 1f95986619..ad95aa17ec 100644 --- a/include/dr/mp/containers/matrix_formats/csr_row_segment.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_row_segment.hpp @@ -227,6 +227,9 @@ template class csr_row_segment_iterator { auto local() const { const auto my_process_segment_index = dsm_->vals_backend_.getrank(); assert(my_process_segment_index == segment_index_); + if (dsm_->local_view == nullptr) { + return nullptr; + } return dsm_->local_view->begin(); } From 2318a46c0501400db224361fe60cf505a68a8363 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Tue, 3 Dec 2024 17:33:01 +0100 Subject: [PATCH 58/68] Improve performance of eq distribution gather --- .../matrix_formats/csr_eq_distribution.hpp | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp index 38f10688af..f3673f1d89 100644 --- a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp @@ -190,11 +190,22 @@ class csr_eq_distribution { auto first_row = row_offsets_[i]; auto last_row = row_offsets_[i] + row_sizes_[i]; auto row_size = 
row_sizes_[i]; - for (auto j = first_row; j < last_row; j++) { - res[j + k * shape_[0]] += - gathered_res_host[vals_width * current_offset + k * row_size + - j - first_row]; + if (first_row < last_row) { + res[first_row + k * shape_[0]] += + gathered_res_host[vals_width * current_offset + k * row_size]; } + if (first_row < last_row - 1) { + auto piece_start = gathered_res_host + vals_width * current_offset + + k * row_size + 1; + std::copy(piece_start, piece_start + last_row - first_row - 1, + res.begin() + first_row + k * shape_[0] + 1); + } + // for (auto j = first_row; j < last_row; j++) { + // res[j + k * shape_[0]] += + // gathered_res_host[vals_width * current_offset + k * row_size + // + + // j - first_row]; + // } current_offset += row_sizes_[i]; } } From 4cfb11016ae4788136f4edc87b2b807bd7864432 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Tue, 3 Dec 2024 18:13:42 +0100 Subject: [PATCH 59/68] Remove unnecessary comment --- .../dr/mp/containers/matrix_formats/csr_eq_distribution.hpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp index f3673f1d89..aeb0461115 100644 --- a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp @@ -200,12 +200,6 @@ class csr_eq_distribution { std::copy(piece_start, piece_start + last_row - first_row - 1, res.begin() + first_row + k * shape_[0] + 1); } - // for (auto j = first_row; j < last_row; j++) { - // res[j + k * shape_[0]] += - // gathered_res_host[vals_width * current_offset + k * row_size - // + - // j - first_row]; - // } current_offset += row_sizes_[i]; } } From 04191d768e7dfa84b7f04e65308a66c17a7daede Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Mon, 9 Dec 2024 10:32:02 +0100 Subject: [PATCH 60/68] Add test for reduce and fix type error in sparse matrix local ---
.../matrix_formats/csr_eq_segment.hpp | 2 +- .../matrix_formats/csr_row_segment.hpp | 2 +- test/gtest/mp/sparse_matrix.cpp | 39 +++++++++++++++++++ 3 files changed, 41 insertions(+), 2 deletions(-) diff --git a/include/dr/mp/containers/matrix_formats/csr_eq_segment.hpp b/include/dr/mp/containers/matrix_formats/csr_eq_segment.hpp index bc4382810e..2caa8af843 100644 --- a/include/dr/mp/containers/matrix_formats/csr_eq_segment.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_eq_segment.hpp @@ -240,7 +240,7 @@ template class csr_eq_segment_iterator { const auto my_process_segment_index = dsm_->rows_backend_.getrank(); assert(my_process_segment_index == segment_index_); if (dsm_->local_view == nullptr) { - return nullptr; + throw std::runtime_error("Requesting not existing local segment"); } return dsm_->local_view->begin(); } diff --git a/include/dr/mp/containers/matrix_formats/csr_row_segment.hpp b/include/dr/mp/containers/matrix_formats/csr_row_segment.hpp index ad95aa17ec..ce0f627e3e 100644 --- a/include/dr/mp/containers/matrix_formats/csr_row_segment.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_row_segment.hpp @@ -228,7 +228,7 @@ template class csr_row_segment_iterator { const auto my_process_segment_index = dsm_->vals_backend_.getrank(); assert(my_process_segment_index == segment_index_); if (dsm_->local_view == nullptr) { - return nullptr; + throw std::runtime_error("Requesting not existing local segment"); } return dsm_->local_view->begin(); } diff --git a/test/gtest/mp/sparse_matrix.cpp b/test/gtest/mp/sparse_matrix.cpp index 133739d730..0f98b0cb48 100644 --- a/test/gtest/mp/sparse_matrix.cpp +++ b/test/gtest/mp/sparse_matrix.cpp @@ -14,6 +14,23 @@ auto testMatrixIter(auto &src, auto &matrix) { } } +auto testMatrixReduce(auto &src, auto &matrix) { + EXPECT_TRUE(src.size() == matrix.size()); + long sum = 0; + for (auto [index, val] : src) { + auto [x, y] = index; + sum += (long)(val + x + y); + } + auto transformer = [](auto entry) { + auto [index, 
val] = entry; + auto [x, y] = index; + return (long)(val + x + y); + }; + auto transformed = dr::transform_view(matrix, transformer); + long reduced = dr::mp::reduce(transformed, 0, std::plus{}); + EXPECT_TRUE((sum == reduced)); +} + TEST(SparseMatrix, staticAssertEq) { std::size_t m = 100; std::size_t k = 100; @@ -73,3 +90,25 @@ TEST(SparseMatrix, IterEq) { a(csr, 0); testMatrixIter(csr, a); } + +TEST(SparseMatrix, ReduceRow) { + std::size_t m = 100; + std::size_t k = 100; + auto csr = dr::generate_random_csr({m, k}, 0.1f); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_row_distribution> + a(csr, 0); + testMatrixReduce(csr, a); +} + +TEST(SparseMatrix, ReduceEq) { + std::size_t m = 100; + std::size_t k = 100; + auto csr = dr::generate_random_csr({m, k}, 0.1f); + dr::mp::distributed_sparse_matrix< + float, unsigned long, dr::mp::MpiBackend, + dr::mp::csr_eq_distribution> + a(csr, 0); + testMatrixReduce(csr, a); +} From adad4f7bd1f148d90caf9212a152edfd62c87537 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Mon, 9 Dec 2024 11:48:21 +0100 Subject: [PATCH 61/68] Add broadcast_vector tests --- test/gtest/mp/broadcasted_vector.cpp | 72 ++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 test/gtest/mp/broadcasted_vector.cpp diff --git a/test/gtest/mp/broadcasted_vector.cpp b/test/gtest/mp/broadcasted_vector.cpp new file mode 100644 index 0000000000..55da9fedf1 --- /dev/null +++ b/test/gtest/mp/broadcasted_vector.cpp @@ -0,0 +1,72 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "xp-tests.hpp" + +TEST(BroadcastedVector, BroadcastData) { + std::size_t n = 100; + auto rank = dr::mp::default_comm().rank(); + std::vector data(n); + if (rank == 0) { + for (int i = 0; i < n; i++) { + data[i] = i; + } + } + dr::mp::broadcasted_vector broadcasted; + if (rank == 0) { + broadcasted.broadcast_data(n, 0, data, + dr::mp::default_comm()); + } + else 
{ + broadcasted.broadcast_data(n, 0, rng::empty_view(), + dr::mp::default_comm()); + } + + std::vector ref(n); + for (int i = 0; i < n; i++) { + ref[i] = i; + } + + EXPECT_EQ(rng::subrange(broadcasted.broadcasted_data(), broadcasted.broadcasted_data() + n), ref); + broadcasted.destroy_data(); +} + +TEST(BroadcastedVector, BroadcastDataReuse) { + std::size_t n = 100; + auto rank = dr::mp::default_comm().rank(); + std::vector data(n); + if (rank == 0) { + for (int i = 0; i < n; i++) { + data[i] = i; + } + } + dr::mp::broadcasted_vector broadcasted; + if (rank == 0) { + broadcasted.broadcast_data(n, 0, data, + dr::mp::default_comm()); + } + else { + broadcasted.broadcast_data(n, 0, rng::empty_view(), + dr::mp::default_comm()); + } + + std::vector ref(n); + for (int i = 0; i < n; i++) { + ref[i] = i; + } + + EXPECT_EQ(rng::subrange(broadcasted.broadcasted_data(), broadcasted.broadcasted_data() + n), ref); + broadcasted.destroy_data(); + EXPECT_EQ(broadcasted.broadcasted_data(), nullptr); + if (rank == 0) { + broadcasted.broadcast_data(n, 0, data, + dr::mp::default_comm()); + } + else { + broadcasted.broadcast_data(n, 0, rng::empty_view(), + dr::mp::default_comm()); + } + EXPECT_EQ(rng::subrange(broadcasted.broadcasted_data(), broadcasted.broadcasted_data() + n), ref); + broadcasted.destroy_data(); +} \ No newline at end of file From f17243bc1d873c248f89503f0f8d88daa2f49be9 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Mon, 9 Dec 2024 11:57:09 +0100 Subject: [PATCH 62/68] Fix formatting --- test/gtest/mp/broadcasted_vector.cpp | 50 ++++++++++++++-------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/test/gtest/mp/broadcasted_vector.cpp b/test/gtest/mp/broadcasted_vector.cpp index 55da9fedf1..a8e27289f0 100644 --- a/test/gtest/mp/broadcasted_vector.cpp +++ b/test/gtest/mp/broadcasted_vector.cpp @@ -7,28 +7,28 @@ TEST(BroadcastedVector, BroadcastData) { std::size_t n = 100; auto rank = dr::mp::default_comm().rank(); - std::vector data(n); + 
std::vector data(n); if (rank == 0) { for (int i = 0; i < n; i++) { - data[i] = i; + data[i] = i; } } dr::mp::broadcasted_vector broadcasted; if (rank == 0) { - broadcasted.broadcast_data(n, 0, data, - dr::mp::default_comm()); - } - else { + broadcasted.broadcast_data(n, 0, data, dr::mp::default_comm()); + } else { broadcasted.broadcast_data(n, 0, rng::empty_view(), - dr::mp::default_comm()); + dr::mp::default_comm()); } - + std::vector ref(n); for (int i = 0; i < n; i++) { ref[i] = i; } - EXPECT_EQ(rng::subrange(broadcasted.broadcasted_data(), broadcasted.broadcasted_data() + n), ref); + EXPECT_EQ(rng::subrange(broadcasted.broadcasted_data(), + broadcasted.broadcasted_data() + n), + ref); broadcasted.destroy_data(); } @@ -38,35 +38,35 @@ TEST(BroadcastedVector, BroadcastDataReuse) { std::vector data(n); if (rank == 0) { for (int i = 0; i < n; i++) { - data[i] = i; + data[i] = i; } } dr::mp::broadcasted_vector broadcasted; if (rank == 0) { - broadcasted.broadcast_data(n, 0, data, - dr::mp::default_comm()); - } - else { + broadcasted.broadcast_data(n, 0, data, dr::mp::default_comm()); + } else { broadcasted.broadcast_data(n, 0, rng::empty_view(), - dr::mp::default_comm()); + dr::mp::default_comm()); } - + std::vector ref(n); for (int i = 0; i < n; i++) { ref[i] = i; } - EXPECT_EQ(rng::subrange(broadcasted.broadcasted_data(), broadcasted.broadcasted_data() + n), ref); + EXPECT_EQ(rng::subrange(broadcasted.broadcasted_data(), + broadcasted.broadcasted_data() + n), + ref); broadcasted.destroy_data(); EXPECT_EQ(broadcasted.broadcasted_data(), nullptr); - if (rank == 0) { - broadcasted.broadcast_data(n, 0, data, - dr::mp::default_comm()); - } - else { + if (rank == 0) { + broadcasted.broadcast_data(n, 0, data, dr::mp::default_comm()); + } else { broadcasted.broadcast_data(n, 0, rng::empty_view(), - dr::mp::default_comm()); + dr::mp::default_comm()); } - EXPECT_EQ(rng::subrange(broadcasted.broadcasted_data(), broadcasted.broadcasted_data() + n), ref); + 
EXPECT_EQ(rng::subrange(broadcasted.broadcasted_data(), + broadcasted.broadcasted_data() + n), + ref); broadcasted.destroy_data(); -} \ No newline at end of file +} From f1639b0bff127e32ba121f7446ec4bdd5b65fefc Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Mon, 13 Jan 2025 11:26:48 +0100 Subject: [PATCH 63/68] Corrected gemv matrix creation --- benchmarks/gbench/mp/gemv.cpp | 142 ++++------------------------------ 1 file changed, 13 insertions(+), 129 deletions(-) diff --git a/benchmarks/gbench/mp/gemv.cpp b/benchmarks/gbench/mp/gemv.cpp index 9be4f3efa3..3da9d73e99 100644 --- a/benchmarks/gbench/mp/gemv.cpp +++ b/benchmarks/gbench/mp/gemv.cpp @@ -25,141 +25,26 @@ int comm_size; namespace mp = dr::mp; -#ifdef STANDALONE_BENCHMARK -int main(int argc, char **argv) { - - MPI_Init(&argc, &argv); - comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm, &comm_rank); - MPI_Comm_size(comm, &comm_size); - - if (argc != 3 && argc != 5) { - fmt::print( - "usage: ./sparse_benchmark [test outcome dir] [matrix market file], or " - "./sparse_benchmark [test outcome dir] [number of rows] [number of " - "columns] [number of lower bands] [number of upper bands]\n"); - return 1; - } - -#ifdef SYCL_LANGUAGE_VERSION - sycl::queue q = dr::mp::select_queue(); - mp::init(q); -#else - mp::init(); -#endif - dr::views::csr_matrix_view local_data; - std::stringstream filenamestream; - auto root = 0; - auto computeSize = dr::mp::default_comm().size(); - if (root == dr::mp::default_comm().rank()) { - if (argc == 5) { - fmt::print("started loading\n"); - auto n = std::stoul(argv[2]); - auto up = std::stoul(argv[3]); - auto down = std::stoul(argv[4]); - local_data = dr::generate_band_csr(n, up, down); - filenamestream << "mp_band_" << computeSize << "_" << n << "_" - << up + down << "_" << local_data.size(); - fmt::print("finished loading\n"); - } else { - fmt::print("started loading\n"); - std::string fname(argv[2]); - std::filesystem::path p(argv[2]); - local_data = dr::read_csr(fname); - 
filenamestream << "mp_" << p.stem().string() << "_" << computeSize << "_" - << local_data.size(); - fmt::print("finished loading\n"); - } - } - std::string resname; - mp::distributed_sparse_matrix< - double, long, dr::mp::MpiBackend, - dr::mp::csr_eq_distribution> - m_eq(local_data, root); - mp::distributed_sparse_matrix< - double, long, dr::mp::MpiBackend, - dr::mp::csr_row_distribution> - m_row(local_data, root); - fmt::print("finished distribution\n"); - std::vector eq_duration; - std::vector row_duration; - - auto N = 10; - std::vector b; - b.reserve(m_row.shape().second); - std::vector res(m_row.shape().first); - for (auto i = 0; i < m_row.shape().second; i++) { - b.push_back(i); - } - - dr::mp::broadcasted_vector allocated_b; - allocated_b.broadcast_data(m_row.shape().second, 0, b, - dr::mp::default_comm()); - - fmt::print("started initial gemv distribution\n"); - gemv(0, res, m_eq, allocated_b); // it is here to prepare sycl for work - - fmt::print("finished initial gemv distribution\n"); - for (auto i = 0; i < N; i++) { - auto begin = std::chrono::high_resolution_clock::now(); - gemv(0, res, m_eq, allocated_b); - auto end = std::chrono::high_resolution_clock::now(); - double duration = std::chrono::duration(end - begin).count() * 1000; - eq_duration.push_back(duration); - } - - gemv(0, res, m_row, allocated_b); // it is here to prepare sycl for work - for (auto i = 0; i < N; i++) { - auto begin = std::chrono::high_resolution_clock::now(); - gemv(0, res, m_row, allocated_b); - auto end = std::chrono::high_resolution_clock::now(); - double duration = std::chrono::duration(end - begin).count() * 1000; - row_duration.push_back(duration); - } - - if (root == dr::mp::default_comm().rank()) { - std::string tmp; - filenamestream >> tmp; - std::filesystem::path p(argv[1]); - p += tmp; - p += ".csv"; - std::ofstream write_stream(p.string()); - write_stream << eq_duration.front(); - for (auto i = 1; i < N; i++) { - write_stream << "," << eq_duration[i]; - } - 
write_stream << "\n"; - write_stream << row_duration.front(); - for (auto i = 1; i < N; i++) { - write_stream << "," << row_duration[i]; - } - write_stream << "\n"; - } - allocated_b.destroy_data(); - mp::finalize(); -} - -#else - namespace { std::size_t getWidth() { return 8; // default_vector_size / 100000; } } // namespace static auto getMatrix() { - std::size_t n = std::max(1., std::sqrt(default_vector_size / 100000)) * 50000; - // std::size_t n = default_vector_size / 2; - std::size_t up = n / 50; - std::size_t down = n / 50; - // assert(dr::mp::use_sycl()); - // assert(dr::mp::sycl_mem_kind() == sycl::usm::alloc::device); - return dr::generate_band_csr(n, up, down); + // size below is useful when testing weak scaling with default vector size using dr-bench + // it creates matrix which non-zero element count increases linearly when we increase default_vector_size + // std::size_t n = std::max(1., std::sqrt(default_vector_size / 100000)) * 50000; + + std::size_t density_scalar = 50; - // return dr::read_csr("/home/komarmik/examples/soc-LiveJournal1.mtx"); return - // dr::read_csr("/home/komarmik/examples/mycielskian18.mtx"); - // return dr::read_csr("/home/komarmik/examples/mawi_201512020030.mtx"); + std::size_t n = std::max(1., std::sqrt(default_vector_size * density_scalar / 2)); + + std::size_t up = n / density_scalar; + std::size_t down = n / density_scalar; + fmt::print("Generate matrix"); + auto tmp = dr::generate_band_csr(n, up, down); + fmt::print("generated!"); + return tmp; } static void GemvEq_DR(benchmark::State &state) { @@ -315,4 +200,3 @@ DR_BENCHMARK(GemvEq_Reference); DR_BENCHMARK(GemvRow_Reference); -#endif From 3dfdac02e9f9af3dc85cecd6110cc373f60b7e68 Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Mon, 13 Jan 2025 11:28:00 +0100 Subject: [PATCH 64/68] Fix formatting --- benchmarks/gbench/mp/gemv.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/benchmarks/gbench/mp/gemv.cpp b/benchmarks/gbench/mp/gemv.cpp 
index 3da9d73e99..ee55146160 100644 --- a/benchmarks/gbench/mp/gemv.cpp +++ b/benchmarks/gbench/mp/gemv.cpp @@ -31,13 +31,15 @@ std::size_t getWidth() { } } // namespace static auto getMatrix() { - // size below is useful when testing weak scaling with default vector size using dr-bench - // it creates matrix which non-zero element count increases linearly when we increase default_vector_size - // std::size_t n = std::max(1., std::sqrt(default_vector_size / 100000)) * 50000; + // size below is useful when testing weak scaling with default vector size + // using dr-bench it creates matrix which non-zero element count increases + // linearly when we increase default_vector_size std::size_t n = std::max(1., + // std::sqrt(default_vector_size / 100000)) * 50000; std::size_t density_scalar = 50; - std::size_t n = std::max(1., std::sqrt(default_vector_size * density_scalar / 2)); + std::size_t n = + std::max(1., std::sqrt(default_vector_size * density_scalar / 2)); std::size_t up = n / density_scalar; std::size_t down = n / density_scalar; @@ -199,4 +201,3 @@ static void GemvRow_Reference(benchmark::State &state) { DR_BENCHMARK(GemvEq_Reference); DR_BENCHMARK(GemvRow_Reference); - From 818e848038d0f728d02e272ceaff261eb481f03c Mon Sep 17 00:00:00 2001 From: Mikolaj Komar Date: Fri, 17 Jan 2025 12:02:12 +0100 Subject: [PATCH 65/68] Fixed PR comments --- benchmarks/gbench/mp/gemv.cpp | 11 ----------- examples/mp/CMakeLists.txt | 10 +++++----- include/dr/detail/communicator.hpp | 15 +++++++-------- include/dr/detail/generate_csr.hpp | 6 ++++++ include/dr/detail/index.hpp | 1 - include/dr/detail/matrix_entry.hpp | 1 + include/dr/mp/algorithms/equal.hpp | 8 ++------ include/dr/mp/algorithms/reduce.hpp | 7 +++++++ .../matrix_formats/csr_row_distribution.hpp | 4 ++-- 9 files changed, 30 insertions(+), 33 deletions(-) diff --git a/benchmarks/gbench/mp/gemv.cpp b/benchmarks/gbench/mp/gemv.cpp index ee55146160..baffcbd608 100644 --- a/benchmarks/gbench/mp/gemv.cpp +++ 
b/benchmarks/gbench/mp/gemv.cpp @@ -10,19 +10,8 @@ #include #include #include - -#ifdef STANDALONE_BENCHMARK - -MPI_Comm comm; -int comm_rank; -int comm_size; - -#else - #include "../common/dr_bench.hpp" -#endif - namespace mp = dr::mp; namespace { diff --git a/examples/mp/CMakeLists.txt b/examples/mp/CMakeLists.txt index 54e9db9bd0..61ca9e509f 100644 --- a/examples/mp/CMakeLists.txt +++ b/examples/mp/CMakeLists.txt @@ -16,17 +16,17 @@ add_executable(vector-add vector-add.cpp) target_link_libraries(vector-add DR::mpi) add_mp_ctest(TEST_NAME vector-add NAME vector-add NPROC 2) -function(add_mp_example example_name) +function(add_mp_example_no_test example_name) add_executable(${example_name} ${example_name}.cpp) target_link_libraries(${example_name} cxxopts DR::mpi) - add_mp_ctest(TEST_NAME ${example_name} NAME ${example_name} NPROC 2) endfunction() -function(add_mp_example_no_test example_name) - add_executable(${example_name} ${example_name}.cpp) - target_link_libraries(${example_name} cxxopts DR::mpi) +function(add_mp_example example_name) + add_mp_example_no_test(${example_name}) + add_mp_ctest(TEST_NAME ${example_name} NAME ${example_name} NPROC 2) endfunction() + add_mp_example(stencil-1d) add_mp_example(stencil-1d-array) add_mp_example(stencil-1d-pointer) diff --git a/include/dr/detail/communicator.hpp b/include/dr/detail/communicator.hpp index 42bbe9c1a8..6b0c9dbdbf 100644 --- a/include/dr/detail/communicator.hpp +++ b/include/dr/detail/communicator.hpp @@ -61,18 +61,17 @@ class communicator { mpi_comm_); } - template - void gather_typed(const T *src, T *dst, std::size_t count, - std::size_t root) const { - MPI_Gather_c(src, count * sizeof(T), MPI_BYTE, dst, count * sizeof(T), - MPI_BYTE, root, mpi_comm_); - } - void gather(const void *src, void *dst, std::size_t count, std::size_t root) const { MPI_Gather_c(src, count, MPI_BYTE, dst, count, MPI_BYTE, root, mpi_comm_); } + template + void gather(const T *src, T *dst, std::size_t count, + std::size_t root) 
const { + gather((void*)src, (void*)dst, count * sizeof(T),root); + } + template void gather(const T &src, std::span dst, std::size_t root) const { assert(rng::size(dst) >= size_); @@ -112,7 +111,7 @@ class communicator { i_all_gather(&src, rng::data(dst), 1, req); } - void gatherv(const void *src, long long *counts, long *offsets, void *dst, + void gatherv(const void *src, MPI_Count *counts, MPI_Aint *offsets, void *dst, std::size_t root) const { MPI_Gatherv_c(src, counts[rank()], MPI_BYTE, dst, counts, offsets, MPI_BYTE, root, mpi_comm_); diff --git a/include/dr/detail/generate_csr.hpp b/include/dr/detail/generate_csr.hpp index 2917028b6b..c8ef7202a2 100644 --- a/include/dr/detail/generate_csr.hpp +++ b/include/dr/detail/generate_csr.hpp @@ -34,6 +34,9 @@ struct pair_hash { } // namespace +// it returns matrix view of randomly generated matrix +// the memory is owned by the view, so it needs to be released using +// destroy_csr_matrix_view template auto generate_random_csr(dr::index shape, double density = 0.01, unsigned int seed = 0) { @@ -99,6 +102,9 @@ auto generate_random_csr(dr::index shape, double density = 0.01, return dr::views::csr_matrix_view(values, rowptr, colind, shape, nnz, 0); } +// it returns matrix view of band matrix +// the memory is owned by the view, so it needs to be released using +// destroy_csr_matrix_view template auto generate_band_csr(I size, std::size_t up_band = 3, std::size_t down_band = 3) { diff --git a/include/dr/detail/index.hpp b/include/dr/detail/index.hpp index f36e798363..3c496e8a57 100644 --- a/include/dr/detail/index.hpp +++ b/include/dr/detail/index.hpp @@ -5,7 +5,6 @@ #pragma once #include -#include #include #include #include diff --git a/include/dr/detail/matrix_entry.hpp b/include/dr/detail/matrix_entry.hpp index bffe596654..10340ff4da 100644 --- a/include/dr/detail/matrix_entry.hpp +++ b/include/dr/detail/matrix_entry.hpp @@ -7,6 +7,7 @@ #include #include #include +#include #include diff --git 
a/include/dr/mp/algorithms/equal.hpp b/include/dr/mp/algorithms/equal.hpp index 3901a10483..0f4039cae7 100644 --- a/include/dr/mp/algorithms/equal.hpp +++ b/include/dr/mp/algorithms/equal.hpp @@ -26,12 +26,8 @@ bool equal(std::size_t root, bool root_provided, R1 &&r1, R2 &&r2) { }; auto zipped_views = views::zip(r1, r2); - - // we are using mp::transform instead of mp::views::transform due to - // compilation error refer to DRA-192 and test/gtest/mp/reduce.cpp - mp::distributed_vector compared(rng::distance(r1)); - mp::transform(zipped_views, compared.begin(), compare); - + auto compared = dr::mp::views::transform(zipped_views, compare); + auto min = [](double x, double y) { return std::min(x, y); }; if (root_provided) { auto result = mp::reduce(root, compared, 1, min); diff --git a/include/dr/mp/algorithms/reduce.hpp b/include/dr/mp/algorithms/reduce.hpp index 21d5fcbbff..4b43a417b0 100644 --- a/include/dr/mp/algorithms/reduce.hpp +++ b/include/dr/mp/algorithms/reduce.hpp @@ -39,6 +39,13 @@ inline auto dpl_reduce(rng::forward_range auto &&r, auto &&binary_op) { dr::__detail::direct_iterator(rng::begin(r) + 1), dr::__detail::direct_iterator(rng::end(r)), sycl_get_deref(rng::begin(r)), binary_op); + // We are not using below code, because we don't want to dereference rng::begin(r) + // beyond SYCL environment - the * operator may require complex operation that + // relies on GPU memory access (for example transform view iterator) + // return std::reduce(dpl_policy(), + // dr::__detail::direct_iterator(rng::begin(r) + 1), + // dr::__detail::direct_iterator(rng::end(r)), + // sycl_get(*rng::begin(r)), binary_op); } } #else diff --git a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp index 569ccd4b6d..2313abedb7 100644 --- a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp @@ -164,7 +164,7 @@ class 
csr_row_distribution { if (communicator.rank() == root) { auto scratch = alloc.allocate(segment_size_ * communicator.size() * vals_width); - communicator.gather_typed(partial_res, scratch, + communicator.gather(partial_res, scratch, segment_size_ * vals_width, root); T *temp = nullptr; if (use_sycl()) { @@ -198,7 +198,7 @@ class csr_row_distribution { alloc.deallocate(scratch, segment_size_ * communicator.size() * vals_width); } else { - communicator.gather_typed(partial_res, static_cast(nullptr), + communicator.gather(partial_res, static_cast(nullptr), segment_size_ * vals_width, root); } } From 5a70436c0d805e03090fa492f4c287aa8edf6100 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Komar?= Date: Fri, 17 Jan 2025 11:25:27 +0000 Subject: [PATCH 66/68] Fix formatting --- benchmarks/gbench/mp/gemv.cpp | 2 +- include/dr/detail/communicator.hpp | 5 ++--- include/dr/detail/matrix_entry.hpp | 2 +- include/dr/mp/algorithms/equal.hpp | 2 +- include/dr/mp/algorithms/reduce.hpp | 8 ++++---- .../mp/containers/matrix_formats/csr_row_distribution.hpp | 6 +++--- 6 files changed, 12 insertions(+), 13 deletions(-) diff --git a/benchmarks/gbench/mp/gemv.cpp b/benchmarks/gbench/mp/gemv.cpp index baffcbd608..44216cbcc5 100644 --- a/benchmarks/gbench/mp/gemv.cpp +++ b/benchmarks/gbench/mp/gemv.cpp @@ -4,13 +4,13 @@ #include "mpi.h" +#include "../common/dr_bench.hpp" #include "dr/mp.hpp" #include #include #include #include #include -#include "../common/dr_bench.hpp" namespace mp = dr::mp; diff --git a/include/dr/detail/communicator.hpp b/include/dr/detail/communicator.hpp index 6b0c9dbdbf..b21e35d8a5 100644 --- a/include/dr/detail/communicator.hpp +++ b/include/dr/detail/communicator.hpp @@ -67,9 +67,8 @@ class communicator { } template - void gather(const T *src, T *dst, std::size_t count, - std::size_t root) const { - gather((void*)src, (void*)dst, count * sizeof(T),root); + void gather(const T *src, T *dst, std::size_t count, std::size_t root) const { + gather((void *)src, 
(void *)dst, count * sizeof(T), root); } template diff --git a/include/dr/detail/matrix_entry.hpp b/include/dr/detail/matrix_entry.hpp index 10340ff4da..2875b35d9b 100644 --- a/include/dr/detail/matrix_entry.hpp +++ b/include/dr/detail/matrix_entry.hpp @@ -4,10 +4,10 @@ #pragma once +#include #include #include #include -#include #include diff --git a/include/dr/mp/algorithms/equal.hpp b/include/dr/mp/algorithms/equal.hpp index 0f4039cae7..b2b6e278fc 100644 --- a/include/dr/mp/algorithms/equal.hpp +++ b/include/dr/mp/algorithms/equal.hpp @@ -27,7 +27,7 @@ bool equal(std::size_t root, bool root_provided, R1 &&r1, R2 &&r2) { auto zipped_views = views::zip(r1, r2); auto compared = dr::mp::views::transform(zipped_views, compare); - + auto min = [](double x, double y) { return std::min(x, y); }; if (root_provided) { auto result = mp::reduce(root, compared, 1, min); diff --git a/include/dr/mp/algorithms/reduce.hpp b/include/dr/mp/algorithms/reduce.hpp index 4b43a417b0..dd2a16dd68 100644 --- a/include/dr/mp/algorithms/reduce.hpp +++ b/include/dr/mp/algorithms/reduce.hpp @@ -39,10 +39,10 @@ inline auto dpl_reduce(rng::forward_range auto &&r, auto &&binary_op) { dr::__detail::direct_iterator(rng::begin(r) + 1), dr::__detail::direct_iterator(rng::end(r)), sycl_get_deref(rng::begin(r)), binary_op); - // We are not using below code, because we don't want to dereference rng::begin(r) - // beyond SYCL environment - the * operator may require complex operation that - // relies on GPU memory access (for example transform view iterator) - // return std::reduce(dpl_policy(), + // We are not using below code, because we don't want to dereference + // rng::begin(r) beyond SYCL environment - the * operator may require + // complex operation that relies on GPU memory access (for example + // transform view iterator) return std::reduce(dpl_policy(), // dr::__detail::direct_iterator(rng::begin(r) + 1), // dr::__detail::direct_iterator(rng::end(r)), // sycl_get(*rng::begin(r)), binary_op); 
diff --git a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp index 2313abedb7..70fd7f02d7 100644 --- a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp +++ b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp @@ -164,8 +164,8 @@ class csr_row_distribution { if (communicator.rank() == root) { auto scratch = alloc.allocate(segment_size_ * communicator.size() * vals_width); - communicator.gather(partial_res, scratch, - segment_size_ * vals_width, root); + communicator.gather(partial_res, scratch, segment_size_ * vals_width, + root); T *temp = nullptr; if (use_sycl()) { temp = new T[res.size()]; @@ -199,7 +199,7 @@ class csr_row_distribution { segment_size_ * communicator.size() * vals_width); } else { communicator.gather(partial_res, static_cast(nullptr), - segment_size_ * vals_width, root); + segment_size_ * vals_width, root); } } void init(dr::views::csr_matrix_view csr_view, auto dist, From eb66f7a924bfe43255414eff233d81c4799d74a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Komar?= Date: Fri, 17 Jan 2025 11:33:02 +0000 Subject: [PATCH 67/68] Fixed format 2 --- examples/mp/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/mp/CMakeLists.txt b/examples/mp/CMakeLists.txt index 61ca9e509f..a8180838b3 100644 --- a/examples/mp/CMakeLists.txt +++ b/examples/mp/CMakeLists.txt @@ -26,7 +26,6 @@ function(add_mp_example example_name) add_mp_ctest(TEST_NAME ${example_name} NAME ${example_name} NPROC 2) endfunction() - add_mp_example(stencil-1d) add_mp_example(stencil-1d-array) add_mp_example(stencil-1d-pointer) From f9bbc1df179486ac7855643fa60102e4aca58ff7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Komar?= Date: Fri, 17 Jan 2025 12:36:50 +0000 Subject: [PATCH 68/68] Fix gather call --- include/dr/detail/communicator.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/dr/detail/communicator.hpp 
b/include/dr/detail/communicator.hpp index b21e35d8a5..d6fa99ffb4 100644 --- a/include/dr/detail/communicator.hpp +++ b/include/dr/detail/communicator.hpp @@ -74,7 +74,7 @@ class communicator { template void gather(const T &src, std::span dst, std::size_t root) const { assert(rng::size(dst) >= size_); - gather(&src, rng::data(dst), sizeof(T), root); + gather(&src, rng::data(dst), 1, root); } template