diff --git a/benchmarks/gbench/mp/CMakeLists.txt b/benchmarks/gbench/mp/CMakeLists.txt
index c3ae2f682c..adbf32e34e 100644
--- a/benchmarks/gbench/mp/CMakeLists.txt
+++ b/benchmarks/gbench/mp/CMakeLists.txt
@@ -15,6 +15,7 @@ add_executable(
   ../common/stream.cpp
   streammp.cpp
   rooted.cpp
+  gemv.cpp
   stencil_1d.cpp
   stencil_2d.cpp
   chunk.cpp
@@ -41,7 +42,7 @@ endif()
 # mp-quick-bench is for development. By reducing the number of source files, it
 # builds much faster. Change the source files to match what you need to test. It
 # is OK to commit changes to the source file list.
-add_executable(mp-quick-bench mp-bench.cpp ../common/distributed_vector.cpp)
+add_executable(mp-quick-bench mp-bench.cpp gemv.cpp)
 
 foreach(mp-bench-exec IN ITEMS mp-bench mp-quick-bench)
   target_compile_definitions(${mp-bench-exec} PRIVATE BENCH_MP)
diff --git a/benchmarks/gbench/mp/fft3d.cpp b/benchmarks/gbench/mp/fft3d.cpp
index 5e30a57b44..23dcbbdae3 100644
--- a/benchmarks/gbench/mp/fft3d.cpp
+++ b/benchmarks/gbench/mp/fft3d.cpp
@@ -5,7 +5,11 @@
 #include "cxxopts.hpp"
 #include "fmt/core.h"
 #include "mpi.h"
+#if (__INTEL_LLVM_COMPILER >= 20250000)
 #include "oneapi/mkl/dft.hpp"
+#else
+#include "oneapi/mkl/dfti.hpp"
+#endif
 #include <complex>
 
 #include "dr/mp.hpp"
diff --git a/benchmarks/gbench/mp/gemv.cpp b/benchmarks/gbench/mp/gemv.cpp
new file mode 100644
index 0000000000..44216cbcc5
--- /dev/null
+++ b/benchmarks/gbench/mp/gemv.cpp
@@ -0,0 +1,192 @@
+// SPDX-FileCopyrightText: Intel Corporation
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "mpi.h"
+
+#include "../common/dr_bench.hpp"
+#include "dr/mp.hpp"
+#include <filesystem>
+#include <fmt/core.h>
+#include <fstream>
+#include <random>
+#include <sstream>
+
+namespace mp = dr::mp;
+
+namespace {
+std::size_t getWidth() {
+  return 8; // default_vector_size / 100000;
+}
+} // namespace
+static auto getMatrix() {
+  // size below is useful when testing weak scaling with default vector size
+  // using dr-bench it creates matrix which non-zero element count increases
+  // linearly when we increase default_vector_size std::size_t n = std::max(1.,
+  // std::sqrt(default_vector_size / 100000)) * 50000;
+
+  std::size_t density_scalar = 50;
+
+  std::size_t n =
+      std::max(1., std::sqrt(default_vector_size * density_scalar / 2));
+
+  std::size_t up = n / density_scalar;
+  std::size_t down = n / density_scalar;
+  fmt::print("Generate matrix");
+  auto tmp = dr::generate_band_csr<double, long>(n, up, down);
+  fmt::print("generated!");
+  return tmp;
+}
+
+static void GemvEq_DR(benchmark::State &state) {
+  auto local_data = getMatrix();
+
+  mp::distributed_sparse_matrix<
+      double, long, dr::mp::MpiBackend,
+      dr::mp::csr_eq_distribution<double, long, dr::mp::MpiBackend>>
+      m(local_data, 0);
+  auto n = m.shape()[1];
+  auto width = getWidth();
+  std::vector<double> base_a(n * width);
+  for (int j = 0; j < width; j++) {
+    for (int i = 0; i < n; i++) {
+      base_a[i + j * n] = i * j + 1;
+    }
+  }
+  dr::mp::broadcasted_slim_matrix<double> allocated_a;
+  allocated_a.broadcast_data(n, width, 0, base_a, dr::mp::default_comm());
+
+  std::vector<double> res(m.shape().first * width);
+  gemv(0, res, m, allocated_a);
+  for (auto _ : state) {
+    gemv(0, res, m, allocated_a);
+  }
+}
+
+DR_BENCHMARK(GemvEq_DR);
+
+static void GemvRow_DR(benchmark::State &state) {
+  auto local_data = getMatrix();
+
+  mp::distributed_sparse_matrix<
+      double, long, dr::mp::MpiBackend,
+      dr::mp::csr_row_distribution<double, long, dr::mp::MpiBackend>>
+      m(local_data, 0);
+  auto n = m.shape()[1];
+  auto width = getWidth();
+  std::vector<double> base_a(n * width);
+  for (int j = 0; j < width; j++) {
+    for (int i = 0; i < n; i++) {
+      base_a[i + j * n] = i * j + 1;
+    }
+  }
+  dr::mp::broadcasted_slim_matrix<double> allocated_a;
+  allocated_a.broadcast_data(n, width, 0, base_a, dr::mp::default_comm());
+
+  std::vector<double> res(m.shape().first * width);
+  gemv(0, res, m, allocated_a);
+  for (auto _ : state) {
+    gemv(0, res, m, allocated_a);
+  }
+}
+
+DR_BENCHMARK(GemvRow_DR);
+
+static void Gemv_Reference(benchmark::State &state) {
+  auto local_data = getMatrix();
+  auto nnz_count = local_data.size();
+  auto band_shape = local_data.shape();
+  auto q = get_queue();
+  auto policy = oneapi::dpl::execution::make_device_policy(q);
+  auto val_ptr = sycl::malloc_device<double>(nnz_count, q);
+  auto col_ptr = sycl::malloc_device<long>(nnz_count, q);
+  auto row_ptr = sycl::malloc_device<long>((band_shape[0] + 1), q);
+  std::vector<double> b;
+  auto width = getWidth();
+  for (auto i = 0; i < band_shape[1] * width; i++) {
+    b.push_back(i);
+  }
+  double *elems = new double[band_shape[0] * width];
+  auto input = sycl::malloc_device<double>(band_shape[1] * width, q);
+  auto output = sycl::malloc_device<double>(band_shape[0] * width, q);
+  q.memcpy(val_ptr, local_data.values_data(), nnz_count * sizeof(double))
+      .wait();
+  q.memcpy(col_ptr, local_data.colind_data(), nnz_count * sizeof(long)).wait();
+  q.memcpy(row_ptr, local_data.rowptr_data(),
+           (band_shape[0] + 1) * sizeof(long))
+      .wait();
+  q.fill(output, 0, band_shape[0] * width);
+  std::copy(policy, b.begin(), b.end(), input);
+
+  auto wg = 32;
+  while (width * band_shape[0] * wg > INT_MAX) {
+    wg /= 2;
+  }
+  assert(wg > 0);
+
+  for (auto _ : state) {
+    if (dr::mp::use_sycl()) {
+      dr::mp::sycl_queue()
+          .submit([&](auto &&h) {
+            h.parallel_for(
+                sycl::nd_range<1>(width * band_shape[0] * wg, wg),
+                [=](auto item) {
+                  auto input_j = item.get_group(0) / band_shape[0];
+                  auto idx = item.get_group(0) % band_shape[0];
+                  auto local_id = item.get_local_id();
+                  auto group_size = item.get_local_range(0);
+                  double sum = 0;
+                  auto start = row_ptr[idx];
+                  auto end = row_ptr[idx + 1];
+                  for (auto i = start + local_id; i < end; i += group_size) {
+                    auto colNum = col_ptr[i];
+                    auto vectorVal = input[colNum + input_j * band_shape[1]];
+                    auto matrixVal = val_ptr[i];
+                    sum += matrixVal * vectorVal;
+                  }
+                  sycl::atomic_ref<double, sycl::memory_order::relaxed,
+                                   sycl::memory_scope::device>
+                      c_ref(output[idx + band_shape[0] * input_j]);
+                  c_ref += sum;
+                });
+          })
+          .wait();
+      q.memcpy(elems, output, band_shape[0] * sizeof(double) * width).wait();
+    } else {
+      std::fill(elems, elems + band_shape[0] * width, 0);
+      auto local_rows = local_data.rowptr_data();
+      auto row_i = 0;
+      auto current_row_position = local_rows[1];
+
+      for (int i = 0; i < nnz_count; i++) {
+        while (row_i + 1 < band_shape[0] && i >= current_row_position) {
+          row_i++;
+          current_row_position = local_rows[row_i + 1];
+        }
+        for (auto j = 0; j < width; j++) {
+          auto item_id = row_i + j * band_shape[0];
+          auto val_index = local_data.colind_data()[i] + j * band_shape[0];
+          auto value = b[val_index];
+          auto matrix_value = local_data.values_data()[i];
+          elems[item_id] += matrix_value * value;
+        }
+      }
+    }
+  }
+  delete[] elems;
+  sycl::free(val_ptr, q);
+  sycl::free(col_ptr, q);
+  sycl::free(row_ptr, q);
+  sycl::free(input, q);
+  sycl::free(output, q);
+}
+
+static void GemvEq_Reference(benchmark::State &state) { Gemv_Reference(state); }
+
+static void GemvRow_Reference(benchmark::State &state) {
+  Gemv_Reference(state);
+}
+
+DR_BENCHMARK(GemvEq_Reference);
+
+DR_BENCHMARK(GemvRow_Reference);
diff --git a/benchmarks/gbench/sp/fft3d.cpp b/benchmarks/gbench/sp/fft3d.cpp
index 19d4f3aee4..cdf060c88c 100644
--- a/benchmarks/gbench/sp/fft3d.cpp
+++ b/benchmarks/gbench/sp/fft3d.cpp
@@ -3,7 +3,11 @@
 // SPDX-License-Identifier: BSD-3-Clause
 
 #include "cxxopts.hpp"
+#if (__INTEL_LLVM_COMPILER >= 20250000)
 #include "oneapi/mkl/dft.hpp"
+#else
+#include "oneapi/mkl/dfti.hpp"
+#endif
 #include <complex>
 #include <dr/sp.hpp>
 #include <fmt/core.h>
diff --git a/examples/mp/CMakeLists.txt b/examples/mp/CMakeLists.txt
index 41cac86ecc..a8180838b3 100644
--- a/examples/mp/CMakeLists.txt
+++ b/examples/mp/CMakeLists.txt
@@ -16,9 +16,13 @@ add_executable(vector-add vector-add.cpp)
 target_link_libraries(vector-add DR::mpi)
 add_mp_ctest(TEST_NAME vector-add NAME vector-add NPROC 2)
 
-function(add_mp_example example_name)
+function(add_mp_example_no_test example_name)
   add_executable(${example_name} ${example_name}.cpp)
   target_link_libraries(${example_name} cxxopts DR::mpi)
+endfunction()
+
+function(add_mp_example example_name)
+  add_mp_example_no_test(${example_name})
   add_mp_ctest(TEST_NAME ${example_name} NAME ${example_name} NPROC 2)
 endfunction()
 
@@ -26,6 +30,9 @@ add_mp_example(stencil-1d)
 add_mp_example(stencil-1d-array)
 add_mp_example(stencil-1d-pointer)
 add_mp_example(hello_world)
+add_mp_example_no_test(sparse_matrix)
+add_mp_example_no_test(sparse_benchmark)
+add_mp_example_no_test(sparse_matrix_matrix_mul)
 
 if(OpenMP_FOUND)
   add_executable(vector-add-ref vector-add-ref.cpp)
diff --git a/examples/mp/sparse_benchmark.cpp b/examples/mp/sparse_benchmark.cpp
new file mode 100644
index 0000000000..b53c0ee477
--- /dev/null
+++ b/examples/mp/sparse_benchmark.cpp
@@ -0,0 +1,130 @@
+// SPDX-FileCopyrightText: Intel Corporation
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <dr/mp.hpp>
+#include <filesystem>
+#include <fmt/core.h>
+#include <fstream>
+#include <random>
+#include <sstream>
+
+namespace mp = dr::mp;
+
+MPI_Comm comm;
+int comm_rank;
+int comm_size;
+
+int main(int argc, char **argv) {
+
+  MPI_Init(&argc, &argv);
+  comm = MPI_COMM_WORLD;
+  MPI_Comm_rank(comm, &comm_rank);
+  MPI_Comm_size(comm, &comm_size);
+
+  if (argc != 3 && argc != 5) {
+    fmt::print("usage: ./sparse_benchmark [test outcome dir] [matrix market "
+               "file], or ./sparse_benchmark [test outcome dir] [number of "
+               "rows] [number of columns] [density]\n");
+    return 1;
+  }
+
+#ifdef SYCL_LANGUAGE_VERSION
+  sycl::queue q = dr::mp::select_queue();
+  mp::init(q);
+#else
+  mp::init();
+#endif
+  dr::views::csr_matrix_view<double, long> local_data;
+  std::stringstream filenamestream;
+  auto root = 0;
+  auto computeSize = dr::mp::default_comm().size();
+  if (root == dr::mp::default_comm().rank()) {
+    if (argc == 5) {
+      fmt::print("started loading\n");
+      auto n = std::stoul(argv[2]);
+      auto up = std::stoul(argv[3]);
+      auto down = std::stoul(argv[4]);
+      // local_data = dr::generate_random_csr<double, long>({n, m}, density,
+      // 42);
+      local_data = dr::generate_band_csr<double, long>(n, up, down);
+      filenamestream << "mp_band_" << computeSize << "_" << n << "_"
+                     << up + down << "_" << local_data.size();
+      fmt::print("finished loading\n");
+    } else {
+      fmt::print("started loading\n");
+      std::string fname(argv[2]);
+      std::filesystem::path p(argv[2]);
+      local_data = dr::read_csr<double, long>(fname);
+      filenamestream << "mp_" << p.stem().string() << "_" << computeSize << "_"
+                     << local_data.size();
+      fmt::print("finished loading\n");
+    }
+  }
+  std::string resname;
+  mp::distributed_sparse_matrix<
+      double, long, dr::mp::MpiBackend,
+      dr::mp::csr_eq_distribution<double, long, dr::mp::MpiBackend>>
+      m_eq(local_data, root);
+  mp::distributed_sparse_matrix<
+      double, long, dr::mp::MpiBackend,
+      dr::mp::csr_row_distribution<double, long, dr::mp::MpiBackend>>
+      m_row(local_data, root);
+  fmt::print("finished distribution\n");
+  std::vector<double> eq_duration;
+  std::vector<double> row_duration;
+
+  auto N = 10;
+  std::vector<double> b;
+  b.reserve(m_row.shape().second);
+  std::vector<double> res(m_row.shape().first);
+  for (auto i = 0; i < m_row.shape().second; i++) {
+    b.push_back(i);
+  }
+
+  dr::mp::broadcasted_vector<double> allocated_b;
+  allocated_b.broadcast_data(m_row.shape().second, 0, b,
+                             dr::mp::default_comm());
+
+  fmt::print("started initial gemv distribution\n");
+  gemv(0, res, m_eq, allocated_b); // it is here to prepare sycl for work
+
+  fmt::print("finished initial gemv distribution\n");
+  for (auto i = 0; i < N; i++) {
+    auto begin = std::chrono::high_resolution_clock::now();
+    gemv(0, res, m_eq, allocated_b);
+    auto end = std::chrono::high_resolution_clock::now();
+    double duration = std::chrono::duration<double>(end - begin).count() * 1000;
+    eq_duration.push_back(duration);
+  }
+
+  gemv(0, res, m_row, allocated_b); // it is here to prepare sycl for work
+  for (auto i = 0; i < N; i++) {
+    auto begin = std::chrono::high_resolution_clock::now();
+    gemv(0, res, m_row, allocated_b);
+    auto end = std::chrono::high_resolution_clock::now();
+    double duration = std::chrono::duration<double>(end - begin).count() * 1000;
+    row_duration.push_back(duration);
+  }
+
+  if (root == dr::mp::default_comm().rank()) {
+    std::string tmp;
+    filenamestream >> tmp;
+    std::filesystem::path p(argv[1]);
+    p += tmp;
+    p += ".csv";
+    std::ofstream write_stream(p.string());
+    write_stream << eq_duration.front();
+    for (auto i = 1; i < N; i++) {
+      write_stream << "," << eq_duration[i];
+    }
+    write_stream << "\n";
+    write_stream << row_duration.front();
+    for (auto i = 1; i < N; i++) {
+      write_stream << "," << row_duration[i];
+    }
+    write_stream << "\n";
+  }
+  allocated_b.destroy_data();
+  mp::finalize();
+}
diff --git a/examples/mp/sparse_matrix.cpp b/examples/mp/sparse_matrix.cpp
new file mode 100644
index 0000000000..ae1511280c
--- /dev/null
+++ b/examples/mp/sparse_matrix.cpp
@@ -0,0 +1,112 @@
+// SPDX-FileCopyrightText: Intel Corporation
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <dr/mp.hpp>
+#include <fmt/core.h>
+
+namespace mp = dr::mp;
+
+int main(int argc, char **argv) {
+
+  if (argc != 2) {
+    fmt::print("usage: ./sparse_matrix [matrix market file]\n");
+    return 1;
+  }
+
+  std::string fname(argv[1]);
+#ifdef SYCL_LANGUAGE_VERSION
+  mp::init(sycl::default_selector_v);
+#else
+  mp::init();
+#endif
+
+  dr::views::csr_matrix_view<double, long> local_data;
+  auto root = 0;
+  if (root == dr::mp::default_comm().rank()) {
+    local_data = dr::read_csr<double, long>(fname);
+  }
+  {
+    mp::distributed_sparse_matrix<
+        double, long, dr::mp::MpiBackend,
+        dr::mp::csr_eq_distribution<double, long, dr::mp::MpiBackend>>
+        m(local_data, root);
+    mp::distributed_sparse_matrix<
+        double, long, dr::mp::MpiBackend,
+        dr::mp::csr_row_distribution<double, long, dr::mp::MpiBackend>>
+        m_row(local_data, root);
+    std::vector<double> res(m.shape().first);
+    std::vector<double> res_row(m.shape().first);
+    std::vector<double> a(m.shape().second);
+    for (int i = 0; i < a.size(); i++) {
+      a[i] = i;
+    }
+
+    dr::mp::broadcasted_vector<double> allocated_a;
+    allocated_a.broadcast_data(m_row.shape().second, 0, a,
+                               dr::mp::default_comm());
+    m.fence();
+    double total_time = 0;
+    auto N = 1;
+    gemv(0, res, m, allocated_a); // it is here to prepare sycl for work
+    for (int i = 0; i < N; i++) {
+      auto begin = std::chrono::high_resolution_clock::now();
+      gemv(0, res, m, allocated_a);
+      auto end = std::chrono::high_resolution_clock::now();
+      double duration = std::chrono::duration<double>(end - begin).count();
+      total_time += duration;
+      if (i % 10 == 0 && dr::mp::default_comm().rank() == 0) {
+        fmt::print("eq canary {}\n", duration * 1000);
+      }
+    }
+    if (root == dr::mp::default_comm().rank()) {
+      fmt::print("eq gemv time total {}\n", total_time * 1000 / N);
+    }
+    m.fence();
+    total_time = 0;
+    gemv(0, res_row, m_row, allocated_a);
+    for (int i = 0; i < N; i++) {
+      auto begin = std::chrono::high_resolution_clock::now();
+      gemv(0, res_row, m_row, allocated_a);
+      auto end = std::chrono::high_resolution_clock::now();
+      double duration = std::chrono::duration<double>(end - begin).count();
+      total_time += duration;
+      if (i % 10 == 0 && dr::mp::default_comm().rank() == 0) {
+        fmt::print("row canary {}\n", duration * 1000);
+      }
+    }
+
+    if (root == dr::mp::default_comm().rank()) {
+      fmt::print("row gemv time total {}\n", total_time * 1000 / N);
+    }
+    m_row.fence();
+
+    std::vector<double> ref(m.shape().first);
+    if (dr::mp::default_comm().rank() == 0) {
+      for (auto a : local_data) {
+        auto [index, val] = a;
+        auto [m, n] = index;
+        ref[m] += n * val;
+      }
+      for (int i = 0; i < m.shape().first; i++) {
+        if (res[i] != ref[i]) {
+          fmt::print("mismatching outcome {} {} {}\n", i, res[i], ref[i]);
+        }
+      }
+      for (int i = 0; i < m.shape().first; i++) {
+        if (res_row[i] != ref[i]) {
+          fmt::print("mismatching outcome row {} {} {}\n", i, res_row[i],
+                     ref[i]);
+        }
+      }
+    }
+    allocated_a.destroy_data();
+  }
+
+  if (root == dr::mp::default_comm().rank()) {
+    dr::__detail::destroy_csr_matrix_view(local_data, std::allocator<double>{});
+  }
+  mp::finalize();
+
+  return 0;
+}
diff --git a/examples/mp/sparse_matrix_matrix_mul.cpp b/examples/mp/sparse_matrix_matrix_mul.cpp
new file mode 100644
index 0000000000..4fd7c139c8
--- /dev/null
+++ b/examples/mp/sparse_matrix_matrix_mul.cpp
@@ -0,0 +1,118 @@
+// SPDX-FileCopyrightText: Intel Corporation
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <dr/mp.hpp>
+#include <fmt/core.h>
+
+namespace mp = dr::mp;
+
+int main(int argc, char **argv) {
+
+  if (argc != 2) {
+    fmt::print("usage: ./sparse_matrix [matrix market file]\n");
+    return 1;
+  }
+
+  std::string fname(argv[1]);
+#ifdef SYCL_LANGUAGE_VERSION
+  mp::init(sycl::default_selector_v);
+#else
+  mp::init();
+#endif
+
+  dr::views::csr_matrix_view<double, long> local_data;
+  auto root = 0;
+  if (root == dr::mp::default_comm().rank()) {
+    local_data = dr::read_csr<double, long>(fname);
+  }
+  {
+    mp::distributed_sparse_matrix<
+        double, long, dr::mp::MpiBackend,
+        dr::mp::csr_eq_distribution<double, long, dr::mp::MpiBackend>>
+        m(local_data, root);
+    mp::distributed_sparse_matrix<
+        double, long, dr::mp::MpiBackend,
+        dr::mp::csr_row_distribution<double, long, dr::mp::MpiBackend>>
+        m_row(local_data, root);
+    fmt::print("{}\n", m.size());
+
+    auto width = 3;
+    std::vector<double> res(m.shape().first * width);
+    std::vector<double> res_row(m.shape().first * width);
+    std::vector<double> base_a(m.shape().second * width);
+    for (int j = 0; j < width; j++) {
+      for (int i = 0; i < m.shape().second; i++) {
+        base_a[i + j * m.shape().second] = i * j + 1;
+      }
+    }
+
+    dr::mp::broadcasted_slim_matrix<double> allocated_a;
+    allocated_a.broadcast_data(m_row.shape().second, width, 0, base_a,
+                               dr::mp::default_comm());
+    m.fence();
+    double total_time = 0;
+    auto N = 1;
+    gemv(0, res, m, allocated_a); // it is here to prepare sycl for work
+    for (int i = 0; i < 100; i++) {
+      auto begin = std::chrono::high_resolution_clock::now();
+      gemv(0, res, m, allocated_a);
+      auto end = std::chrono::high_resolution_clock::now();
+      double duration = std::chrono::duration<double>(end - begin).count();
+      total_time += duration;
+      if (root == dr::mp::default_comm().rank()) {
+        fmt::print("eq canary {}\n\n", duration * 1000);
+      }
+    }
+    m.fence();
+    total_time = 0;
+    gemv(0, res_row, m_row, allocated_a);
+    for (int i = 0; i < N; i++) {
+      auto begin = std::chrono::high_resolution_clock::now();
+      gemv(0, res_row, m_row, allocated_a);
+      auto end = std::chrono::high_resolution_clock::now();
+      double duration = std::chrono::duration<double>(end - begin).count();
+      total_time += duration;
+      if (i % 10 == 0 && dr::mp::default_comm().rank() == 0) {
+        fmt::print("row canary {}\n", duration * 1000);
+      }
+    }
+
+    if (root == dr::mp::default_comm().rank()) {
+      fmt::print("row gemv time total {}\n", total_time * 1000 / N);
+    }
+    m_row.fence();
+
+    std::vector<double> ref(m.shape().first * width);
+    auto res_col_len = m.shape().first;
+    auto in_len = m.shape().second;
+    if (dr::mp::default_comm().rank() == 0) {
+      for (auto a : local_data) {
+        auto [index, val] = a;
+        auto [m, n] = index;
+        for (int i = 0; i < width; i++) {
+          ref[m + i * res_col_len] += base_a[n + i * in_len] * val;
+        }
+      }
+      for (int i = 0; i < m.shape().first * width; i++) {
+        if (res[i] != ref[i]) {
+          fmt::print("mismatching outcome {} {} {}\n", i, res[i], ref[i]);
+        }
+      }
+      for (int i = 0; i < m.shape().first * width; i++) {
+        if (res_row[i] != ref[i]) {
+          fmt::print("mismatching outcome row {} {} {}\n", i, res_row[i],
+                     ref[i]);
+        }
+      }
+    }
+    allocated_a.destroy_data();
+  }
+
+  if (root == dr::mp::default_comm().rank()) {
+    dr::__detail::destroy_csr_matrix_view(local_data, std::allocator<double>{});
+  }
+  mp::finalize();
+
+  return 0;
+}
diff --git a/include/dr/detail/communicator.hpp b/include/dr/detail/communicator.hpp
index 331253ab63..d6fa99ffb4 100644
--- a/include/dr/detail/communicator.hpp
+++ b/include/dr/detail/communicator.hpp
@@ -66,10 +66,15 @@ class communicator {
     MPI_Gather_c(src, count, MPI_BYTE, dst, count, MPI_BYTE, root, mpi_comm_);
   }
 
+  template <typename T>
+  void gather(const T *src, T *dst, std::size_t count, std::size_t root) const {
+    gather((void *)src, (void *)dst, count * sizeof(T), root);
+  }
+
   template <typename T>
   void gather(const T &src, std::span<T> dst, std::size_t root) const {
     assert(rng::size(dst) >= size_);
-    gather(&src, rng::data(dst), sizeof(T), root);
+    gather(&src, rng::data(dst), 1, root);
   }
 
   template <typename T>
@@ -105,10 +110,10 @@ class communicator {
     i_all_gather(&src, rng::data(dst), 1, req);
   }
 
-  void gatherv(const void *src, int *counts, int *offsets, void *dst,
+  void gatherv(const void *src, MPI_Count *counts, MPI_Aint *offsets, void *dst,
                std::size_t root) const {
-    MPI_Gatherv(src, counts[rank()], MPI_BYTE, dst, counts, offsets, MPI_BYTE,
-                root, mpi_comm_);
+    MPI_Gatherv_c(src, counts[rank()], MPI_BYTE, dst, counts, offsets, MPI_BYTE,
+                  root, mpi_comm_);
   }
 
   // pointer with explicit tag
@@ -167,6 +172,13 @@ class communicator {
     irecv(data, src_rank, 0, request);
   }
 
+  void wait(MPI_Request request) const {
+    MPI_Wait(&request, MPI_STATUS_IGNORE);
+  }
+  void waitall(std::size_t count, MPI_Request *requests) const {
+    MPI_Waitall(count, requests, MPI_STATUS_IGNORE);
+  }
+
   template <rng::contiguous_range R>
   void alltoall(const R &sendr, R &recvr, std::size_t count) {
     alltoall(rng::data(sendr), rng::data(recvr), count);
diff --git a/include/dr/sp/util/coo_matrix.hpp b/include/dr/detail/coo_matrix.hpp
similarity index 91%
rename from include/dr/sp/util/coo_matrix.hpp
rename to include/dr/detail/coo_matrix.hpp
index 0e8b871504..1cf8e4bc83 100644
--- a/include/dr/sp/util/coo_matrix.hpp
+++ b/include/dr/detail/coo_matrix.hpp
@@ -4,18 +4,18 @@
 
 #pragma once
 
-#include <dr/sp/containers/matrix_entry.hpp>
+#include <dr/detail/matrix_entry.hpp>
 #include <memory>
 #include <vector>
 
-namespace dr::sp {
+namespace dr {
 
 namespace __detail {
 
 template <typename T, typename I, typename Allocator = std::allocator<T>>
 class coo_matrix {
 public:
-  using value_type = dr::sp::matrix_entry<T, I>;
+  using value_type = dr::matrix_entry<T, I>;
   using scalar_type = T;
   using index_type = I;
   using size_type = std::size_t;
@@ -33,8 +33,8 @@ class coo_matrix {
   using iterator = typename backend_type::iterator;
   using const_iterator = typename backend_type::const_iterator;
 
-  using reference = dr::sp::matrix_ref<T, I>;
-  using const_reference = dr::sp::matrix_ref<std::add_const_t<T>, I>;
+  using reference = dr::matrix_ref<T, I>;
+  using const_reference = dr::matrix_ref<std::add_const_t<T>, I>;
 
   using scalar_reference = T &;
 
@@ -110,14 +110,14 @@ class coo_matrix {
   }
 
   iterator find(key_type key) noexcept {
-    return std::find_if(begin(), end(), [&](auto &&v) {
+    return std::ranges::find_if(begin(), end(), [&](auto &&v) {
       auto &&[i, v_] = v;
       return i == key;
     });
   }
 
   const_iterator find(key_type key) const noexcept {
-    return std::find_if(begin(), end(), [&](auto &&v) {
+    return std::ranges::find_if(begin(), end(), [&](auto &&v) {
       auto &&[i, v_] = v;
       return i == key;
     });
@@ -167,4 +167,4 @@ class coo_matrix {
 
 } // namespace __detail
 
-} // namespace dr::sp
+} // namespace dr
diff --git a/include/dr/detail/csr_matrix_base.hpp b/include/dr/detail/csr_matrix_base.hpp
new file mode 100644
index 0000000000..0207a95428
--- /dev/null
+++ b/include/dr/detail/csr_matrix_base.hpp
@@ -0,0 +1,95 @@
+// SPDX-FileCopyrightText: Intel Corporation
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#pragma once
+
+#include <dr/detail/matrix_entry.hpp>
+#include <future>
+#include <memory>
+#include <vector>
+
+namespace dr {
+
+namespace __detail {
+
+template <typename T, typename I, typename Allocator = std::allocator<T>>
+class csr_matrix_base {
+public:
+  using value_type = std::pair<T, I>;
+  using scalar_type = T;
+  using index_type = I;
+  using size_type = std::size_t;
+  using difference_type = std::ptrdiff_t;
+
+  using allocator_type = Allocator;
+
+  using key_type = dr::index<I>;
+  using map_type = T;
+
+  using backend_allocator_type = typename std::allocator_traits<
+      allocator_type>::template rebind_alloc<value_type>;
+  using aggregator_allocator_type = typename std::allocator_traits<
+      allocator_type>::template rebind_alloc<std::vector<value_type>>;
+  using row_type = std::vector<value_type, backend_allocator_type>;
+  using backend_type = std::vector<row_type, aggregator_allocator_type>;
+
+  using iterator = typename backend_type::iterator;
+  using const_iterator = typename backend_type::const_iterator;
+
+  csr_matrix_base(dr::index<I> shape, std::size_t nnz) : shape_(shape) {
+    auto average_size = nnz / shape.first / 2;
+    for (std::size_t i = 0; i < shape.first; i++) {
+      tuples_.push_back(row_type());
+      tuples_.back().reserve(average_size);
+    }
+  }
+
+  dr::index<I> shape() const noexcept { return shape_; }
+
+  size_type size() const noexcept { return size_; }
+
+  iterator begin() noexcept { return tuples_.begin(); }
+
+  const_iterator begin() const noexcept { return tuples_.begin(); }
+
+  iterator end() noexcept { return tuples_.end(); }
+
+  const_iterator end() const noexcept { return tuples_.end(); }
+
+  template <typename InputIt> void push_back(InputIt first, InputIt last) {
+    for (auto iter = first; iter != last; ++iter) {
+      push_back(*iter);
+    }
+  }
+
+  void push_back(index_type row, const value_type &value) {
+    tuples_[row].push_back(value);
+    size_++;
+  }
+
+  void sort() {
+    auto comparator = [](auto &one, auto &two) {
+      return one.second < two.second;
+    };
+    for (auto &elem : tuples_) {
+      std::sort(elem.begin(), elem.end(), comparator);
+    }
+  }
+
+  csr_matrix_base() = default;
+  ~csr_matrix_base() = default;
+  csr_matrix_base(const csr_matrix_base &) = default;
+  csr_matrix_base(csr_matrix_base &&) = default;
+  csr_matrix_base &operator=(const csr_matrix_base &) = default;
+  csr_matrix_base &operator=(csr_matrix_base &&) = default;
+
+private:
+  std::size_t size_ = 0;
+  dr::index<I> shape_;
+  backend_type tuples_;
+};
+
+} // namespace __detail
+
+} // namespace dr
diff --git a/include/dr/detail/generate_csr.hpp b/include/dr/detail/generate_csr.hpp
new file mode 100644
index 0000000000..c8ef7202a2
--- /dev/null
+++ b/include/dr/detail/generate_csr.hpp
@@ -0,0 +1,155 @@
+// SPDX-FileCopyrightText: Intel Corporation
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#pragma once
+
+#include <concepts>
+#include <dr/views/csr_matrix_view.hpp>
+#include <fmt/core.h>
+#include <random>
+#include <unordered_set>
+
+namespace dr {
+
+namespace {
+
+template <typename T> struct uniform_distribution {
+  using type = std::uniform_int_distribution<T>;
+};
+
+template <std::floating_point T> struct uniform_distribution<T> {
+  using type = std::uniform_real_distribution<T>;
+};
+
+template <typename T>
+using uniform_distribution_t = typename uniform_distribution<T>::type;
+
+struct pair_hash {
+  template <std::integral I>
+  inline std::size_t operator()(const std::pair<I, I> &v) const {
+    return v.first * 31 + v.second;
+  }
+};
+
+} // namespace
+
+// it returns matrix view of randomly generated matrix
+// the memory is owned by the view, so it needs to be released using
+// destroy_csr_matrix_view
+template <typename T = float, std::integral I = std::size_t>
+auto generate_random_csr(dr::index<I> shape, double density = 0.01,
+                         unsigned int seed = 0) {
+
+  assert(density >= 0.0 && density < 1.0);
+
+  std::unordered_set<std::pair<I, I>, pair_hash> tuples{};
+  std::vector<std::pair<std::pair<I, I>, T>> entries;
+  std::size_t nnz = density * shape[0] * shape[1];
+  entries.reserve(nnz);
+
+  std::mt19937 gen(seed);
+  std::uniform_int_distribution<I> row(0, shape[0] - 1);
+  std::uniform_int_distribution<I> column(0, shape[1] - 1);
+
+  uniform_distribution_t<T> value_gen(0, 1);
+
+  while (tuples.size() < nnz) {
+    auto i = row(gen);
+    auto j = column(gen);
+    if (tuples.find({i, j}) == tuples.end()) {
+      T value = value_gen(gen);
+      tuples.insert({i, j});
+      entries.push_back({{i, j}, value});
+    }
+  }
+  T *values = new T[nnz];
+  I *rowptr = new I[shape[0] + 1];
+  I *colind = new I[nnz];
+
+  rowptr[0] = 0;
+
+  std::size_t r = 0;
+  std::size_t c = 0;
+  std::sort(entries.begin(), entries.end());
+  for (auto iter = entries.begin(); iter != entries.end(); ++iter) {
+    auto &&[index, value] = *iter;
+    auto &&[i, j] = index;
+
+    values[c] = value;
+    colind[c] = j;
+
+    while (r < i) {
+      if (r + 1 > shape[0]) {
+        // TODO: exception?
+        // throw std::runtime_error("csr_matrix_impl_: given invalid matrix");
+      }
+      rowptr[r + 1] = c;
+      r++;
+    }
+    c++;
+
+    if (c > nnz) {
+      // TODO: exception?
+      // throw std::runtime_error("csr_matrix_impl_: given invalid matrix");
+    }
+  }
+
+  for (; r < shape[0]; r++) {
+    rowptr[r + 1] = nnz;
+  }
+
+  return dr::views::csr_matrix_view(values, rowptr, colind, shape, nnz, 0);
+}
+
+// it returns matrix view of band matrix
+// the memory is owned by the view, so it needs to be released using
+// destroy_csr_matrix_view
+template <typename T = float, std::integral I = std::size_t>
+auto generate_band_csr(I size, std::size_t up_band = 3,
+                       std::size_t down_band = 3) {
+  std::size_t nnz = (1 + up_band + down_band) * size -
+                    (up_band * (up_band + 1) / 2) -
+                    (down_band * (down_band + 1) / 2);
+
+  T *values = new T[nnz];
+  I *rowptr = new I[size + 1];
+  I *colind = new I[nnz];
+
+  rowptr[0] = 0;
+
+  std::size_t r = 0;
+  std::size_t c = 0;
+  for (auto i = 0; i < size; i++) {
+    for (auto j = std::max(static_cast<long long>(i) -
+                               static_cast<long long>(down_band),
+                           static_cast<long long>(0));
+         j < i; j++) {
+      values[c] = 1;
+      colind[c] = static_cast<I>(j);
+      c++;
+    }
+    values[c] = 1;
+    colind[c] = i;
+    c++;
+    for (auto j = i + 1; j <= i + up_band; j++) {
+      if (j >= size) {
+        continue;
+      }
+      values[c] = 1;
+      colind[c] = j;
+      c++;
+    }
+    rowptr[r + 1] = c;
+    r++;
+  }
+
+  for (; r < size; r++) {
+    rowptr[r + 1] = nnz;
+  }
+
+  return dr::views::csr_matrix_view<T, I>(values, rowptr, colind, {size, size},
+                                          nnz, 0);
+}
+
+} // namespace dr
diff --git a/include/dr/sp/containers/matrix_entry.hpp b/include/dr/detail/matrix_entry.hpp
similarity index 83%
rename from include/dr/sp/containers/matrix_entry.hpp
rename to include/dr/detail/matrix_entry.hpp
index b5d2885499..2875b35d9b 100644
--- a/include/dr/sp/containers/matrix_entry.hpp
+++ b/include/dr/detail/matrix_entry.hpp
@@ -4,14 +4,19 @@
 
 #pragma once
 
+#include <cassert>
 #include <concepts>
 #include <limits>
 #include <type_traits>
 
 #include <dr/detail/index.hpp>
 
-namespace dr::sp {
-
+namespace dr {
+template <typename T>
+concept getable = requires(T x) {
+  std::get<0>(x);
+  std::get<1>(x);
+};
 template <typename T, typename I = std::size_t> class matrix_entry {
 public:
   using index_type = I;
@@ -28,6 +33,7 @@ template <typename T, typename I = std::size_t> class matrix_entry {
       : index_(index), value_(std::forward<U>(value)) {}
 
   template <typename Entry>
+    requires(getable<Entry>)
   matrix_entry(Entry &&entry)
       : index_(std::get<0>(entry)), value_(std::get<1>(entry)) {}
 
@@ -62,14 +68,11 @@ template <typename T, typename I = std::size_t> class matrix_entry {
     return matrix_entry<std::add_const_t<T>, U>(index_, value_);
   }
 
-  bool operator<(const matrix_entry &other) const noexcept {
-    if (index()[0] < other.index()[0]) {
-      return true;
-    } else if (index()[0] == other.index()[0] &&
-               index()[1] < other.index()[1]) {
-      return true;
+  inline bool operator<(const matrix_entry &other) const noexcept {
+    if (index_.first != other.index_.first) {
+      return index_.first < other.index_.first;
     }
-    return false;
+    return index_.second < other.index_.second;
   }
 
   matrix_entry() = default;
@@ -85,28 +88,28 @@ template <typename T, typename I = std::size_t> class matrix_entry {
   map_type value_;
 };
 
-} // namespace dr::sp
+} // namespace dr
 
 namespace std {
 
 template <typename T, typename I>
   requires(!std::is_const_v<T>)
-void swap(dr::sp::matrix_entry<T, I> a, dr::sp::matrix_entry<T, I> b) {
-  dr::sp::matrix_entry<T, I> other = a;
+void swap(dr::matrix_entry<T, I> a, dr::matrix_entry<T, I> b) {
+  dr::matrix_entry<T, I> other = a;
   a = b;
   b = other;
 }
 
 template <std::size_t Index, typename T, typename I>
-struct tuple_element<Index, dr::sp::matrix_entry<T, I>>
+struct tuple_element<Index, dr::matrix_entry<T, I>>
     : tuple_element<Index, std::tuple<dr::index<I>, T>> {};
 
 template <typename T, typename I>
-struct tuple_size<dr::sp::matrix_entry<T, I>> : integral_constant<size_t, 2> {};
+struct tuple_size<dr::matrix_entry<T, I>> : integral_constant<size_t, 2> {};
 
 } // namespace std
 
-namespace dr::sp {
+namespace dr {
 
 template <typename T, typename I = std::size_t, typename TRef = T &>
 class matrix_ref {
@@ -119,7 +122,7 @@ class matrix_ref {
 
   using scalar_reference = TRef;
 
-  using value_type = dr::sp::matrix_entry<T, I>;
+  using value_type = dr::matrix_entry<T, I>;
 
   matrix_ref(dr::index<I> index, scalar_reference value)
       : index_(index), value_(value) {}
@@ -183,28 +186,28 @@ class matrix_ref {
   scalar_reference value_;
 };
 
-} // namespace dr::sp
+} // namespace dr
 
 namespace std {
 
 template <typename T, typename I, typename TRef>
   requires(!std::is_const_v<T>)
-void swap(dr::sp::matrix_ref<T, I, TRef> a, dr::sp::matrix_ref<T, I, TRef> b) {
-  dr::sp::matrix_entry<T, I> other = a;
+void swap(dr::matrix_ref<T, I, TRef> a, dr::matrix_ref<T, I, TRef> b) {
+  dr::matrix_entry<T, I> other = a;
   a = b;
   b = other;
 }
 
 template <std::size_t Index, typename T, typename I, typename TRef>
-struct tuple_element<Index, dr::sp::matrix_ref<T, I, TRef>>
+struct tuple_element<Index, dr::matrix_ref<T, I, TRef>>
     : tuple_element<Index, std::tuple<dr::index<I>, TRef>> {};
 
 template <typename T, typename I, typename TRef>
-struct tuple_size<dr::sp::matrix_ref<T, I, TRef>>
+struct tuple_size<dr::matrix_ref<T, I, TRef>>
     : integral_constant<std::size_t, 2> {};
 
 template <std::size_t Index, typename T, typename I, typename TRef>
-inline decltype(auto) get(dr::sp::matrix_ref<T, I, TRef> ref)
+inline decltype(auto) get(dr::matrix_ref<T, I, TRef> ref)
   requires(Index <= 1)
 {
   if constexpr (Index == 0) {
@@ -216,7 +219,7 @@ inline decltype(auto) get(dr::sp::matrix_ref<T, I, TRef> ref)
 }
 
 template <std::size_t Index, typename T, typename I, typename TRef>
-inline decltype(auto) get(dr::sp::matrix_entry<T, I> entry)
+inline decltype(auto) get(dr::matrix_entry<T, I> entry)
   requires(Index <= 1)
 {
   if constexpr (Index == 0) {
diff --git a/include/dr/detail/matrix_io.hpp b/include/dr/detail/matrix_io.hpp
new file mode 100644
index 0000000000..1a845008b1
--- /dev/null
+++ b/include/dr/detail/matrix_io.hpp
@@ -0,0 +1,257 @@
+// SPDX-FileCopyrightText: Intel Corporation
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#pragma once
+
+#include <algorithm>
+#include <cassert>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <tuple>
+#include <vector>
+
+#include <dr/detail/csr_matrix_base.hpp>
+#include <dr/views/csr_matrix_view.hpp>
+
+namespace dr {
+
+namespace __detail {
+
+// Preconditions:
+// 1) `tuples` sorted by row, column
+// 2) `tuples` has shape `shape`
+// 3) `tuples` has `nnz` elements
+template <typename Tuples, typename Allocator>
+auto convert_to_csr(Tuples &&tuples, dr::index<> shape, std::size_t nnz,
+                    Allocator &&allocator) {
+  auto &&[index, v] = *tuples.begin();
+  auto &&[i, j] = index;
+
+  using T = std::remove_reference_t<decltype(v)>;
+  using I = std::remove_reference_t<decltype(i)>;
+
+  typename std::allocator_traits<Allocator>::template rebind_alloc<I>
+      i_allocator(allocator);
+
+  T *values = allocator.allocate(nnz);
+  I *rowptr = i_allocator.allocate(shape[0] + 1);
+  I *colind = i_allocator.allocate(nnz);
+
+  rowptr[0] = 0;
+
+  std::size_t r = 0;
+  std::size_t c = 0;
+  for (auto iter = tuples.begin(); iter != tuples.end(); ++iter) {
+    auto &&[index, value] = *iter;
+
+    auto &&[i, j] = index;
+
+    values[c] = value;
+    colind[c] = j;
+
+    while (r < i) {
+      assert(r + 1 <= shape[0]);
+      // throw std::runtime_error("csr_matrix_impl_: given invalid matrix");
+      rowptr[r + 1] = c;
+      r++;
+    }
+    c++;
+
+    assert(c <= nnz);
+    // throw std::runtime_error("csr_matrix_impl_: given invalid matrix");
+  }
+
+  for (; r < shape[0]; r++) {
+    rowptr[r + 1] = nnz;
+  }
+
+  return dr::views::csr_matrix_view(values, rowptr, colind,
+                                    dr::index<I>(shape[0], shape[1]), nnz, 0);
+}
+
+template <typename Tuples, typename Allocator>
+auto convert_csr_base_to_csr(Tuples &&csr_matrix, dr::index<> shape,
+                             std::size_t nnz, Allocator &&allocator) {
+  auto &&[v, j] = *csr_matrix.begin()->begin();
+
+  using T = std::remove_reference_t<decltype(v)>;
+  using I = std::remove_reference_t<decltype(j)>;
+
+  typename std::allocator_traits<Allocator>::template rebind_alloc<I>
+      i_allocator(allocator);
+
+  T *values = allocator.allocate(nnz);
+  I *rowptr = i_allocator.allocate(shape[0] + 1);
+  I *colind = i_allocator.allocate(nnz);
+
+  rowptr[0] = 0;
+
+  std::size_t r = 0;
+  std::size_t c = 0;
+  for (auto iter = csr_matrix.begin(); iter != csr_matrix.end(); ++iter) {
+    for (auto iter2 = iter->begin(); iter2 != iter->end(); ++iter2) {
+      auto &&[value, j] = *iter2;
+
+      values[c] = value;
+      colind[c] = j;
+      c++;
+    }
+    assert(r + 1 <= shape[0]);
+    rowptr[r + 1] = c;
+    r++;
+
+    assert(c <= nnz);
+    // throw std::runtime_error("csr_matrix_impl_: given invalid matrix");
+  }
+
+  for (; r < shape[0]; r++) {
+    rowptr[r + 1] = nnz;
+  }
+
+  return dr::views::csr_matrix_view(values, rowptr, colind,
+                                    dr::index<I>(shape[0], shape[1]), nnz, 0);
+}
+
+/// Read in the Matrix Market file at location `file_path` and a return
+/// a coo_matrix data structure with its contents.
+template <typename T, typename I = std::size_t>
+inline csr_matrix_base<T, I> read_csr_matrix_base(std::string file_path,
+                                                  bool one_indexed = true) {
+  using size_type = std::size_t;
+
+  std::ifstream f;
+
+  f.open(file_path.c_str());
+
+  if (!f.is_open()) {
+    // TODO better choice of exception.
+    throw std::runtime_error("mmread: cannot open " + file_path);
+  }
+
+  std::string buf;
+
+  // Make sure the file is matrix market matrix, coordinate, and check whether
+  // it is symmetric. If the matrix is symmetric, non-diagonal elements will
+  // be inserted in both (i, j) and (j, i).  Error out if skew-symmetric or
+  // Hermitian.
+  std::getline(f, buf);
+  std::istringstream ss(buf);
+  std::string item;
+  ss >> item;
+  if (item != "%%MatrixMarket") {
+    throw std::runtime_error(file_path +
+                             " could not be parsed as a Matrix Market file.");
+  }
+  ss >> item;
+  if (item != "matrix") {
+    throw std::runtime_error(file_path +
+                             " could not be parsed as a Matrix Market file.");
+  }
+  ss >> item;
+  if (item != "coordinate") {
+    throw std::runtime_error(file_path +
+                             " could not be parsed as a Matrix Market file.");
+  }
+  bool pattern;
+  ss >> item;
+  if (item == "pattern") {
+    pattern = true;
+  } else {
+    pattern = false;
+  }
+  // TODO: do something with real vs. integer vs. pattern?
+  ss >> item;
+  bool symmetric;
+  if (item == "general") {
+    symmetric = false;
+  } else if (item == "symmetric") {
+    symmetric = true;
+  } else {
+    throw std::runtime_error(file_path + " has an unsupported matrix type");
+  }
+
+  bool outOfComments = false;
+  while (!outOfComments) {
+    std::getline(f, buf);
+
+    if (buf[0] != '%') {
+      outOfComments = true;
+    }
+  }
+
+  I m, n, nnz;
+  // std::istringstream ss(buf);
+  ss.clear();
+  ss.str(buf);
+  ss >> m >> n >> nnz;
+
+  // NOTE for symmetric matrices: `nnz` holds the number of stored values in
+  // the matrix market file, while `matrix.nnz_` will hold the total number of
+  // stored values (including "mirrored" symmetric values).
+  csr_matrix_base<T, I> matrix({m, n}, nnz);
+
+  size_type c = 0;
+  while (std::getline(f, buf)) {
+    I i, j;
+    T v;
+    std::istringstream ss(buf);
+    if (!pattern) {
+      ss >> i >> j >> v;
+    } else {
+      ss >> i >> j;
+      v = T(1);
+    }
+    if (one_indexed) {
+      i--;
+      j--;
+    }
+
+    if (i >= m || j >= n) {
+      throw std::runtime_error(
+          "read_MatrixMarket: file has nonzero out of bounds.");
+    }
+
+    matrix.push_back(i, {v, j});
+
+    if (symmetric && i != j) {
+      matrix.push_back(j, {v, i});
+    }
+
+    c++;
+    if (c > nnz) {
+      throw std::runtime_error("read_MatrixMarket: error reading Matrix Market "
+                               "file, file has more nonzeros than reported.");
+    }
+  }
+
+  matrix.sort();
+  f.close();
+
+  return matrix;
+}
+
+template <typename T, typename I, typename Allocator, typename... Args>
+void destroy_csr_matrix_view(dr::views::csr_matrix_view<T, I, Args...> view,
+                             Allocator &&alloc) {
+  alloc.deallocate(view.values_data(), view.size());
+  typename std::allocator_traits<Allocator>::template rebind_alloc<I> i_alloc(
+      alloc);
+  i_alloc.deallocate(view.colind_data(), view.size());
+  i_alloc.deallocate(view.rowptr_data(), view.shape()[0] + 1);
+}
+
+} // namespace __detail
+
+template <typename T, typename I = std::size_t>
+auto read_csr(std::string file_path, bool one_indexed = true) {
+  auto m = __detail::read_csr_matrix_base<T, I>(file_path, one_indexed);
+  auto shape = m.shape();
+  auto nnz = m.size();
+  auto t =
+      __detail::convert_csr_base_to_csr(m, shape, nnz, std::allocator<T>{});
+
+  return t;
+}
+} // namespace dr
diff --git a/include/dr/detail/multiply_view.hpp b/include/dr/detail/multiply_view.hpp
new file mode 100644
index 0000000000..28d124ba50
--- /dev/null
+++ b/include/dr/detail/multiply_view.hpp
@@ -0,0 +1,144 @@
+// SPDX-FileCopyrightText: Intel Corporation
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#pragma once
+
+#include <concepts>
+#include <iterator>
+#include <type_traits>
+
+#include <dr/concepts/concepts.hpp>
+#include <dr/detail/ranges_shim.hpp>
+
+namespace dr::__detail {
+
+template <std::random_access_iterator Iter> class multiply_iterator {
+public:
+  using value_type = std::iter_value_t<Iter>;
+  using difference_type = long long;
+  using iterator = multiply_iterator<Iter>;
+  using reference = value_type;
+
+  using pointer = iterator;
+
+  using iterator_category = std::random_access_iterator_tag;
+
+  multiply_iterator(Iter iter, std::size_t len, long long pos) noexcept
+      : iter_(iter), len_(len), pos_(pos) {}
+  multiply_iterator() noexcept = default;
+  ~multiply_iterator() noexcept = default;
+  multiply_iterator(const multiply_iterator &) noexcept = default;
+  multiply_iterator &operator=(const multiply_iterator &) noexcept = default;
+
+  bool operator==(const multiply_iterator &other) const noexcept {
+    return iter_ == other.iter_ && pos_ == other.pos_ && len_ == other.len_;
+  }
+
+  bool operator!=(const multiply_iterator &other) const noexcept {
+    return iter_ != other.iter_ || pos_ != other.pos_ || len_ != other.len_;
+  }
+
+  iterator operator+(difference_type offset) const noexcept {
+    return iterator(iter_, len_, pos_ + offset);
+  }
+
+  iterator operator-(difference_type offset) const noexcept {
+    return iterator(iter_, len_, pos_ + offset);
+  }
+
+  difference_type operator-(iterator other) const noexcept {
+    return pos_ - other.pos_;
+  }
+
+  bool operator<(iterator other) const noexcept { return pos_ < other.pos_; }
+
+  bool operator>(iterator other) const noexcept { return pos_ > other.pos_; }
+
+  bool operator<=(iterator other) const noexcept { return pos_ <= other.pos_; }
+
+  bool operator>=(iterator other) const noexcept { return pos_ >= other.pos_; }
+
+  iterator &operator++() noexcept {
+    ++pos_;
+    return *this;
+  }
+
+  iterator operator++(int) noexcept {
+    iterator other = *this;
+    ++(*this);
+    return other;
+  }
+
+  iterator &operator--() noexcept {
+    --pos_;
+    return *this;
+  }
+
+  iterator operator--(int) noexcept {
+    iterator other = *this;
+    --(*this);
+    return other;
+  }
+
+  iterator &operator+=(difference_type offset) noexcept {
+    pos_ += offset;
+    return *this;
+  }
+
+  iterator &operator-=(difference_type offset) noexcept {
+    pos_ -= offset;
+    return *this;
+  }
+
+  reference operator*() const noexcept { return *(iter_ + (pos_ % len_)); }
+
+  reference operator[](difference_type offset) const noexcept {
+    return *(*this + offset);
+  }
+
+  friend iterator operator+(difference_type n, iterator iter) {
+    return iter.pos_ + n;
+  }
+
+  auto local() const
+    requires(dr::ranges::__detail::has_local<Iter>)
+  {
+    auto iter = dr::ranges::__detail::local(iter_);
+    return multiply_iterator<decltype(iter)>(std::move(iter), len_, pos_);
+  }
+
+private:
+  Iter iter_;
+  std::size_t len_;
+  long long pos_;
+};
+
+template <rng::random_access_range V>
+  requires(rng::sized_range<V>)
+class multiply_view : public rng::view_interface<multiply_view<V>> {
+public:
+  template <rng::viewable_range R>
+  multiply_view(R &&r, std::size_t n)
+      : base_(rng::views::all(std::forward<R>(r))), n_(n) {}
+
+  auto begin() const {
+    return multiply_iterator(rng::begin(base_), base_.size(), 0);
+  }
+
+  auto end() const {
+    return multiply_iterator(rng::begin(base_), base_.size(),
+                             n_ * base_.size());
+  }
+
+  auto size() const { return rng::size(base_); }
+
+private:
+  V base_;
+  std::size_t n_;
+};
+
+template <rng::viewable_range R>
+multiply_view(R &&r, std::size_t n) -> multiply_view<rng::views::all_t<R>>;
+
+} // namespace dr::__detail
diff --git a/include/dr/mp.hpp b/include/dr/mp.hpp
index f9598bbcd8..35c98eb8f7 100644
--- a/include/dr/mp.hpp
+++ b/include/dr/mp.hpp
@@ -48,6 +48,10 @@
 #include <dr/detail/memory.hpp>
 #include <dr/detail/communicator.hpp>
 #include <dr/detail/index.hpp>
+#include <dr/detail/matrix_io.hpp>
+#include <dr/detail/matrix_entry.hpp>
+#include <dr/detail/generate_csr.hpp>
+#include <dr/detail/coo_matrix.hpp>
 
 #include <dr/views/views.hpp>
 #include <dr/views/transform.hpp>
@@ -77,5 +81,11 @@
 #include <dr/mp/algorithms/md_for_each.hpp>
 #include <dr/mp/algorithms/transform.hpp>
 #include <dr/mp/algorithms/transpose.hpp>
+#include <dr/mp/algorithms/matrix/gemv.hpp>
 #include <dr/mp/containers/distributed_vector.hpp>
+#include <dr/mp/containers/distributed_sparse_matrix.hpp>
+#include <dr/mp/containers/broadcasted_vector.hpp>
+#include <dr/mp/containers/broadcasted_slim_matrix.hpp>
 #include <dr/mp/containers/distributed_mdarray.hpp>
+#include <dr/mp/containers/matrix_formats/csr_eq_distribution.hpp>
+#include <dr/mp/containers/matrix_formats/csr_row_distribution.hpp>
diff --git a/include/dr/mp/algorithms/equal.hpp b/include/dr/mp/algorithms/equal.hpp
index 3901a10483..b2b6e278fc 100644
--- a/include/dr/mp/algorithms/equal.hpp
+++ b/include/dr/mp/algorithms/equal.hpp
@@ -26,11 +26,7 @@ bool equal(std::size_t root, bool root_provided, R1 &&r1, R2 &&r2) {
   };
 
   auto zipped_views = views::zip(r1, r2);
-
-  // we are using mp::transform instead of mp::views::transform due to
-  // compilation error refer to DRA-192 and test/gtest/mp/reduce.cpp
-  mp::distributed_vector<int> compared(rng::distance(r1));
-  mp::transform(zipped_views, compared.begin(), compare);
+  auto compared = dr::mp::views::transform(zipped_views, compare);
 
   auto min = [](double x, double y) { return std::min(x, y); };
   if (root_provided) {
diff --git a/include/dr/mp/algorithms/matrix/gemv.hpp b/include/dr/mp/algorithms/matrix/gemv.hpp
new file mode 100644
index 0000000000..14c61b41d2
--- /dev/null
+++ b/include/dr/mp/algorithms/matrix/gemv.hpp
@@ -0,0 +1,73 @@
+// SPDX-FileCopyrightText: Intel Corporation
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#pragma once
+#include <dr/mp/allocator.hpp>
+#include <dr/mp/containers/broadcasted_slim_matrix.hpp>
+#include <dr/mp/containers/broadcasted_vector.hpp>
+#include <dr/mp/containers/distributed_sparse_matrix.hpp>
+#include <dr/mp/global.hpp>
+#include <fmt/core.h>
+#include <ranges>
+
+namespace dr::mp {
+
+template <typename T, typename I, rng::output_range<T> C, typename Alloc,
+          typename Backend, typename MatDistr>
+  requires(vector_multiplicable<MatDistr>)
+void gemv(int root, C &res,
+          distributed_sparse_matrix<T, I, Backend, MatDistr> &a,
+          broadcasted_vector<T, Alloc> b) {
+  if (default_comm().rank() == root) {
+    assert(a.shape().first == res.size());
+    assert(a.shape().second == b.size());
+  }
+  a.local_gemv_and_collect(root, res, b.broadcasted_data(), 1);
+}
+
+template <typename T, typename I, rng::output_range<T> C, typename Alloc,
+          typename Backend, typename MatDistr>
+  requires(vector_multiplicable<MatDistr>)
+void gemv(int root, C &res,
+          distributed_sparse_matrix<T, I, Backend, MatDistr> &a,
+          broadcasted_slim_matrix<T, Alloc> b) {
+  if (default_comm().rank() == root) {
+    assert(a.shape().first * b.width() == res.size());
+  }
+  a.local_gemv_and_collect(root, res, b.broadcasted_data(), b.width());
+}
+
+template <typename T, typename I, rng::output_range<T> C, typename Alloc,
+          typename Backend, typename MatDistr>
+  requires(vector_multiplicable<MatDistr>)
+void gemv(C &res, distributed_sparse_matrix<T, I, Backend, MatDistr> &a,
+          broadcasted_vector<T, Alloc> b) {
+  std::vector<T> workspace(res.size());
+  gemv(0, workspace, a, b);
+  auto tmp = new T[res.size()];
+  if (default_comm().rank() == 0) {
+    std::copy(workspace.begin(), workspace.end(), tmp);
+  }
+  default_comm().bcast(tmp, sizeof(T) * res.size(), 0);
+  std::copy(tmp, tmp + res.size(), res.begin());
+  delete[] tmp;
+}
+
+template <typename T, typename I, rng::output_range<T> C, typename Alloc,
+          typename Backend, typename MatDistr>
+  requires(vector_multiplicable<MatDistr>)
+void gemv(C &res, distributed_sparse_matrix<T, I, Backend, MatDistr> &a,
+          broadcasted_slim_matrix<T, Alloc> b) {
+  std::vector<T> workspace(res.size());
+  gemv(0, workspace, a, b);
+  auto tmp = new T[res.size()];
+  if (default_comm().rank() == 0) {
+    std::copy(workspace.begin(), workspace.end(), tmp);
+  }
+  default_comm().bcast(tmp, sizeof(T) * res.size(), 0);
+  std::copy(tmp, tmp + res.size(), res.begin());
+  delete[] tmp;
+}
+
+} // namespace dr::mp
diff --git a/include/dr/mp/algorithms/reduce.hpp b/include/dr/mp/algorithms/reduce.hpp
index 839dc83a4a..dd2a16dd68 100644
--- a/include/dr/mp/algorithms/reduce.hpp
+++ b/include/dr/mp/algorithms/reduce.hpp
@@ -38,7 +38,14 @@ inline auto dpl_reduce(rng::forward_range auto &&r, auto &&binary_op) {
       return std::reduce(dpl_policy(),
                          dr::__detail::direct_iterator(rng::begin(r) + 1),
                          dr::__detail::direct_iterator(rng::end(r)),
-                         sycl_get(*rng::begin(r)), binary_op);
+                         sycl_get_deref(rng::begin(r)), binary_op);
+      // We are not using below code, because we don't want to dereference
+      // rng::begin(r) beyond SYCL environment - the * operator may require
+      // complex operation that relies on GPU memory access (for example
+      // transform view iterator) return std::reduce(dpl_policy(),
+      //                    dr::__detail::direct_iterator(rng::begin(r) + 1),
+      //                    dr::__detail::direct_iterator(rng::end(r)),
+      //                    sycl_get(*rng::begin(r)), binary_op);
     }
   }
 #else
diff --git a/include/dr/mp/containers/broadcasted_slim_matrix.hpp b/include/dr/mp/containers/broadcasted_slim_matrix.hpp
new file mode 100644
index 0000000000..ed16695ae4
--- /dev/null
+++ b/include/dr/mp/containers/broadcasted_slim_matrix.hpp
@@ -0,0 +1,83 @@
+// SPDX-FileCopyrightText: Intel Corporation
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#pragma once
+
+namespace dr::mp {
+
+template <typename T, typename Allocator = dr::mp::__detail::allocator<T>>
+class broadcasted_slim_matrix {
+public:
+  broadcasted_slim_matrix() = default;
+
+  void broadcast_data(std::size_t height, std::size_t width, std::size_t root,
+                      T **root_data, dr::communicator comm) {
+    if (_data != nullptr) {
+      destroy_data();
+    }
+    _data_size = height * width;
+    _height = height;
+    _width = width;
+    _data = alloc.allocate(_data_size);
+    if (comm.rank() == root) {
+      for (auto i = 0; i < width; i++) {
+        if (use_sycl()) {
+          __detail::sycl_copy(root_data[i], root_data[i] + height,
+                              _data + height * i);
+        } else {
+          rng::copy(root_data[i], root_data[i] + height, _data + height * i);
+        }
+      }
+    }
+    comm.bcast(_data, sizeof(T) * _data_size, root);
+  }
+
+  template <rng::input_range R>
+  void broadcast_data(std::size_t height, std::size_t width, std::size_t root,
+                      R root_data, dr::communicator comm) {
+    if (_data != nullptr) {
+      destroy_data();
+    }
+    _data_size = height * width;
+    _height = height;
+    _width = width;
+    _data = alloc.allocate(_data_size);
+    if (comm.rank() == root) {
+      if (use_sycl()) {
+        __detail::sycl_copy(std::to_address(root_data.begin()),
+                            std::to_address(root_data.end()), _data);
+      } else {
+        rng::copy(root_data.begin(), root_data.end(), _data);
+      }
+    }
+    std::size_t position = 0;
+    std::size_t reminder = sizeof(T) * _data_size;
+    while (reminder > INT_MAX) {
+      comm.bcast(((uint8_t *)_data) + position, INT_MAX, root);
+      position += INT_MAX;
+      reminder -= INT_MAX;
+    }
+    comm.bcast(((uint8_t *)_data) + position, reminder, root);
+  }
+
+  void destroy_data() {
+    alloc.deallocate(_data, _data_size);
+    _data_size = 0;
+    _data = nullptr;
+  }
+
+  T *operator[](std::size_t index) { return _data + _height * index; }
+
+  T *broadcasted_data() { return _data; }
+  auto width() { return _width; }
+
+private:
+  T *_data = nullptr;
+  std::size_t _data_size = 0;
+  std::size_t _width = 0;
+  std::size_t _height = 0;
+
+  Allocator alloc;
+};
+} // namespace dr::mp
diff --git a/include/dr/mp/containers/broadcasted_vector.hpp b/include/dr/mp/containers/broadcasted_vector.hpp
new file mode 100644
index 0000000000..6058067563
--- /dev/null
+++ b/include/dr/mp/containers/broadcasted_vector.hpp
@@ -0,0 +1,53 @@
+// SPDX-FileCopyrightText: Intel Corporation
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#pragma once
+
+namespace dr::mp {
+
+template <typename T, typename Allocator = dr::mp::__detail::allocator<T>>
+class broadcasted_vector {
+public:
+  broadcasted_vector() = default;
+
+  template <rng::input_range R>
+  void broadcast_data(std::size_t data_size, std::size_t root, R root_data,
+                      dr::communicator comm) {
+    if (_data != nullptr) {
+      destroy_data();
+    }
+    _data_size = data_size;
+    _data = alloc.allocate(_data_size);
+    if (comm.rank() == root) {
+      if (use_sycl()) {
+        __detail::sycl_copy(std::to_address(root_data.begin()),
+                            std::to_address(root_data.end()), _data);
+      } else {
+        rng::copy(root_data.begin(), root_data.end(), _data);
+      }
+    }
+    comm.bcast(_data, sizeof(T) * _data_size, root);
+  }
+
+  void destroy_data() {
+    alloc.deallocate(_data, _data_size);
+    _data_size = 0;
+    _data = nullptr;
+  }
+
+  T &operator[](std::size_t index) { return _data[index]; }
+
+  T *broadcasted_data() { return _data; }
+
+  auto size() { return _data_size; }
+
+  auto begin() const { return _data; }
+  auto end() const { return begin() + _data_size; }
+
+private:
+  T *_data = nullptr;
+  std::size_t _data_size = 0;
+  Allocator alloc;
+};
+} // namespace dr::mp
diff --git a/include/dr/mp/containers/distributed_sparse_matrix.hpp b/include/dr/mp/containers/distributed_sparse_matrix.hpp
new file mode 100644
index 0000000000..f03a5d09e6
--- /dev/null
+++ b/include/dr/mp/containers/distributed_sparse_matrix.hpp
@@ -0,0 +1,163 @@
+// SPDX-FileCopyrightText: Intel Corporation
+//
+// SPDX-License-Identifier: BSD-3-Clause
+#pragma once
+#include <dr/detail/matrix_entry.hpp>
+#include <dr/mp/containers/matrix_formats/csr_eq_distribution.hpp>
+#include <dr/views/csr_matrix_view.hpp>
+
+namespace dr::mp {
+template <typename T>
+concept matrix_distibution = requires(T t, std::vector<int> res, int *input) {
+  { t.fence() } -> std::same_as<void>;
+  { t.segments() } -> rng::random_access_range;
+  { t.shape().first } -> std::convertible_to<std::size_t>;
+  { t.shape().second } -> std::convertible_to<std::size_t>;
+  { t.nnz() } -> std::same_as<std::size_t>;
+  { t.get_segment_from_offset(int()) } -> std::same_as<std::size_t>;
+  { t.get_id_in_segment(int()) } -> std::same_as<std::size_t>;
+  T(dr::views::csr_matrix_view<typename T::elem_type, typename T::index_type>(),
+    distribution());
+};
+
+template <typename T>
+concept vector_multiplicable =
+    requires(T t, std::vector<typename T::elem_type> res, T::elem_type *input) {
+      t.local_gemv_and_collect(int(), res, input, 1);
+    };
+
+template <typename T, typename I, class BackendT = MpiBackend,
+          class MatrixDistrT = csr_eq_distribution<T, I, BackendT>>
+  requires(matrix_distibution<MatrixDistrT>)
+class distributed_sparse_matrix {
+
+public:
+  using value_type = dr::matrix_entry<T, I>;
+  using elem_type = T;
+  using index_type = I;
+  using key_type = dr::index<I>;
+  using size_type = std::size_t;
+  using difference_type = std::ptrdiff_t;
+  using backend_type = BackendT;
+
+  class iterator {
+  public:
+    using iterator_category = std::random_access_iterator_tag;
+    using value_type = typename distributed_sparse_matrix::value_type;
+    using difference_type = typename distributed_sparse_matrix::difference_type;
+
+    iterator() {}
+    iterator(const distributed_sparse_matrix *parent, difference_type offset)
+        : parent_(parent), offset_(offset) {}
+
+    auto operator+(difference_type n) const {
+      return iterator(parent_, offset_ + n);
+    }
+    friend auto operator+(difference_type n, const iterator &other) {
+      return other + n;
+    }
+    auto operator-(difference_type n) const {
+      return iterator(parent_, offset_ - n);
+    }
+    auto operator-(iterator other) const { return offset_ - other.offset_; }
+
+    auto &operator+=(difference_type n) {
+      offset_ += n;
+      return *this;
+    }
+    auto &operator-=(difference_type n) {
+      offset_ -= n;
+      return *this;
+    }
+    auto &operator++() {
+      offset_++;
+      return *this;
+    }
+    auto operator++(int) {
+      auto old = *this;
+      offset_++;
+      return old;
+    }
+    auto &operator--() {
+      offset_--;
+      return *this;
+    }
+    auto operator--(int) {
+      auto old = *this;
+      offset_--;
+      return old;
+    }
+
+    bool operator==(iterator other) const {
+      if (parent_ == nullptr || other.parent_ == nullptr) {
+        return false;
+      } else {
+        return offset_ == other.offset_;
+      }
+    }
+    auto operator<=>(iterator other) const {
+      assert(parent_ == other.parent_);
+      return offset_ <=> other.offset_;
+    }
+
+    auto operator*() const {
+      auto segment_id = parent_->distribution_.get_segment_from_offset(offset_);
+      auto id_in_segment = parent_->distribution_.get_id_in_segment(offset_);
+      return parent_->segments()[segment_id][id_in_segment];
+    }
+    auto operator[](difference_type n) const { return *(*this + n); }
+
+    auto local() {
+      auto segment_id = parent_->distribution_.get_segment_from_offset(offset_);
+      auto id_in_segment = parent_->distribution_.get_id_in_segment(offset_);
+      return (parent_->segments()[segment_id].begin() + id_in_segment).local();
+    }
+
+    auto segments() {
+      return dr::__detail::drop_segments(parent_->segments(), offset_);
+    }
+
+  private:
+    const distributed_sparse_matrix *parent_ = nullptr;
+    difference_type offset_;
+  };
+
+  distributed_sparse_matrix(const distributed_sparse_matrix &) = delete;
+  distributed_sparse_matrix &
+  operator=(const distributed_sparse_matrix &) = delete;
+  distributed_sparse_matrix(distributed_sparse_matrix &&) { assert(false); }
+
+  /// Constructor
+  distributed_sparse_matrix(dr::views::csr_matrix_view<T, I> csr_view,
+                            std::size_t root = 0,
+                            distribution dist = distribution())
+      : distribution_(csr_view, dist, root) {}
+
+  /// Returns iterator to beginning
+  auto begin() const { return iterator(this, 0); }
+  /// Returns iterator to end
+  auto end() const { return begin() + distribution_.nnz(); }
+
+  /// Returns size
+  auto size() const { return distribution_.nnz(); }
+
+  auto shape() const { return distribution_.shape(); }
+  /// Returns reference using index
+  auto operator[](difference_type n) const { return *(begin() + n); }
+  //   auto &halo() const { return *halo_; } TODO
+
+  auto segments() const { return distribution_.segments(); }
+
+  void fence() { distribution_.fence(); }
+
+  template <typename C>
+    requires(vector_multiplicable<MatrixDistrT>)
+  auto local_gemv_and_collect(std::size_t root, C &res, T *vals,
+                              std::size_t val_width) const {
+    distribution_.local_gemv_and_collect(root, res, vals, val_width);
+  }
+
+private:
+  MatrixDistrT distribution_;
+};
+} // namespace dr::mp
diff --git a/include/dr/mp/containers/distributed_vector.hpp b/include/dr/mp/containers/distributed_vector.hpp
index 2611963064..d63c4c084f 100644
--- a/include/dr/mp/containers/distributed_vector.hpp
+++ b/include/dr/mp/containers/distributed_vector.hpp
@@ -63,8 +63,20 @@ class MpiBackend {
 
 #if (MPI_VERSION >= 4) ||                                                      \
     (defined(I_MPI_NUMVERSION) && (I_MPI_NUMVERSION > 20211200000))
-    // 64-bit API inside
-    win_.put(src, datalen, segment_index, offset);
+    if (mp::use_sycl()) {
+      // 32-bit API inside for sycl based buffers
+      for (std::size_t remainder = datalen, off = 0UL; remainder > 0;) {
+        std::size_t s = std::min(remainder, (std::size_t)INT_MAX);
+        DRLOG("{}:{} win_.put {} bytes at off {}, dst offset {}",
+              default_comm().rank(), __LINE__, s, off, offset + off);
+        win_.put((uint8_t *)src + off, s, segment_index, offset + off);
+        off += s;
+        remainder -= s;
+      }
+    } else {
+      // 64-bit API inside
+      win_.put(src, datalen, segment_index, offset);
+    }
 #else
     for (std::size_t remainder = datalen, off = 0UL; remainder > 0;) {
       std::size_t s = std::min(remainder, (std::size_t)INT_MAX);
@@ -77,9 +89,9 @@ class MpiBackend {
 #endif
   }
 
-  std::size_t getrank() { return win_.communicator().rank(); }
+  std::size_t getrank() const { return win_.communicator().rank(); }
 
-  void fence() { win_.fence(); }
+  void fence() const { win_.fence(); }
 };
 
 #ifdef DRISHMEM
@@ -121,13 +133,13 @@ class IshmemBackend {
     ishmem_putmem(dst, src, datalen, segment_index);
   }
 
-  std::size_t getrank() {
+  std::size_t getrank() const {
     auto my_process_segment_index = ishmem_my_pe();
     DRLOG("called ishmem_my_pe() -> {}", my_process_segment_index);
     return my_process_segment_index;
   }
 
-  void fence() {
+  void fence() const {
     // TODO: to have locality use ishmemx_fence_work_group
     ishmem_fence();
   }
@@ -276,6 +288,12 @@ template <typename T, class BackendT = MpiBackend> class distributed_vector {
 
   void fence() { backend.fence(); }
 
+  auto segment_size() const { return segment_size_; }
+
+  auto get_segment_offset(std::size_t segment_id) const {
+    return segment_id * segment_size_;
+  }
+
 private:
   void init(auto size, auto dist) {
     size_ = size;
diff --git a/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp
new file mode 100644
index 0000000000..aeb0461115
--- /dev/null
+++ b/include/dr/mp/containers/matrix_formats/csr_eq_distribution.hpp
@@ -0,0 +1,390 @@
+// SPDX-FileCopyrightText: Intel Corporation
+//
+// SPDX-License-Identifier: BSD-3-Clause
+#pragma once
+#include <dr/detail/matrix_entry.hpp>
+#include <dr/detail/multiply_view.hpp>
+#include <dr/mp/containers/matrix_formats/csr_eq_segment.hpp>
+#include <dr/views/csr_matrix_view.hpp>
+
+namespace dr::mp {
+
+template <typename T, typename I, class BackendT = MpiBackend>
+class csr_eq_distribution {
+  using view_tuple = std::tuple<std::size_t, std::size_t, std::size_t, I *>;
+
+public:
+  using value_type = dr::matrix_entry<T, I>;
+  using segment_type = csr_eq_segment<csr_eq_distribution>;
+  using elem_type = T;
+  using index_type = I;
+  using difference_type = std::ptrdiff_t;
+
+  csr_eq_distribution(const csr_eq_distribution &) = delete;
+  csr_eq_distribution &operator=(const csr_eq_distribution &) = delete;
+  csr_eq_distribution(csr_eq_distribution &&) { assert(false); }
+
+  csr_eq_distribution(dr::views::csr_matrix_view<T, I> csr_view,
+                      distribution dist = distribution(),
+                      std::size_t root = 0) {
+    init(csr_view, dist, root);
+  }
+
+  ~csr_eq_distribution() {
+    if (!finalized()) {
+      fence();
+      if (rows_data_ != nullptr) {
+        rows_backend_.deallocate(rows_data_, row_size_ * sizeof(index_type));
+        tuple_alloc.deallocate(view_helper_const, 1);
+      }
+    }
+  }
+  std::size_t get_id_in_segment(std::size_t offset) const {
+    return offset % segment_size_;
+  }
+  std::size_t get_segment_from_offset(std::size_t offset) const {
+    return offset / segment_size_;
+  }
+  auto segments() const { return rng::views::all(segments_); }
+  auto nnz() const { return nnz_; }
+  auto shape() const { return shape_; }
+  void fence() const { rows_backend_.fence(); }
+
+  template <typename C>
+  auto local_gemv(C &res, T *vals, std::size_t vals_width) const {
+    auto rank = rows_backend_.getrank();
+    if (nnz_ <= segment_size_ * rank) {
+      return;
+    }
+    auto vals_len = shape_[1];
+    auto size = row_sizes_[rank];
+    auto res_col_len = row_sizes_[default_comm().rank()];
+    if (dr::mp::use_sycl()) {
+      auto localVals = dr::__detail::direct_iterator(
+          dr::mp::local_segment(*vals_data_).begin());
+      auto localCols = dr::__detail::direct_iterator(
+          dr::mp::local_segment(*cols_data_).begin());
+      auto offset = rank * segment_size_;
+      auto real_segment_size =
+          std::min(nnz_ - rank * segment_size_, segment_size_);
+      auto local_data = rows_data_;
+      auto division = std::max(1ul, real_segment_size / 50);
+      auto one_computation_size = (real_segment_size + division - 1) / division;
+      auto row_size = row_size_;
+      dr::__detail::parallel_for_workaround(
+          dr::mp::sycl_queue(), sycl::range<1>{division},
+          [=](auto idx) {
+            std::size_t lower_bound = one_computation_size * idx;
+            std::size_t upper_bound =
+                std::min(one_computation_size * (idx + 1), real_segment_size);
+            std::size_t position = lower_bound + offset;
+            std::size_t first_row = rng::distance(
+                local_data,
+                std::upper_bound(local_data, local_data + row_size, position) -
+                    1);
+            for (auto j = 0; j < vals_width; j++) {
+              auto row = first_row;
+              T sum = 0;
+
+              for (auto i = lower_bound; i < upper_bound; i++) {
+                while (row + 1 < row_size &&
+                       local_data[row + 1] <= offset + i) {
+                  sycl::atomic_ref<T, sycl::memory_order::relaxed,
+                                   sycl::memory_scope::device>
+                      c_ref(res[row + j * res_col_len]);
+                  c_ref += sum;
+                  row++;
+                  sum = 0;
+                }
+                auto colNum = localCols[i] + j * vals_len;
+                auto matrixVal = vals[colNum];
+                auto vectorVal = localVals[i];
+
+                sum += matrixVal * vectorVal;
+              }
+              sycl::atomic_ref<T, sycl::memory_order::relaxed,
+                               sycl::memory_scope::device>
+                  c_ref(res[row + j * res_col_len]);
+              c_ref += sum;
+            }
+          })
+          .wait();
+    } else {
+      auto row_i = -1;
+      auto position = segment_size_ * rank;
+      auto elem_count = std::min(segment_size_, nnz_ - segment_size_ * rank);
+      auto current_row_position = rows_data_[0];
+      auto local_vals = dr::mp::local_segment(*vals_data_);
+      auto local_cols = dr::mp::local_segment(*cols_data_);
+
+      for (int i = 0; i < elem_count; i++) {
+        while (row_i + 1 < size && position + i >= current_row_position) {
+          row_i++;
+          current_row_position = rows_data_[row_i + 1];
+        }
+        for (int j = 0; j < vals_width; j++) {
+          res[row_i + j * res_col_len] +=
+              local_vals[i] * vals[local_cols[i] + j * vals_len];
+        }
+      }
+    }
+  }
+
+  template <typename C>
+  auto local_gemv_and_collect(std::size_t root, C &res, T *vals,
+                              std::size_t vals_width) const {
+    assert(res.size() == shape_.first * vals_width);
+    __detail::allocator<T> alloc;
+    auto res_alloc =
+        alloc.allocate(row_sizes_[default_comm().rank()] * vals_width);
+    if (use_sycl()) {
+      sycl_queue()
+          .fill(res_alloc, 0, row_sizes_[default_comm().rank()] * vals_width)
+          .wait();
+    } else {
+      std::fill(res_alloc,
+                res_alloc + row_sizes_[default_comm().rank()] * vals_width, 0);
+    }
+
+    local_gemv(res_alloc, vals, vals_width);
+    gather_gemv_vector(root, res, res_alloc, vals_width);
+    fence();
+    alloc.deallocate(res_alloc, row_sizes_[default_comm().rank()] * vals_width);
+  }
+
+private:
+  friend csr_eq_segment_iterator<csr_eq_distribution>;
+
+  template <typename C, typename A>
+  void gather_gemv_vector(std::size_t root, C &res, A &partial_res,
+                          std::size_t vals_width) const {
+    auto communicator = default_comm();
+    __detail::allocator<T> alloc;
+    long long *counts = new long long[communicator.size()];
+    for (auto i = 0; i < communicator.size(); i++) {
+      counts[i] = row_sizes_[i] * sizeof(T) * vals_width;
+    }
+
+    if (communicator.rank() == root) {
+      long *offsets = new long[communicator.size()];
+      offsets[0] = 0;
+      for (auto i = 0; i < communicator.size() - 1; i++) {
+        offsets[i + 1] = offsets[i] + counts[i];
+      }
+      auto gathered_res = alloc.allocate(max_row_size_ * vals_width);
+      communicator.gatherv(partial_res, counts, offsets, gathered_res, root);
+      T *gathered_res_host;
+
+      if (use_sycl()) {
+        gathered_res_host = new T[max_row_size_ * vals_width];
+        __detail::sycl_copy(gathered_res, gathered_res_host,
+                            max_row_size_ * vals_width);
+      } else {
+        gathered_res_host = gathered_res;
+      }
+      rng::fill(res, 0);
+
+      for (auto k = 0; k < vals_width; k++) {
+        auto current_offset = 0;
+        for (auto i = 0; i < communicator.size(); i++) {
+          auto first_row = row_offsets_[i];
+          auto last_row = row_offsets_[i] + row_sizes_[i];
+          auto row_size = row_sizes_[i];
+          if (first_row < last_row) {
+            res[first_row + k * shape_[0]] +=
+                gathered_res_host[vals_width * current_offset + k * row_size];
+          }
+          if (first_row < last_row - 1) {
+            auto piece_start = gathered_res_host + vals_width * current_offset +
+                               k * row_size + 1;
+            std::copy(piece_start, piece_start + last_row - first_row - 1,
+                      res.begin() + first_row + k * shape_[0] + 1);
+          }
+          current_offset += row_sizes_[i];
+        }
+      }
+
+      if (use_sycl()) {
+        delete[] gathered_res_host;
+      }
+      delete[] offsets;
+      alloc.deallocate(gathered_res, max_row_size_ * vals_width);
+    } else {
+      communicator.gatherv(partial_res, counts, nullptr, nullptr, root);
+    }
+    delete[] counts;
+  }
+
+  std::size_t get_row_size(std::size_t rank) { return row_sizes_[rank]; }
+
+  void init(dr::views::csr_matrix_view<T, I> csr_view, auto dist,
+            std::size_t root) {
+    distribution_ = dist;
+    auto rank = rows_backend_.getrank();
+
+    std::size_t initial_data[3];
+    if (root == rank) {
+      initial_data[0] = csr_view.size();
+      initial_data[1] = csr_view.shape().first;
+      initial_data[2] = csr_view.shape().second;
+      default_comm().bcast(initial_data, sizeof(std::size_t) * 3, root);
+    } else {
+      default_comm().bcast(initial_data, sizeof(std::size_t) * 3, root);
+    }
+
+    nnz_ = initial_data[0];
+    shape_ = {initial_data[1], initial_data[2]};
+    vals_data_ = std::make_shared<distributed_vector<T>>(nnz_);
+    cols_data_ = std::make_shared<distributed_vector<I>>(nnz_);
+    dr::mp::copy(root,
+                 std::ranges::subrange(csr_view.values_data(),
+                                       csr_view.values_data() + nnz_),
+                 vals_data_->begin());
+    dr::mp::copy(root,
+                 std::ranges::subrange(csr_view.colind_data(),
+                                       csr_view.colind_data() + nnz_),
+                 cols_data_->begin());
+
+    auto row_info_size = default_comm().size() * 2 + 1;
+    __detail::allocator<size_t> alloc;
+    std::size_t *row_information = new std::size_t[row_info_size];
+    row_offsets_.reserve(default_comm().size());
+    row_sizes_.reserve(default_comm().size());
+    if (root == default_comm().rank()) {
+      for (int i = 0; i < default_comm().size(); i++) {
+        auto first_index = vals_data_->get_segment_offset(i);
+        auto last_index = vals_data_->get_segment_offset(i + 1) - 1;
+        auto lower_limit =
+            rng::distance(csr_view.rowptr_data(),
+                          std::upper_bound(csr_view.rowptr_data(),
+                                           csr_view.rowptr_data() + shape_[0],
+                                           first_index)) -
+            1;
+        auto higher_limit = rng::distance(
+            csr_view.rowptr_data(),
+            std::upper_bound(csr_view.rowptr_data(),
+                             csr_view.rowptr_data() + shape_[0], last_index));
+        row_offsets_.push_back(lower_limit);
+        row_sizes_.push_back(higher_limit - lower_limit);
+        row_information[i] = lower_limit;
+        row_information[default_comm().size() + i] = higher_limit - lower_limit;
+        max_row_size_ = max_row_size_ + row_sizes_.back();
+      }
+      row_information[default_comm().size() * 2] = max_row_size_;
+      default_comm().bcast(row_information, sizeof(std::size_t) * row_info_size,
+                           root);
+    } else {
+      default_comm().bcast(row_information, sizeof(std::size_t) * row_info_size,
+                           root);
+      for (int i = 0; i < default_comm().size(); i++) {
+        row_offsets_.push_back(row_information[i]);
+        row_sizes_.push_back(row_information[default_comm().size() + i]);
+      }
+      max_row_size_ = row_information[default_comm().size() * 2];
+    }
+    delete[] row_information;
+    row_size_ = std::max(row_sizes_[rank], static_cast<std::size_t>(1));
+    rows_data_ =
+        static_cast<I *>(rows_backend_.allocate(row_size_ * sizeof(I)));
+
+    fence();
+    if (rank == root) {
+      for (std::size_t i = 0; i < default_comm().size(); i++) {
+        auto lower_limit = row_offsets_[i];
+        auto row_size = row_sizes_[i];
+        if (row_size > 0) {
+          rows_backend_.putmem(csr_view.rowptr_data() + lower_limit, 0,
+                               row_size * sizeof(I), i);
+        }
+      }
+    }
+
+    std::size_t segment_index = 0;
+    segment_size_ = vals_data_->segment_size();
+    assert(segment_size_ == cols_data_->segment_size());
+    for (std::size_t i = 0; i < nnz_; i += segment_size_) {
+      segments_.emplace_back(this, segment_index++,
+                             std::min(segment_size_, nnz_ - i), segment_size_);
+    }
+    auto local_rows = rows_data_;
+    auto real_val_size = std::min(vals_data_->segment_size(),
+                                  nnz_ - vals_data_->segment_size() * rank);
+    auto my_tuple = std::make_tuple(row_size_, row_offsets_[rank],
+                                    segment_size_ * rank, local_rows);
+    view_helper_const = tuple_alloc.allocate(1);
+
+    if (use_sycl()) {
+      sycl_queue()
+          .memcpy(view_helper_const, &my_tuple, sizeof(view_tuple))
+          .wait();
+    } else {
+      view_helper_const[0] = my_tuple;
+    }
+
+    auto local_cols = static_cast<I *>(nullptr);
+    auto local_vals = static_cast<T *>(nullptr);
+    if (cols_data_->segments().size() > rank) {
+      local_cols = cols_data_->segments()[rank].begin().local();
+      local_vals = vals_data_->segments()[rank].begin().local();
+      local_view = std::make_shared<view_type>(get_elem_view(
+          real_val_size, view_helper_const, local_cols, local_vals, rank));
+    }
+    fence();
+  }
+
+  static auto get_elem_view(std::size_t vals_size, view_tuple *helper_tuple,
+                            index_type *local_cols, elem_type *local_vals,
+                            std::size_t rank) {
+    auto local_vals_range = rng::subrange(local_vals, local_vals + vals_size);
+    auto local_cols_range = rng::subrange(local_cols, local_cols + vals_size);
+    auto zipped_results = rng::views::zip(local_vals_range, local_cols_range);
+    auto enumerated_zipped = rng::views::enumerate(zipped_results);
+    // we need to use multiply_view here,
+    // because lambda is not properly copied to sycl environment
+    // when we use variable capture
+    auto multiply_range = dr::__detail::multiply_view(
+        rng::subrange(helper_tuple, helper_tuple + 1), vals_size);
+    auto enumerted_with_data =
+        rng::views::zip(enumerated_zipped, multiply_range);
+
+    auto transformer = [=](auto x) {
+      auto [entry, tuple] = x;
+      auto [row_size, row_offset, offset, local_rows] = tuple;
+      auto [index, pair] = entry;
+      auto [val, column] = pair;
+      auto row =
+          rng::distance(local_rows,
+                        std::upper_bound(local_rows, local_rows + row_size,
+                                         offset + index) -
+                            1) +
+          row_offset;
+      dr::index<index_type> index_obj(row, column);
+      value_type entry_obj(index_obj, val);
+      return entry_obj;
+    };
+    return rng::transform_view(enumerted_with_data, std::move(transformer));
+  }
+
+  using view_type = decltype(get_elem_view(0, nullptr, nullptr, nullptr, 0));
+
+  dr::mp::__detail::allocator<view_tuple> tuple_alloc;
+  view_tuple *view_helper_const;
+  std::shared_ptr<view_type> local_view = nullptr;
+
+  std::size_t segment_size_ = 0;
+  std::size_t row_size_ = 0;
+  std::size_t max_row_size_ = 0;
+  std::vector<std::size_t> row_offsets_;
+  std::vector<std::size_t> row_sizes_;
+
+  index_type *rows_data_ = nullptr;
+  BackendT rows_backend_;
+
+  distribution distribution_;
+  dr::index<size_t> shape_;
+  std::size_t nnz_;
+  std::vector<segment_type> segments_;
+  std::shared_ptr<distributed_vector<T>> vals_data_;
+  std::shared_ptr<distributed_vector<I>> cols_data_;
+};
+} // namespace dr::mp
diff --git a/include/dr/mp/containers/matrix_formats/csr_eq_segment.hpp b/include/dr/mp/containers/matrix_formats/csr_eq_segment.hpp
new file mode 100644
index 0000000000..2caa8af843
--- /dev/null
+++ b/include/dr/mp/containers/matrix_formats/csr_eq_segment.hpp
@@ -0,0 +1,304 @@
+// SPDX-FileCopyrightText: Intel Corporation
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#pragma once
+
+namespace dr::mp {
+
+template <typename DSM> class csr_eq_segment_iterator;
+
+template <typename DSM> class csr_eq_segment_reference {
+  using iterator = csr_eq_segment_iterator<DSM>;
+
+public:
+  using value_type = typename DSM::value_type;
+  using index_type = typename DSM::index_type;
+  using elem_type = typename DSM::elem_type;
+
+  csr_eq_segment_reference(const iterator it) : iterator_(it) {}
+
+  operator value_type() const { return iterator_.get(); }
+  operator std::pair<std::pair<index_type, index_type>, elem_type>() const {
+    return iterator_.get();
+  }
+
+  template <std::size_t Index> auto get() const noexcept {
+    if constexpr (Index == 0) {
+      return iterator_.get_index();
+    }
+    if constexpr (Index == 1) {
+      return iterator_.get_value();
+    }
+  }
+
+  auto operator=(const value_type &value) const {
+    iterator_.put(value);
+    return *this;
+  }
+  auto operator=(const csr_eq_segment_reference &other) const {
+    *this = value_type(other);
+    return *this;
+  }
+  auto operator&() const { return iterator_; }
+
+private:
+  const iterator iterator_;
+}; // csr_eq_segment_reference
+
+template <typename DSM> class csr_eq_segment_iterator {
+public:
+  using value_type = typename DSM::value_type;
+  using index_type = typename DSM::index_type;
+  using elem_type = typename DSM::elem_type;
+  using difference_type = typename DSM::difference_type;
+
+  csr_eq_segment_iterator() = default;
+  csr_eq_segment_iterator(DSM *dsm, std::size_t segment_index,
+                          std::size_t index) {
+    dsm_ = dsm;
+    segment_index_ = segment_index;
+    index_ = index;
+  }
+
+  auto operator<=>(const csr_eq_segment_iterator &other) const noexcept {
+    // assertion below checks against compare dereferenceable iterator to a
+    // singular iterator and against attempt to compare iterators from different
+    // sequences like _Safe_iterator<gnu_cxx::normal_iterator> does
+    assert(dsm_ == other.dsm_);
+    return segment_index_ == other.segment_index_
+               ? index_ <=> other.index_
+               : segment_index_ <=> other.segment_index_;
+  }
+
+  // Comparison
+  bool operator==(const csr_eq_segment_iterator &other) const noexcept {
+    return (*this <=> other) == 0;
+  }
+
+  // Only this arithmetic manipulate internal state
+  auto &operator+=(difference_type n) {
+    assert(dsm_ != nullptr);
+    assert(n >= 0 || static_cast<difference_type>(index_) >= -n);
+    index_ += n;
+    return *this;
+  }
+
+  auto &operator-=(difference_type n) { return *this += (-n); }
+
+  difference_type
+  operator-(const csr_eq_segment_iterator &other) const noexcept {
+    assert(dsm_ != nullptr && dsm_ == other.dsm_);
+    assert(index_ >= other.index_);
+    return index_ - other.index_;
+  }
+
+  // prefix
+  auto &operator++() {
+    *this += 1;
+    return *this;
+  }
+  auto &operator--() {
+    *this -= 1;
+    return *this;
+  }
+
+  // postfix
+  auto operator++(int) {
+    auto prev = *this;
+    *this += 1;
+    return prev;
+  }
+  auto operator--(int) {
+    auto prev = *this;
+    *this -= 1;
+    return prev;
+  }
+
+  auto operator+(difference_type n) const {
+    auto p = *this;
+    p += n;
+    return p;
+  }
+  auto operator-(difference_type n) const {
+    auto p = *this;
+    p -= n;
+    return p;
+  }
+
+  // When *this is not first in the expression
+  friend auto operator+(difference_type n,
+                        const csr_eq_segment_iterator &other) {
+    return other + n;
+  }
+
+  // dereference
+  auto operator*() const {
+    assert(dsm_ != nullptr);
+    return csr_eq_segment_reference<DSM>{*this};
+  }
+  auto operator[](difference_type n) const {
+    assert(dsm_ != nullptr);
+    return *(*this + n);
+  }
+
+  void get(value_type *dst, std::size_t size) const {
+    auto elems = new elem_type[size];
+    auto indexes = new dr::index<index_type>[size];
+    get_value(elems, size);
+    get_index(indexes, size);
+    for (std::size_t i = 0; i < size; i++) {
+      *(dst + i) = {indexes[i], elems[i]};
+    }
+  }
+
+  value_type get() const {
+    value_type val;
+    get(&val, 1);
+    return val;
+  }
+
+  void get_value(elem_type *dst, std::size_t size) const {
+    assert(dsm_ != nullptr);
+    assert(segment_index_ * dsm_->segment_size_ + index_ < dsm_->nnz_);
+    (dsm_->vals_data_->segments()[segment_index_].begin() + index_)
+        .get(dst, size);
+  }
+
+  elem_type get_value() const {
+    elem_type val;
+    get_value(&val, 1);
+    return val;
+  }
+
+  void get_index(dr::index<index_type> *dst, std::size_t size) const {
+    assert(dsm_ != nullptr);
+    assert(segment_index_ * dsm_->segment_size_ + index_ < dsm_->nnz_);
+    auto col_data = new index_type[size];
+    (dsm_->cols_data_->segments()[segment_index_].begin() + index_)
+        .get(col_data, size);
+    index_type *rows;
+    std::size_t rows_length = dsm_->get_row_size(segment_index_);
+
+    if (rank() == dsm_->rows_backend_.getrank()) {
+      rows = dsm_->rows_data_;
+    } else {
+      rows = new index_type[rows_length];
+      dsm_->rows_backend_.getmem(rows, 0, rows_length * sizeof(index_type),
+                                 segment_index_);
+    }
+    auto position =
+        dsm_->cols_data_->get_segment_offset(segment_index_) + index_;
+    auto rows_iter = rows + 1;
+    auto cols_iter = col_data;
+    auto iter = dst;
+    std::size_t current_row = dsm_->row_offsets_[segment_index_];
+    std::size_t last_row = current_row + rows_length - 1;
+    for (int i = 0; i < size; i++) {
+      while (current_row < last_row && *rows_iter <= position + i) {
+        rows_iter++;
+        current_row++;
+      }
+      iter->first = current_row;
+      iter->second = *cols_iter;
+      cols_iter++;
+      iter++;
+    }
+    if (rank() != dsm_->rows_backend_.getrank()) {
+      delete[] rows;
+    }
+    delete[] col_data;
+  }
+
+  dr::index<index_type> get_index() const {
+    dr::index<index_type> val;
+    get_index(&val, 1);
+    return val;
+  }
+
+  void put(const value_type *dst, std::size_t size) const {
+    assert(dsm_ != nullptr);
+    assert(segment_index_ * dsm_->segment_size_ + index_ < dsm_->nnz_);
+    (dsm_->vals_data_->segments()[segment_index_].begin() + index_)
+        .put(dst, size);
+  }
+
+  void put(const value_type &value) const { put(&value, 1); }
+
+  auto rank() const {
+    assert(dsm_ != nullptr);
+    return segment_index_;
+  }
+
+  auto segments() const {
+    assert(dsm_ != nullptr);
+    return dr::__detail::drop_segments(dsm_->segments(), segment_index_,
+                                       index_);
+  }
+
+  auto local() const {
+    const auto my_process_segment_index = dsm_->rows_backend_.getrank();
+    assert(my_process_segment_index == segment_index_);
+    if (dsm_->local_view == nullptr) {
+      throw std::runtime_error("Requesting not existing local segment");
+    }
+    return dsm_->local_view->begin();
+  }
+
+private:
+  // all fields need to be initialized by default ctor so every default
+  // constructed iter is equal to any other default constructed iter
+  DSM *dsm_ = nullptr;
+  std::size_t segment_index_ = 0;
+  std::size_t index_ = 0;
+}; // csr_eq_segment_iterator
+
+template <typename DSM> class csr_eq_segment {
+private:
+  using iterator = csr_eq_segment_iterator<DSM>;
+
+public:
+  using difference_type = std::ptrdiff_t;
+  csr_eq_segment() = default;
+  csr_eq_segment(DSM *dsm, std::size_t segment_index, std::size_t size,
+                 std::size_t reserved) {
+    dsm_ = dsm;
+    segment_index_ = segment_index;
+    size_ = size;
+    reserved_ = reserved;
+    assert(dsm_ != nullptr);
+  }
+
+  auto size() const {
+    assert(dsm_ != nullptr);
+    return size_;
+  }
+
+  auto begin() const { return iterator(dsm_, segment_index_, 0); }
+  auto end() const { return begin() + size(); }
+  auto reserved() const { return reserved_; }
+
+  auto operator[](difference_type n) const { return *(begin() + n); }
+
+  bool is_local() const { return segment_index_ == default_comm().rank(); }
+
+private:
+  DSM *dsm_ = nullptr;
+  std::size_t segment_index_;
+  std::size_t size_;
+  std::size_t reserved_;
+}; // csr_eq_segment
+
+} // namespace dr::mp
+
+namespace std {
+template <typename DSM>
+struct tuple_size<dr::mp::csr_eq_segment_reference<DSM>>
+    : std::integral_constant<std::size_t, 2> {};
+
+template <std::size_t Index, typename DSM>
+struct tuple_element<Index, dr::mp::csr_eq_segment_reference<DSM>>
+    : tuple_element<Index, std::tuple<dr::index<typename DSM::index_type>,
+                                      typename DSM::elem_type>> {};
+
+} // namespace std
diff --git a/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp
new file mode 100644
index 0000000000..70fd7f02d7
--- /dev/null
+++ b/include/dr/mp/containers/matrix_formats/csr_row_distribution.hpp
@@ -0,0 +1,376 @@
+// SPDX-FileCopyrightText: Intel Corporation
+//
+// SPDX-License-Identifier: BSD-3-Clause
+#pragma once
+#include <dr/detail/matrix_entry.hpp>
+#include <dr/detail/multiply_view.hpp>
+#include <dr/mp/containers/matrix_formats/csr_row_segment.hpp>
+#include <dr/views/csr_matrix_view.hpp>
+#include <fmt/core.h>
+
+namespace dr::mp {
+template <typename T, typename I, class BackendT = MpiBackend>
+class csr_row_distribution {
+  using view_tuple = std::tuple<std::size_t, std::size_t, std::size_t, I *>;
+
+public:
+  using value_type = dr::matrix_entry<T, I>;
+  using segment_type = csr_row_segment<csr_row_distribution>;
+  using elem_type = T;
+  using index_type = I;
+  using difference_type = std::ptrdiff_t;
+
+  csr_row_distribution(const csr_row_distribution &) = delete;
+  csr_row_distribution &operator=(const csr_row_distribution &) = delete;
+  csr_row_distribution(csr_row_distribution &&) { assert(false); }
+
+  csr_row_distribution(dr::views::csr_matrix_view<T, I> csr_view,
+                       distribution dist = distribution(),
+                       std::size_t root = 0) {
+    init(csr_view, dist, root);
+  }
+
+  ~csr_row_distribution() {
+    if (!finalized()) {
+      fence();
+      if (vals_data_ != nullptr) {
+        vals_backend_.deallocate(vals_data_, vals_size_ * sizeof(index_type));
+        cols_backend_.deallocate(cols_data_, vals_size_ * sizeof(index_type));
+        alloc.deallocate(view_helper_const, 1);
+      }
+    }
+  }
+  std::size_t get_id_in_segment(std::size_t offset) const {
+    assert(offset < nnz_);
+    auto pos_iter =
+        std::upper_bound(val_offsets_.begin(), val_offsets_.end(), offset) - 1;
+    return offset - *pos_iter;
+  }
+  std::size_t get_segment_from_offset(std::size_t offset) const {
+    assert(offset < nnz_);
+    auto pos_iter =
+        std::upper_bound(val_offsets_.begin(), val_offsets_.end(), offset);
+    return rng::distance(val_offsets_.begin(), pos_iter) - 1;
+  }
+  auto segments() const { return rng::views::all(segments_); }
+  auto nnz() const { return nnz_; }
+  auto shape() const { return shape_; }
+  void fence() const {
+    vals_backend_.fence();
+    cols_backend_.fence();
+  }
+  template <typename C>
+  auto local_gemv(C &res, T *vals, std::size_t vals_width) const {
+    auto rank = cols_backend_.getrank();
+    if (shape_[0] <= segment_size_ * rank)
+      return;
+    auto size = std::min(segment_size_, shape_[0] - segment_size_ * rank);
+    auto vals_len = shape_[1];
+    if (dr::mp::use_sycl()) {
+      auto local_vals = vals_data_;
+      auto local_cols = cols_data_;
+      auto offset = val_offsets_[rank];
+      auto real_segment_size = std::min(nnz_ - offset, val_sizes_[rank]);
+      auto rows_data = dr::__detail::direct_iterator(
+          dr::mp::local_segment(*rows_data_).begin());
+      auto res_col_len = segment_size_;
+      std::size_t wg = 32;
+      while (vals_width * size * wg > INT_MAX) {
+        // this check is necessary, because sycl does not permit ranges
+        // exceeding integer limit
+        wg /= 2;
+      }
+      assert(wg > 0);
+      dr::mp::sycl_queue()
+          .submit([&](auto &&h) {
+            h.parallel_for(
+                sycl::nd_range<1>(vals_width * size * wg, wg), [=](auto item) {
+                  auto input_j = item.get_group(0) / size;
+                  auto idx = item.get_group(0) % size;
+                  auto local_id = item.get_local_id();
+                  auto group_size = item.get_local_range(0);
+                  std::size_t lower_bound = 0;
+                  if (rows_data[idx] > offset) {
+                    lower_bound = rows_data[idx] - offset;
+                  }
+                  std::size_t upper_bound = real_segment_size;
+                  if (idx < size - 1) {
+                    upper_bound = rows_data[idx + 1] - offset;
+                  }
+                  T sum = 0;
+                  for (auto i = lower_bound + local_id; i < upper_bound;
+                       i += group_size) {
+                    auto colNum = local_cols[i];
+                    auto matrixVal = vals[colNum + input_j * vals_len];
+                    auto vectorVal = local_vals[i];
+                    sum += matrixVal * vectorVal;
+                  }
+
+                  sycl::atomic_ref<T, sycl::memory_order::relaxed,
+                                   sycl::memory_scope::device>
+                      c_ref(res[idx + input_j * res_col_len]);
+                  c_ref += sum;
+                });
+          })
+          .wait();
+    } else {
+      auto local_rows = dr::mp::local_segment(*rows_data_);
+      auto val_count = val_sizes_[rank];
+      auto row_i = 0;
+      auto position = val_offsets_[rank];
+      auto current_row_position = local_rows[1];
+
+      for (int i = 0; i < val_count; i++) {
+        while (row_i + 1 < size && position + i >= current_row_position) {
+          row_i++;
+          current_row_position = local_rows[row_i + 1];
+        }
+        for (auto j = 0; j < vals_width; j++) {
+          res[row_i + j * segment_size_] +=
+              vals_data_[i] * vals[cols_data_[i] + j * vals_len];
+        }
+      }
+    }
+  }
+
+  template <typename C>
+  auto local_gemv_and_collect(std::size_t root, C &res, T *&vals,
+                              std::size_t vals_width) const {
+    assert(res.size() == shape_.first * vals_width);
+    __detail::allocator<T> alloc;
+    auto res_alloc = alloc.allocate(segment_size_ * vals_width);
+    if (use_sycl()) {
+      sycl_queue().fill(res_alloc, 0, segment_size_ * vals_width).wait();
+    } else {
+      std::fill(res_alloc, res_alloc + segment_size_ * vals_width, 0);
+    }
+
+    local_gemv(res_alloc, vals, vals_width);
+
+    gather_gemv_vector(root, res, res_alloc, vals_width);
+    fence();
+    alloc.deallocate(res_alloc, segment_size_ * vals_width);
+  }
+
+private:
+  friend csr_row_segment_iterator<csr_row_distribution>;
+
+  template <typename C, typename A>
+  void gather_gemv_vector(std::size_t root, C &res, A &partial_res,
+                          std::size_t vals_width) const {
+    auto communicator = default_comm();
+    __detail::allocator<T> alloc;
+
+    if (communicator.rank() == root) {
+      auto scratch =
+          alloc.allocate(segment_size_ * communicator.size() * vals_width);
+      communicator.gather(partial_res, scratch, segment_size_ * vals_width,
+                          root);
+      T *temp = nullptr;
+      if (use_sycl()) {
+        temp = new T[res.size()];
+      }
+      for (auto j = 0; j < communicator.size(); j++) {
+        if (j * segment_size_ >= shape_.first) {
+          break;
+        }
+        auto comm_segment_size =
+            std::min(segment_size_, shape_.first - j * segment_size_);
+
+        for (auto i = 0; i < vals_width; i++) {
+          auto piece_start =
+              scratch + j * vals_width * segment_size_ + i * segment_size_;
+
+          if (use_sycl()) {
+            __detail::sycl_copy(piece_start,
+                                temp + shape_.first * i + j * segment_size_,
+                                comm_segment_size);
+          } else {
+            std::copy(piece_start, piece_start + comm_segment_size,
+                      res.begin() + shape_.first * i + j * segment_size_);
+          }
+        }
+      }
+      if (use_sycl()) {
+        std::copy(temp, temp + res.size(), res.begin());
+        delete[] temp;
+      }
+      alloc.deallocate(scratch,
+                       segment_size_ * communicator.size() * vals_width);
+    } else {
+      communicator.gather(partial_res, static_cast<T *>(nullptr),
+                          segment_size_ * vals_width, root);
+    }
+  }
+  void init(dr::views::csr_matrix_view<T, I> csr_view, auto dist,
+            std::size_t root) {
+    distribution_ = dist;
+    auto rank = vals_backend_.getrank();
+
+    std::size_t initial_data[3];
+    if (root == rank) {
+      initial_data[0] = csr_view.size();
+      initial_data[1] = csr_view.shape().first;
+      initial_data[2] = csr_view.shape().second;
+      default_comm().bcast(initial_data, sizeof(std::size_t) * 3, root);
+    } else {
+      default_comm().bcast(initial_data, sizeof(std::size_t) * 3, root);
+    }
+
+    nnz_ = initial_data[0];
+    shape_ = {initial_data[1], initial_data[2]};
+
+    rows_data_ = std::make_shared<distributed_vector<I>>(shape_.first);
+
+    dr::mp::copy(root,
+                 std::ranges::subrange(csr_view.rowptr_data(),
+                                       csr_view.rowptr_data() + shape_.first),
+                 rows_data_->begin());
+
+    auto row_info_size = default_comm().size() * 2;
+    std::size_t *val_information = new std::size_t[row_info_size];
+    val_offsets_.reserve(row_info_size);
+    val_sizes_.reserve(row_info_size);
+    if (rank == root) {
+      for (int i = 0; i < default_comm().size(); i++) {
+        auto first_index = rows_data_->get_segment_offset(i);
+        if (first_index > shape_.first) {
+          val_offsets_.push_back(nnz_);
+          val_sizes_.push_back(0);
+          continue;
+        }
+        std::size_t lower_limit = csr_view.rowptr_data()[first_index];
+        std::size_t higher_limit = nnz_;
+        if (rows_data_->get_segment_offset(i + 1) < shape_.first) {
+          auto last_index = rows_data_->get_segment_offset(i + 1);
+          higher_limit = csr_view.rowptr_data()[last_index];
+        }
+        val_offsets_.push_back(lower_limit);
+        val_sizes_.push_back(higher_limit - lower_limit);
+        val_information[i] = lower_limit;
+        val_information[i + default_comm().size()] = higher_limit - lower_limit;
+      }
+      default_comm().bcast(val_information, sizeof(std::size_t) * row_info_size,
+                           root);
+    } else {
+      default_comm().bcast(val_information, sizeof(std::size_t) * row_info_size,
+                           root);
+      for (int i = 0; i < default_comm().size(); i++) {
+        val_offsets_.push_back(val_information[i]);
+        val_sizes_.push_back(val_information[default_comm().size() + i]);
+      }
+    }
+    delete[] val_information;
+    vals_size_ = std::max(val_sizes_[rank], static_cast<std::size_t>(1));
+
+    cols_data_ =
+        static_cast<I *>(cols_backend_.allocate(vals_size_ * sizeof(I)));
+    vals_data_ =
+        static_cast<T *>(vals_backend_.allocate(vals_size_ * sizeof(T)));
+
+    fence();
+    if (rank == root) {
+      for (std::size_t i = 0; i < default_comm().size(); i++) {
+        auto lower_limit = val_offsets_[i];
+        auto row_size = val_sizes_[i];
+        if (row_size > 0) {
+          vals_backend_.putmem(csr_view.values_data() + lower_limit, 0,
+                               row_size * sizeof(T), i);
+          cols_backend_.putmem(csr_view.colind_data() + lower_limit, 0,
+                               row_size * sizeof(I), i);
+        }
+      }
+    }
+
+    std::size_t segment_index = 0;
+    segment_size_ = rows_data_->segment_size();
+    for (std::size_t i = 0; i < default_comm().size(); i++) {
+      segments_.emplace_back(
+          this, segment_index++, val_sizes_[i],
+          std::max(val_sizes_[i], static_cast<std::size_t>(1)));
+    }
+
+    auto local_rows = static_cast<I *>(nullptr);
+    if (rows_data_->segments().size() > rank) {
+      local_rows = rows_data_->segments()[rank].begin().local();
+    }
+    auto offset = val_offsets_[rank];
+    auto real_row_size =
+        std::min(rows_data_->segment_size(),
+                 shape_.first - rows_data_->segment_size() * rank);
+    auto my_tuple = std::make_tuple(real_row_size, segment_size_ * rank, offset,
+                                    local_rows);
+    view_helper_const = alloc.allocate(1);
+
+    if (use_sycl()) {
+      sycl_queue()
+          .memcpy(view_helper_const, &my_tuple, sizeof(view_tuple))
+          .wait();
+    } else {
+      view_helper_const[0] = my_tuple;
+    }
+
+    if (rows_data_->segments().size() > rank) {
+      local_view = std::make_shared<view_type>(get_elem_view(
+          vals_size_, view_helper_const, cols_data_, vals_data_, rank));
+    }
+    fence();
+  }
+
+  static auto get_elem_view(std::size_t vals_size, view_tuple *helper_tuple,
+                            index_type *local_cols, elem_type *local_vals,
+                            std::size_t rank) {
+    auto local_vals_range = rng::subrange(local_vals, local_vals + vals_size);
+    auto local_cols_range = rng::subrange(local_cols, local_cols + vals_size);
+    auto zipped_results = rng::views::zip(local_vals_range, local_cols_range);
+    auto enumerated_zipped = rng::views::enumerate(zipped_results);
+    // we need to use multiply_view here,
+    // because lambda is not properly copied to sycl environment
+    // when we use variable capture
+    auto multiply_range = dr::__detail::multiply_view(
+        rng::subrange(helper_tuple, helper_tuple + 1), vals_size);
+    auto enumerted_with_data =
+        rng::views::zip(enumerated_zipped, multiply_range);
+
+    auto transformer = [=](auto x) {
+      auto [entry, tuple] = x;
+      auto [row_size, row_offset, offset, local_rows] = tuple;
+      auto [index, pair] = entry;
+      auto [val, column] = pair;
+      auto row =
+          rng::distance(local_rows,
+                        std::upper_bound(local_rows, local_rows + row_size,
+                                         offset + index) -
+                            1) +
+          row_offset;
+      dr::index<index_type> index_obj(row, column);
+      value_type entry_obj(index_obj, val);
+      return entry_obj;
+    };
+    return rng::transform_view(enumerted_with_data, std::move(transformer));
+  }
+
+  using view_type = decltype(get_elem_view(0, nullptr, nullptr, nullptr, 0));
+
+  dr::mp::__detail::allocator<view_tuple> alloc;
+  view_tuple *view_helper_const;
+  std::shared_ptr<view_type> local_view;
+
+  std::size_t segment_size_ = 0;
+  std::size_t vals_size_ = 0;
+  std::vector<std::size_t> val_offsets_;
+  std::vector<std::size_t> val_sizes_;
+
+  index_type *cols_data_ = nullptr;
+  BackendT cols_backend_;
+
+  elem_type *vals_data_ = nullptr;
+  BackendT vals_backend_;
+
+  distribution distribution_;
+  dr::index<size_t> shape_;
+  std::size_t nnz_;
+  std::vector<segment_type> segments_;
+  std::shared_ptr<distributed_vector<I>> rows_data_ = nullptr;
+};
+} // namespace dr::mp
diff --git a/include/dr/mp/containers/matrix_formats/csr_row_segment.hpp b/include/dr/mp/containers/matrix_formats/csr_row_segment.hpp
new file mode 100644
index 0000000000..ce0f627e3e
--- /dev/null
+++ b/include/dr/mp/containers/matrix_formats/csr_row_segment.hpp
@@ -0,0 +1,292 @@
+// SPDX-FileCopyrightText: Intel Corporation
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#pragma once
+
+namespace dr::mp {
+template <typename DSM> class csr_row_segment_iterator;
+
+template <typename DSM> class csr_row_segment_reference {
+  using iterator = csr_row_segment_iterator<DSM>;
+
+public:
+  using value_type = typename DSM::value_type;
+  using index_type = typename DSM::index_type;
+  using elem_type = typename DSM::elem_type;
+
+  csr_row_segment_reference(const iterator it) : iterator_(it) {}
+
+  operator value_type() const { return iterator_.get(); }
+  operator std::pair<std::pair<index_type, index_type>, elem_type>() const {
+    return iterator_.get();
+  }
+
+  template <std::size_t Index> auto get() const noexcept {
+    if constexpr (Index == 0) {
+      return iterator_.get_index();
+    }
+    if constexpr (Index == 1) {
+      return iterator_.get_value();
+    }
+  }
+
+  auto operator=(const csr_row_segment_reference &other) const {
+    *this = value_type(other);
+    return *this;
+  }
+  auto operator&() const { return iterator_; }
+
+private:
+  const iterator iterator_;
+}; // csr_row_segment_reference
+
+template <typename DSM> class csr_row_segment_iterator {
+public:
+  using value_type = typename DSM::value_type;
+  using index_type = typename DSM::index_type;
+  using elem_type = typename DSM::elem_type;
+  using difference_type = typename DSM::difference_type;
+
+  csr_row_segment_iterator() = default;
+  csr_row_segment_iterator(DSM *dsm, std::size_t segment_index,
+                           std::size_t index) {
+    dsm_ = dsm;
+    segment_index_ = segment_index;
+    index_ = index;
+  }
+
+  auto operator<=>(const csr_row_segment_iterator &other) const noexcept {
+    // assertion below checks against compare dereferenceable iterator to a
+    // singular iterator and against attempt to compare iterators from different
+    // sequences like _Safe_iterator<gnu_cxx::normal_iterator> does
+    assert(dsm_ == other.dsm_);
+    return segment_index_ == other.segment_index_
+               ? index_ <=> other.index_
+               : segment_index_ <=> other.segment_index_;
+  }
+
+  // Comparison
+  bool operator==(const csr_row_segment_iterator &other) const noexcept {
+    return (*this <=> other) == 0;
+  }
+
+  // Only this arithmetic manipulate internal state
+  auto &operator+=(difference_type n) {
+    assert(dsm_ != nullptr);
+    assert(n >= 0 || static_cast<difference_type>(index_) >= -n);
+    index_ += n;
+    return *this;
+  }
+
+  auto &operator-=(difference_type n) { return *this += (-n); }
+
+  difference_type
+  operator-(const csr_row_segment_iterator &other) const noexcept {
+    assert(dsm_ != nullptr && dsm_ == other.dsm_);
+    assert(index_ >= other.index_);
+    return index_ - other.index_;
+  }
+
+  // prefix
+  auto &operator++() {
+    *this += 1;
+    return *this;
+  }
+  auto &operator--() {
+    *this -= 1;
+    return *this;
+  }
+
+  // postfix
+  auto operator++(int) {
+    auto prev = *this;
+    *this += 1;
+    return prev;
+  }
+  auto operator--(int) {
+    auto prev = *this;
+    *this -= 1;
+    return prev;
+  }
+
+  auto operator+(difference_type n) const {
+    auto p = *this;
+    p += n;
+    return p;
+  }
+  auto operator-(difference_type n) const {
+    auto p = *this;
+    p -= n;
+    return p;
+  }
+
+  // When *this is not first in the expression
+  friend auto operator+(difference_type n,
+                        const csr_row_segment_iterator &other) {
+    return other + n;
+  }
+
+  // dereference
+  auto operator*() const {
+    assert(dsm_ != nullptr);
+    return csr_row_segment_reference<DSM>{*this};
+  }
+  auto operator[](difference_type n) const {
+    assert(dsm_ != nullptr);
+    return *(*this + n);
+  }
+
+  void get(value_type *dst, std::size_t size) const {
+    auto elems = new elem_type[size];
+    auto indexes = new dr::index<index_type>[size];
+    get_value(elems, size);
+    get_index(indexes, size);
+    for (std::size_t i = 0; i < size; i++) {
+      *(dst + i) = {indexes[i], elems[i]};
+    }
+  }
+
+  value_type get() const {
+    value_type val;
+    get(&val, 1);
+    return val;
+  }
+
+  void get_value(elem_type *dst, std::size_t size) const {
+    assert(dsm_ != nullptr);
+    assert(segment_index_ * dsm_->segment_size_ + index_ < dsm_->nnz_);
+    dsm_->vals_backend_.getmem(dst, index_ * sizeof(elem_type),
+                               size * sizeof(elem_type), segment_index_);
+  }
+
+  elem_type get_value() const {
+    elem_type val;
+    get_value(&val, 1);
+    return val;
+  }
+
+  void get_index(dr::index<index_type> *dst, std::size_t size) const {
+    assert(dsm_ != nullptr);
+    assert(segment_index_ * dsm_->segment_size_ + index_ < dsm_->nnz_);
+    index_type *col_data;
+    if (rank() == dsm_->cols_backend_.getrank()) {
+      col_data = dsm_->cols_data_ + index_;
+    } else {
+      col_data = new index_type[size];
+      dsm_->cols_backend_.getmem(col_data, index_ * sizeof(index_type),
+                                 size * sizeof(index_type), segment_index_);
+    }
+    index_type *rows;
+    std::size_t rows_length = dsm_->segment_size_;
+    rows = new index_type[rows_length];
+    (dsm_->rows_data_->segments()[segment_index_].begin())
+        .get(rows, rows_length);
+
+    auto position = dsm_->val_offsets_[segment_index_] + index_;
+    auto rows_iter = rows + 1;
+    index_type *cols_iter = col_data;
+    auto iter = dst;
+    std::size_t current_row = dsm_->segment_size_ * segment_index_;
+    std::size_t last_row =
+        std::min(current_row + rows_length - 1, dsm_->shape_[0] - 1);
+
+    for (int i = 0; i < size; i++) {
+      while (current_row < last_row && *rows_iter <= position + i) {
+        rows_iter++;
+        current_row++;
+      }
+      iter->first = current_row;
+      iter->second = *cols_iter;
+      cols_iter++;
+      iter++;
+    }
+    if (rank() != dsm_->cols_backend_.getrank()) {
+      delete[] col_data;
+    }
+    delete[] rows;
+  }
+
+  dr::index<index_type> get_index() const {
+    dr::index<index_type> val;
+    get_index(&val, 1);
+    return val;
+  }
+
+  auto rank() const {
+    assert(dsm_ != nullptr);
+    return segment_index_;
+  }
+
+  auto segments() const {
+    assert(dsm_ != nullptr);
+    return dr::__detail::drop_segments(dsm_->segments(), segment_index_,
+                                       index_);
+  }
+
+  auto local() const {
+    const auto my_process_segment_index = dsm_->vals_backend_.getrank();
+    assert(my_process_segment_index == segment_index_);
+    if (dsm_->local_view == nullptr) {
+      throw std::runtime_error("Requesting not existing local segment");
+    }
+    return dsm_->local_view->begin();
+  }
+
+private:
+  // all fields need to be initialized by default ctor so every default
+  // constructed iter is equal to any other default constructed iter
+  DSM *dsm_ = nullptr;
+  std::size_t segment_index_ = 0;
+  std::size_t index_ = 0;
+}; // csr_row_segment_iterator
+
+template <typename DSM> class csr_row_segment {
+private:
+  using iterator = csr_row_segment_iterator<DSM>;
+
+public:
+  using difference_type = std::ptrdiff_t;
+  csr_row_segment() = default;
+  csr_row_segment(DSM *dsm, std::size_t segment_index, std::size_t size,
+                  std::size_t reserved) {
+    dsm_ = dsm;
+    segment_index_ = segment_index;
+    size_ = size;
+    reserved_ = reserved;
+    assert(dsm_ != nullptr);
+  }
+
+  auto size() const {
+    assert(dsm_ != nullptr);
+    return size_;
+  }
+
+  auto begin() const { return iterator(dsm_, segment_index_, 0); }
+  auto end() const { return begin() + size(); }
+  auto reserved() const { return reserved_; }
+
+  auto operator[](difference_type n) const { return *(begin() + n); }
+
+  bool is_local() const { return segment_index_ == default_comm().rank(); }
+
+private:
+  DSM *dsm_ = nullptr;
+  std::size_t segment_index_;
+  std::size_t size_;
+  std::size_t reserved_;
+}; // csr_row_segment
+
+} // namespace dr::mp
+
+namespace std {
+template <typename DSM>
+struct tuple_size<dr::mp::csr_row_segment_reference<DSM>>
+    : std::integral_constant<std::size_t, 2> {};
+
+template <std::size_t Index, typename DSM>
+struct tuple_element<Index, dr::mp::csr_row_segment_reference<DSM>>
+    : tuple_element<Index, std::tuple<dr::index<typename DSM::index_type>,
+                                      typename DSM::elem_type>> {};
+
+} // namespace std
diff --git a/include/dr/mp/sycl_support.hpp b/include/dr/mp/sycl_support.hpp
index 791999fe5c..33d34159a8 100644
--- a/include/dr/mp/sycl_support.hpp
+++ b/include/dr/mp/sycl_support.hpp
@@ -17,6 +17,22 @@ sycl::queue &sycl_queue();
 
 namespace dr::mp::__detail {
 
+// sometimes we only want to dereference iterator inside SYCL
+template <typename T> auto sycl_get_deref(T v) {
+  using deref_type = std::remove_reference<decltype(*v)>::type;
+  deref_type temp;
+  {
+    sycl::buffer<deref_type> buff(&temp, 1);
+    sycl_queue()
+        .submit([&](auto &&h) {
+          sycl::accessor access(buff, h, sycl::write_only, sycl::no_init);
+          h.single_task([=](auto i) { access[0] = *v; });
+        })
+        .wait();
+  }
+  return temp;
+}
+
 template <typename T> T sycl_get(T &v) {
   T temp;
   sycl_queue().memcpy(&temp, &v, sizeof(v)).wait();
diff --git a/include/dr/sp/algorithms/matrix/local_gemv.hpp b/include/dr/sp/algorithms/matrix/local_gemv.hpp
index 302b14c789..7c3c32baa5 100644
--- a/include/dr/sp/algorithms/matrix/local_gemv.hpp
+++ b/include/dr/sp/algorithms/matrix/local_gemv.hpp
@@ -19,8 +19,9 @@ namespace __detail {
 template <typename T, typename I, std::random_access_iterator Iter,
           typename... Args>
   requires(std::is_same_v<std::iter_value_t<Iter>, T>)
-auto custom_gemv(sycl::queue &q, csr_matrix_view<T, I, Args...> a, Iter b,
-                 Iter c, const std::vector<sycl::event> &dependencies = {}) {
+auto custom_gemv(sycl::queue &q, dr::views::csr_matrix_view<T, I, Args...> a,
+                 Iter b, Iter c,
+                 const std::vector<sycl::event> &dependencies = {}) {
   std::size_t wg = 32;
 
   auto event = q.submit([&](auto &&h) {
@@ -55,7 +56,8 @@ auto custom_gemv(sycl::queue &q, csr_matrix_view<T, I, Args...> a, Iter b,
 template <typename T, typename I, std::random_access_iterator Iter,
           typename... Args>
   requires(std::is_same_v<std::iter_value_t<Iter>, T>)
-auto mkl_gemv(sycl::queue &q, csr_matrix_view<T, I, Args...> a, Iter b, Iter c,
+auto mkl_gemv(sycl::queue &q, dr::views::csr_matrix_view<T, I, Args...> a,
+              Iter b, Iter c,
               const std::vector<sycl::event> &dependencies = {}) {
 
   oneapi::mkl::sparse::matrix_handle_t a_handle;
@@ -78,8 +80,9 @@ auto mkl_gemv(sycl::queue &q, csr_matrix_view<T, I, Args...> a, Iter b, Iter c,
 template <typename T, typename I, std::random_access_iterator Iter,
           typename... Args>
   requires(std::is_same_v<std::iter_value_t<Iter>, T>)
-auto local_gemv(sycl::queue &q, csr_matrix_view<T, I, Args...> a, Iter b,
-                Iter c, const std::vector<sycl::event> &dependencies = {}) {
+auto local_gemv(sycl::queue &q, dr::views::csr_matrix_view<T, I, Args...> a,
+                Iter b, Iter c,
+                const std::vector<sycl::event> &dependencies = {}) {
   return mkl_gemv(q, a, b, c, dependencies);
 }
 
@@ -88,8 +91,9 @@ auto local_gemv(sycl::queue &q, csr_matrix_view<T, I, Args...> a, Iter b,
 template <typename T, typename I, std::random_access_iterator Iter,
           typename... Args>
   requires(std::is_same_v<std::iter_value_t<Iter>, T>)
-auto local_gemv(sycl::queue &q, csr_matrix_view<T, I, Args...> a, Iter b,
-                Iter c, const std::vector<sycl::event> &dependencies = {}) {
+auto local_gemv(sycl::queue &q, dr::views::csr_matrix_view<T, I, Args...> a,
+                Iter b, Iter c,
+                const std::vector<sycl::event> &dependencies = {}) {
   return custom_gemv(q, a, b, c, dependencies);
 }
 
diff --git a/include/dr/sp/containers/distributed_dense_matrix.hpp b/include/dr/sp/containers/distributed_dense_matrix.hpp
index de6b1fa332..0c53534cbf 100644
--- a/include/dr/sp/containers/distributed_dense_matrix.hpp
+++ b/include/dr/sp/containers/distributed_dense_matrix.hpp
@@ -7,8 +7,8 @@
 #include <memory>
 
 #include <dr/detail/index.hpp>
+#include <dr/detail/matrix_entry.hpp>
 #include <dr/detail/owning_view.hpp>
-#include <dr/sp/containers/matrix_entry.hpp>
 #include <dr/sp/containers/matrix_partition.hpp>
 #include <dr/sp/containers/sequential/dense_matrix.hpp>
 #include <dr/sp/device_vector.hpp>
@@ -25,9 +25,9 @@ template <typename T, typename L> class distributed_dense_matrix_accessor {
   using scalar_value_type = rng::range_value_t<L>;
   using scalar_reference = rng::range_reference_t<L>;
 
-  using value_type = dr::sp::matrix_entry<scalar_value_type, std::size_t>;
+  using value_type = dr::matrix_entry<scalar_value_type, std::size_t>;
 
-  using reference = dr::sp::matrix_ref<T, std::size_t, scalar_reference>;
+  using reference = dr::matrix_ref<T, std::size_t, scalar_reference>;
 
   using iterator_category = std::random_access_iterator_tag;
 
@@ -138,15 +138,15 @@ template <typename T> class distributed_dense_matrix {
   using size_type = std::size_t;
   using difference_type = std::ptrdiff_t;
 
-  using value_type = dr::sp::matrix_entry<T>;
+  using value_type = dr::matrix_entry<T>;
 
   using scalar_reference = rng::range_reference_t<
       dr::sp::device_vector<T, dr::sp::device_allocator<T>>>;
   using const_scalar_reference = rng::range_reference_t<
       const dr::sp::device_vector<T, dr::sp::device_allocator<T>>>;
 
-  using reference = dr::sp::matrix_ref<T, scalar_reference>;
-  using const_reference = dr::sp::matrix_ref<const T, const_scalar_reference>;
+  using reference = dr::matrix_ref<T, scalar_reference>;
+  using const_reference = dr::matrix_ref<const T, const_scalar_reference>;
 
   using key_type = dr::index<>;
 
diff --git a/include/dr/sp/containers/sequential/dense_matrix.hpp b/include/dr/sp/containers/sequential/dense_matrix.hpp
index 5e4ccb8723..58d77c7ecd 100644
--- a/include/dr/sp/containers/sequential/dense_matrix.hpp
+++ b/include/dr/sp/containers/sequential/dense_matrix.hpp
@@ -8,7 +8,7 @@
 
 #include <dr/detail/index.hpp>
 #include <dr/detail/iterator_adaptor.hpp>
-#include <dr/sp/containers/matrix_entry.hpp>
+#include <dr/detail/matrix_entry.hpp>
 #include <dr/sp/views/dense_column_view.hpp>
 #include <dr/sp/views/dense_matrix_iterator.hpp>
 #include <dr/sp/views/dense_row_view.hpp>
@@ -26,7 +26,7 @@ class dense_matrix {
   using scalar_pointer = typename std::allocator_traits<Allocator>::pointer;
 
   using scalar_reference = std::iter_reference_t<scalar_pointer>;
-  using reference = dr::sp::matrix_ref<T, std::size_t, scalar_reference>;
+  using reference = dr::matrix_ref<T, std::size_t, scalar_reference>;
 
   using key_type = dr::index<>;
   using map_type = T;
diff --git a/include/dr/sp/containers/sparse_matrix.hpp b/include/dr/sp/containers/sparse_matrix.hpp
index a8184633cb..83adb869c0 100644
--- a/include/dr/sp/containers/sparse_matrix.hpp
+++ b/include/dr/sp/containers/sparse_matrix.hpp
@@ -4,15 +4,15 @@
 
 #pragma once
 
+#include <dr/detail/generate_csr.hpp>
 #include <dr/detail/index.hpp>
+#include <dr/detail/matrix_entry.hpp>
 #include <dr/sp/algorithms/copy.hpp>
-#include <dr/sp/containers/matrix_entry.hpp>
 #include <dr/sp/containers/matrix_partition.hpp>
 #include <dr/sp/device_vector.hpp>
 #include <dr/sp/distributed_span.hpp>
 #include <dr/sp/init.hpp>
-#include <dr/sp/util/generate_random.hpp>
-#include <dr/sp/views/csr_matrix_view.hpp>
+#include <dr/views/csr_matrix_view.hpp>
 #include <iterator>
 
 namespace dr::sp {
@@ -137,19 +137,19 @@ template <typename T, std::integral I = std::int64_t> class sparse_matrix {
   using size_type = std::size_t;
   using difference_type = std::ptrdiff_t;
 
-  using value_type = dr::sp::matrix_entry<T>;
+  using value_type = dr::matrix_entry<T>;
 
   using scalar_reference = rng::range_reference_t<
       dr::sp::device_vector<T, dr::sp::device_allocator<T>>>;
   using const_scalar_reference = rng::range_reference_t<
       const dr::sp::device_vector<T, dr::sp::device_allocator<T>>>;
 
-  using reference = dr::sp::matrix_ref<T, scalar_reference>;
-  using const_reference = dr::sp::matrix_ref<const T, const_scalar_reference>;
+  using reference = dr::matrix_ref<T, scalar_reference>;
+  using const_reference = dr::matrix_ref<const T, const_scalar_reference>;
 
   using key_type = dr::index<I>;
 
-  using segment_type = dr::sp::csr_matrix_view<
+  using segment_type = dr::views::csr_matrix_view<
       T, I,
       rng::iterator_t<dr::sp::device_vector<T, dr::sp::device_allocator<T>>>,
       rng::iterator_t<dr::sp::device_vector<I, dr::sp::device_allocator<I>>>>;
@@ -210,7 +210,7 @@ template <typename T, std::integral I = std::int64_t> class sparse_matrix {
   // in `gemv_benchmark`.  I believe this is a SYCL bug.
   template <typename... Args>
   auto copy_tile_async(key_type tile_index,
-                       csr_matrix_view<T, I, Args...> tile_view) {
+                       dr::views::csr_matrix_view<T, I, Args...> tile_view) {
     std::size_t tile_idx = tile_index[0] * grid_shape_[1] + tile_index[1];
     auto &&values = values_[tile_idx];
     auto &&colind = colind_[tile_idx];
@@ -250,7 +250,7 @@ template <typename T, std::integral I = std::int64_t> class sparse_matrix {
 
   template <typename... Args>
   void copy_tile(key_type tile_index,
-                 csr_matrix_view<T, I, Args...> tile_view) {
+                 dr::views::csr_matrix_view<T, I, Args...> tile_view) {
     copy_tile_async(tile_index, tile_view).wait();
   }
 
diff --git a/include/dr/sp/util/generate_random.hpp b/include/dr/sp/util/generate_random.hpp
deleted file mode 100644
index a284ce927a..0000000000
--- a/include/dr/sp/util/generate_random.hpp
+++ /dev/null
@@ -1,92 +0,0 @@
-// SPDX-FileCopyrightText: Intel Corporation
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#pragma once
-
-#include <concepts>
-#include <dr/sp/views/csr_matrix_view.hpp>
-#include <map>
-#include <random>
-
-namespace dr::sp {
-
-namespace {
-
-template <typename T> struct uniform_distribution {
-  using type = std::uniform_int_distribution<T>;
-};
-
-template <std::floating_point T> struct uniform_distribution<T> {
-  using type = std::uniform_real_distribution<T>;
-};
-
-template <typename T>
-using uniform_distribution_t = typename uniform_distribution<T>::type;
-
-} // namespace
-
-template <typename T = float, std::integral I = std::size_t>
-auto generate_random_csr(dr::index<I> shape, double density = 0.01,
-                         unsigned int seed = 0) {
-
-  assert(density >= 0.0 && density < 1.0);
-
-  std::map<std::pair<I, I>, T> tuples;
-
-  std::size_t nnz = density * shape[0] * shape[1];
-
-  std::mt19937 gen(seed);
-  std::uniform_int_distribution<I> row(0, shape[0] - 1);
-  std::uniform_int_distribution<I> column(0, shape[1] - 1);
-
-  uniform_distribution_t<T> value_gen(0, 1);
-
-  while (tuples.size() < nnz) {
-    auto i = row(gen);
-    auto j = column(gen);
-    if (tuples.find({i, j}) == tuples.end()) {
-      T value = value_gen(gen);
-      tuples.insert({{i, j}, value});
-    }
-  }
-
-  T *values = new T[nnz];
-  I *rowptr = new I[shape[0] + 1];
-  I *colind = new I[nnz];
-
-  rowptr[0] = 0;
-
-  std::size_t r = 0;
-  std::size_t c = 0;
-  for (auto iter = tuples.begin(); iter != tuples.end(); ++iter) {
-    auto &&[index, value] = *iter;
-    auto &&[i, j] = index;
-
-    values[c] = value;
-    colind[c] = j;
-
-    while (r < i) {
-      if (r + 1 > shape[0]) {
-        // TODO: exception?
-        // throw std::runtime_error("csr_matrix_impl_: given invalid matrix");
-      }
-      rowptr[r + 1] = c;
-      r++;
-    }
-    c++;
-
-    if (c > nnz) {
-      // TODO: exception?
-      // throw std::runtime_error("csr_matrix_impl_: given invalid matrix");
-    }
-  }
-
-  for (; r < shape[0]; r++) {
-    rowptr[r + 1] = nnz;
-  }
-
-  return csr_matrix_view(values, rowptr, colind, shape, nnz, 0);
-}
-
-} // namespace dr::sp
diff --git a/include/dr/sp/util/matrix_io.hpp b/include/dr/sp/util/matrix_io.hpp
index d34fd132f7..21561b52c4 100644
--- a/include/dr/sp/util/matrix_io.hpp
+++ b/include/dr/sp/util/matrix_io.hpp
@@ -12,220 +12,17 @@
 #include <tuple>
 #include <vector>
 
-#include <dr/sp/util/coo_matrix.hpp>
-#include <dr/sp/views/csr_matrix_view.hpp>
+#include <dr/detail/matrix_io.hpp>
+#include <dr/views/csr_matrix_view.hpp>
 
 namespace dr::sp {
 
-namespace __detail {
-
-// Preconditions:
-// 1) `tuples` sorted by row, column
-// 2) `tuples` has shape `shape`
-// 3) `tuples` has `nnz` elements
-template <typename Tuples, typename Allocator>
-auto convert_to_csr(Tuples &&tuples, dr::index<> shape, std::size_t nnz,
-                    Allocator &&allocator) {
-  auto &&[index, v] = *tuples.begin();
-  auto &&[i, j] = index;
-
-  using T = std::remove_reference_t<decltype(v)>;
-  using I = std::remove_reference_t<decltype(i)>;
-
-  typename std::allocator_traits<Allocator>::template rebind_alloc<I>
-      i_allocator(allocator);
-
-  T *values = allocator.allocate(nnz);
-  I *rowptr = i_allocator.allocate(shape[0] + 1);
-  I *colind = i_allocator.allocate(nnz);
-
-  rowptr[0] = 0;
-
-  std::size_t r = 0;
-  std::size_t c = 0;
-  for (auto iter = tuples.begin(); iter != tuples.end(); ++iter) {
-    auto &&[index, value] = *iter;
-    auto &&[i, j] = index;
-
-    values[c] = value;
-    colind[c] = j;
-
-    while (r < i) {
-      assert(r + 1 <= shape[0]);
-      // throw std::runtime_error("csr_matrix_impl_: given invalid matrix");
-      rowptr[r + 1] = c;
-      r++;
-    }
-    c++;
-
-    assert(c <= nnz);
-    // throw std::runtime_error("csr_matrix_impl_: given invalid matrix");
-  }
-
-  for (; r < shape[0]; r++) {
-    rowptr[r + 1] = nnz;
-  }
-
-  return csr_matrix_view(values, rowptr, colind,
-                         dr::index<I>(shape[0], shape[1]), nnz, 0);
-}
-
-/// Read in the Matrix Market file at location `file_path` and a return
-/// a coo_matrix data structure with its contents.
-template <typename T, typename I = std::size_t>
-inline coo_matrix<T, I> mmread(std::string file_path, bool one_indexed = true) {
-  using size_type = std::size_t;
-
-  std::ifstream f;
-
-  f.open(file_path.c_str());
-
-  if (!f.is_open()) {
-    // TODO better choice of exception.
-    throw std::runtime_error("mmread: cannot open " + file_path);
-  }
-
-  std::string buf;
-
-  // Make sure the file is matrix market matrix, coordinate, and check whether
-  // it is symmetric. If the matrix is symmetric, non-diagonal elements will
-  // be inserted in both (i, j) and (j, i).  Error out if skew-symmetric or
-  // Hermitian.
-  std::getline(f, buf);
-  std::istringstream ss(buf);
-  std::string item;
-  ss >> item;
-  if (item != "%%MatrixMarket") {
-    throw std::runtime_error(file_path +
-                             " could not be parsed as a Matrix Market file.");
-  }
-  ss >> item;
-  if (item != "matrix") {
-    throw std::runtime_error(file_path +
-                             " could not be parsed as a Matrix Market file.");
-  }
-  ss >> item;
-  if (item != "coordinate") {
-    throw std::runtime_error(file_path +
-                             " could not be parsed as a Matrix Market file.");
-  }
-  bool pattern;
-  ss >> item;
-  if (item == "pattern") {
-    pattern = true;
-  } else {
-    pattern = false;
-  }
-  // TODO: do something with real vs. integer vs. pattern?
-  ss >> item;
-  bool symmetric;
-  if (item == "general") {
-    symmetric = false;
-  } else if (item == "symmetric") {
-    symmetric = true;
-  } else {
-    throw std::runtime_error(file_path + " has an unsupported matrix type");
-  }
-
-  bool outOfComments = false;
-  while (!outOfComments) {
-    std::getline(f, buf);
-
-    if (buf[0] != '%') {
-      outOfComments = true;
-    }
-  }
-
-  I m, n, nnz;
-  // std::istringstream ss(buf);
-  ss.clear();
-  ss.str(buf);
-  ss >> m >> n >> nnz;
-
-  // NOTE for symmetric matrices: `nnz` holds the number of stored values in
-  // the matrix market file, while `matrix.nnz_` will hold the total number of
-  // stored values (including "mirrored" symmetric values).
-  coo_matrix<T, I> matrix({m, n});
-  if (symmetric) {
-    matrix.reserve(2 * nnz);
-  } else {
-    matrix.reserve(nnz);
-  }
-
-  size_type c = 0;
-  while (std::getline(f, buf)) {
-    I i, j;
-    T v;
-    std::istringstream ss(buf);
-    if (!pattern) {
-      ss >> i >> j >> v;
-    } else {
-      ss >> i >> j;
-      v = T(1);
-    }
-    if (one_indexed) {
-      i--;
-      j--;
-    }
-
-    if (i >= m || j >= n) {
-      throw std::runtime_error(
-          "read_MatrixMarket: file has nonzero out of bounds.");
-    }
-
-    matrix.push_back({{i, j}, v});
-
-    if (symmetric && i != j) {
-      matrix.push_back({{j, i}, v});
-    }
-
-    c++;
-    if (c > nnz) {
-      throw std::runtime_error("read_MatrixMarket: error reading Matrix Market "
-                               "file, file has more nonzeros than reported.");
-    }
-  }
-
-  auto sort_fn = [](const auto &a, const auto &b) {
-    auto &&[a_index, a_value] = a;
-    auto &&[b_index, b_value] = b;
-    auto &&[a_i, a_j] = a_index;
-    auto &&[b_i, b_j] = b_index;
-    if (a_i < b_i) {
-      return true;
-    } else if (a_i == b_i) {
-      if (a_j < b_j) {
-        return true;
-      }
-    }
-    return false;
-  };
-
-  std::sort(matrix.begin(), matrix.end(), sort_fn);
-
-  f.close();
-
-  return matrix;
-}
-
-template <typename T, typename I, typename Allocator, typename... Args>
-void destroy_csr_matrix_view(dr::sp::csr_matrix_view<T, I, Args...> view,
-                             Allocator &&alloc) {
-  alloc.deallocate(view.values_data(), view.size());
-  typename std::allocator_traits<Allocator>::template rebind_alloc<I> i_alloc(
-      alloc);
-  i_alloc.deallocate(view.colind_data(), view.size());
-  i_alloc.deallocate(view.rowptr_data(), view.shape()[0] + 1);
-}
-
-} // namespace __detail
-
 template <typename T, typename I>
-auto create_distributed(dr::sp::csr_matrix_view<T, I> local_mat,
+auto create_distributed(dr::views::csr_matrix_view<T, I> local_mat,
                         const matrix_partition &partition) {
   dr::sp::sparse_matrix<T, I> a(local_mat.shape(), partition);
 
-  std::vector<dr::sp::csr_matrix_view<T, I>> views;
+  std::vector<dr::views::csr_matrix_view<T, I>> views;
   std::vector<sycl::event> events;
   views.reserve(a.grid_shape()[0] * a.grid_shape()[1]);
 
@@ -242,7 +39,7 @@ auto create_distributed(dr::sp::csr_matrix_view<T, I> local_mat,
       auto submatrix_shape = dr::index<I>(row_bounds[1] - row_bounds[0],
                                           column_bounds[1] - column_bounds[0]);
 
-      auto copied_submat = __detail::convert_to_csr(
+      auto copied_submat = dr::__detail::convert_to_csr(
           local_submat, submatrix_shape, rng::distance(local_submat),
           std::allocator<T>{});
 
@@ -255,24 +52,27 @@ auto create_distributed(dr::sp::csr_matrix_view<T, I> local_mat,
   __detail::wait(events);
 
   for (auto &&view : views) {
-    __detail::destroy_csr_matrix_view(view, std::allocator<T>{});
+    dr::__detail::destroy_csr_matrix_view(view, std::allocator<T>{});
   }
 
   return a;
 }
 
+template <typename T, typename I>
+auto create_distributed(dr::views::csr_matrix_view<T, I> local_mat) {
+  return create_distributed(
+      local_mat, dr::sp::block_cyclic({dr::sp::tile::div, dr::sp::tile::div},
+                                      {dr::sp::nprocs(), 1}));
+}
+
 template <typename T, typename I = std::size_t>
 auto mmread(std::string file_path, const matrix_partition &partition,
             bool one_indexed = true) {
-  auto m = __detail::mmread<T, I>(file_path, one_indexed);
-  auto shape = m.shape();
-  auto nnz = m.size();
-
-  auto local_mat = __detail::convert_to_csr(m, shape, nnz, std::allocator<T>{});
+  auto local_mat = read_csr<T, I>(file_path, one_indexed);
 
   auto a = create_distributed(local_mat, partition);
 
-  __detail::destroy_csr_matrix_view(local_mat, std::allocator<T>{});
+  dr::__detail::destroy_csr_matrix_view(local_mat, std::allocator<T>{});
 
   return a;
 }
diff --git a/include/dr/sp/views/dense_column_view.hpp b/include/dr/sp/views/dense_column_view.hpp
index 627c4faebf..25c2b607ea 100644
--- a/include/dr/sp/views/dense_column_view.hpp
+++ b/include/dr/sp/views/dense_column_view.hpp
@@ -5,7 +5,7 @@
 #pragma once
 
 #include <dr/detail/index.hpp>
-#include <dr/sp/containers/matrix_entry.hpp>
+#include <dr/detail/matrix_entry.hpp>
 #include <iterator>
 
 namespace dr::sp {
@@ -17,9 +17,9 @@ template <typename T, typename Iter> class dense_matrix_column_accessor {
   using scalar_value_type = std::iter_value_t<Iter>;
   using scalar_reference = std::iter_reference_t<Iter>;
 
-  using value_type = dr::sp::matrix_entry<scalar_value_type, std::size_t>;
+  using value_type = dr::matrix_entry<scalar_value_type, std::size_t>;
 
-  using reference = dr::sp::matrix_ref<T, std::size_t, scalar_reference>;
+  using reference = dr::matrix_ref<T, std::size_t, scalar_reference>;
 
   using iterator_category = std::random_access_iterator_tag;
 
diff --git a/include/dr/sp/views/dense_matrix_iterator.hpp b/include/dr/sp/views/dense_matrix_iterator.hpp
index 85c5274357..8c5e9e929a 100644
--- a/include/dr/sp/views/dense_matrix_iterator.hpp
+++ b/include/dr/sp/views/dense_matrix_iterator.hpp
@@ -8,7 +8,7 @@
 
 #include <dr/detail/index.hpp>
 #include <dr/detail/iterator_adaptor.hpp>
-#include <dr/sp/containers/matrix_entry.hpp>
+#include <dr/detail/matrix_entry.hpp>
 #include <dr/sp/views/dense_column_view.hpp>
 #include <dr/sp/views/dense_row_view.hpp>
 
@@ -22,9 +22,9 @@ template <typename T, typename Iter> class dense_matrix_accessor {
   using scalar_type = std::iter_value_t<Iter>;
   using scalar_reference = std::iter_reference_t<Iter>;
 
-  using value_type = dr::sp::matrix_entry<scalar_type, std::size_t>;
+  using value_type = dr::matrix_entry<scalar_type, std::size_t>;
 
-  using reference = dr::sp::matrix_ref<T, std::size_t, scalar_reference>;
+  using reference = dr::matrix_ref<T, std::size_t, scalar_reference>;
 
   using iterator_category = std::random_access_iterator_tag;
 
diff --git a/include/dr/sp/views/dense_matrix_view.hpp b/include/dr/sp/views/dense_matrix_view.hpp
index bc2bd86d30..f7f2930f2d 100644
--- a/include/dr/sp/views/dense_matrix_view.hpp
+++ b/include/dr/sp/views/dense_matrix_view.hpp
@@ -8,7 +8,7 @@
 
 #include <dr/detail/index.hpp>
 #include <dr/detail/iterator_adaptor.hpp>
-#include <dr/sp/containers/matrix_entry.hpp>
+#include <dr/detail/matrix_entry.hpp>
 #include <dr/sp/containers/sequential/dense_matrix.hpp>
 #include <dr/sp/views/dense_column_view.hpp>
 #include <dr/sp/views/dense_matrix_iterator.hpp>
@@ -24,7 +24,7 @@ class dense_matrix_view
   using difference_type = std::ptrdiff_t;
 
   using scalar_reference = std::iter_reference_t<Iter>;
-  using reference = dr::sp::matrix_ref<T, std::size_t, scalar_reference>;
+  using reference = dr::matrix_ref<T, std::size_t, scalar_reference>;
 
   using key_type = dr::index<>;
   using map_type = T;
diff --git a/include/dr/sp/views/dense_row_view.hpp b/include/dr/sp/views/dense_row_view.hpp
index d3ccee2c1a..09b008aeee 100644
--- a/include/dr/sp/views/dense_row_view.hpp
+++ b/include/dr/sp/views/dense_row_view.hpp
@@ -6,7 +6,7 @@
 
 #include <dr/detail/index.hpp>
 #include <dr/detail/iterator_adaptor.hpp>
-#include <dr/sp/containers/matrix_entry.hpp>
+#include <dr/detail/matrix_entry.hpp>
 #include <iterator>
 
 namespace dr::sp {
@@ -18,9 +18,9 @@ template <typename T, typename Iter> class dense_matrix_row_accessor {
   using scalar_value_type = std::iter_value_t<Iter>;
   using scalar_reference = std::iter_reference_t<Iter>;
 
-  using value_type = dr::sp::matrix_entry<scalar_value_type, std::size_t>;
+  using value_type = dr::matrix_entry<scalar_value_type, std::size_t>;
 
-  using reference = dr::sp::matrix_ref<T, std::size_t, scalar_reference>;
+  using reference = dr::matrix_ref<T, std::size_t, scalar_reference>;
 
   using iterator_category = std::random_access_iterator_tag;
 
diff --git a/include/dr/sp/views/csr_matrix_view.hpp b/include/dr/views/csr_matrix_view.hpp
similarity index 95%
rename from include/dr/sp/views/csr_matrix_view.hpp
rename to include/dr/views/csr_matrix_view.hpp
index c698cc9c9d..e7bd4ed1f6 100644
--- a/include/dr/sp/views/csr_matrix_view.hpp
+++ b/include/dr/views/csr_matrix_view.hpp
@@ -5,10 +5,10 @@
 #pragma once
 
 #include <dr/detail/index.hpp>
-#include <dr/sp/containers/matrix_entry.hpp>
+#include <dr/detail/matrix_entry.hpp>
 #include <iterator>
 
-namespace dr::sp {
+namespace dr::views {
 
 template <typename T, typename I, typename TIter, typename IIter>
 class csr_matrix_view_accessor {
@@ -21,9 +21,9 @@ class csr_matrix_view_accessor {
 
   using index_type = I;
 
-  using value_type = dr::sp::matrix_entry<scalar_type, I>;
+  using value_type = dr::matrix_entry<scalar_type, I>;
 
-  using reference = dr::sp::matrix_ref<T, I, scalar_reference>;
+  using reference = dr::matrix_ref<T, I, scalar_reference>;
 
   using iterator_category = std::random_access_iterator_tag;
 
@@ -129,7 +129,7 @@ class csr_matrix_view
   using difference_type = std::ptrdiff_t;
 
   using scalar_reference = std::iter_reference_t<TIter>;
-  using reference = dr::sp::matrix_ref<T, I, scalar_reference>;
+  using reference = dr::matrix_ref<T, I, scalar_reference>;
 
   using scalar_type = T;
   using index_type = I;
@@ -138,7 +138,7 @@ class csr_matrix_view
   using map_type = T;
 
   using iterator = csr_matrix_view_iterator<T, I, TIter, IIter>;
-
+  csr_matrix_view() = default;
   csr_matrix_view(TIter values, IIter rowptr, IIter colind, key_type shape,
                   size_type nnz, size_type rank)
       : values_(values), rowptr_(rowptr), colind_(colind), shape_(shape),
@@ -222,4 +222,4 @@ csr_matrix_view(TIter, IIter, IIter, Args &&...)
     -> csr_matrix_view<std::iter_value_t<TIter>, std::iter_value_t<IIter>,
                        TIter, IIter>;
 
-} // namespace dr::sp
+} // namespace dr::views
diff --git a/include/dr/views/transform.hpp b/include/dr/views/transform.hpp
index b99a07ad32..a379f11f90 100644
--- a/include/dr/views/transform.hpp
+++ b/include/dr/views/transform.hpp
@@ -53,7 +53,7 @@ class transform_iterator {
 
   bool operator<(iterator other) const noexcept { return iter_ < other.iter_; }
 
-  bool operator>(iterator other) const noexcept { return iter_ > iter_; }
+  bool operator>(iterator other) const noexcept { return iter_ > other.iter_; }
 
   bool operator<=(iterator other) const noexcept {
     return iter_ <= other.iter_;
diff --git a/test/gtest/mp/CMakeLists.txt b/test/gtest/mp/CMakeLists.txt
index 32f26d120a..f4f65e4776 100644
--- a/test/gtest/mp/CMakeLists.txt
+++ b/test/gtest/mp/CMakeLists.txt
@@ -36,6 +36,7 @@ add_executable(
   communicator.cpp
   copy.cpp
   distributed_vector.cpp
+  gemv.cpp
   halo.cpp
   mdstar.cpp
   mpsort.cpp
@@ -43,6 +44,7 @@ add_executable(
   stencil.cpp
   segments.cpp
   slide_view.cpp
+  sparse_matrix.cpp
   wave_kernel.cpp)
 
 add_executable(
diff --git a/test/gtest/mp/broadcasted_vector.cpp b/test/gtest/mp/broadcasted_vector.cpp
new file mode 100644
index 0000000000..a8e27289f0
--- /dev/null
+++ b/test/gtest/mp/broadcasted_vector.cpp
@@ -0,0 +1,72 @@
+// SPDX-FileCopyrightText: Intel Corporation
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "xp-tests.hpp"
+
+TEST(BroadcastedVector, BroadcastData) {
+  std::size_t n = 100;
+  auto rank = dr::mp::default_comm().rank();
+  std::vector<int> data(n);
+  if (rank == 0) {
+    for (int i = 0; i < n; i++) {
+      data[i] = i;
+    }
+  }
+  dr::mp::broadcasted_vector<int> broadcasted;
+  if (rank == 0) {
+    broadcasted.broadcast_data(n, 0, data, dr::mp::default_comm());
+  } else {
+    broadcasted.broadcast_data(n, 0, rng::empty_view<int>(),
+                               dr::mp::default_comm());
+  }
+
+  std::vector<int> ref(n);
+  for (int i = 0; i < n; i++) {
+    ref[i] = i;
+  }
+
+  EXPECT_EQ(rng::subrange(broadcasted.broadcasted_data(),
+                          broadcasted.broadcasted_data() + n),
+            ref);
+  broadcasted.destroy_data();
+}
+
+TEST(BroadcastedVector, BroadcastDataReuse) {
+  std::size_t n = 100;
+  auto rank = dr::mp::default_comm().rank();
+  std::vector<int> data(n);
+  if (rank == 0) {
+    for (int i = 0; i < n; i++) {
+      data[i] = i;
+    }
+  }
+  dr::mp::broadcasted_vector<int> broadcasted;
+  if (rank == 0) {
+    broadcasted.broadcast_data(n, 0, data, dr::mp::default_comm());
+  } else {
+    broadcasted.broadcast_data(n, 0, rng::empty_view<int>(),
+                               dr::mp::default_comm());
+  }
+
+  std::vector<int> ref(n);
+  for (int i = 0; i < n; i++) {
+    ref[i] = i;
+  }
+
+  EXPECT_EQ(rng::subrange(broadcasted.broadcasted_data(),
+                          broadcasted.broadcasted_data() + n),
+            ref);
+  broadcasted.destroy_data();
+  EXPECT_EQ(broadcasted.broadcasted_data(), nullptr);
+  if (rank == 0) {
+    broadcasted.broadcast_data(n, 0, data, dr::mp::default_comm());
+  } else {
+    broadcasted.broadcast_data(n, 0, rng::empty_view<int>(),
+                               dr::mp::default_comm());
+  }
+  EXPECT_EQ(rng::subrange(broadcasted.broadcasted_data(),
+                          broadcasted.broadcasted_data() + n),
+            ref);
+  broadcasted.destroy_data();
+}
diff --git a/test/gtest/mp/gemv.cpp b/test/gtest/mp/gemv.cpp
new file mode 100644
index 0000000000..cf397ad8ed
--- /dev/null
+++ b/test/gtest/mp/gemv.cpp
@@ -0,0 +1,184 @@
+// SPDX-FileCopyrightText: Intel Corporation
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "xp-tests.hpp"
+auto testMatrixGemv(std::size_t m, std::size_t k, auto &a) {
+  std::vector<float> base_b(k, 1.f);
+  std::vector<float> c(m, 0.f);
+
+  dr::mp::broadcasted_vector<float> allocated_b;
+  allocated_b.broadcast_data(k, 0, base_b, dr::mp::default_comm());
+
+  dr::mp::gemv(c, a, allocated_b);
+
+  std::vector<float> c_ref(m, 0.f);
+
+  for (auto &&[index, v] : a) {
+    auto &&[i, k] = index;
+
+    c_ref[i] += v;
+  }
+
+  EXPECT_TRUE(fp_equal(c_ref, c))
+      << fmt::format("Reference:\n  {}\nActual:\n  {}\n", c_ref, c);
+}
+
+auto testMatrixGemm(std::size_t m, std::size_t n, auto &a, std::size_t width) {
+  std::vector<float> base_b(n * width);
+  std::vector<float> c(m * width, 0.f);
+
+  for (auto i = 0; i < n * width; i++) {
+    base_b[i] = i;
+  }
+
+  dr::mp::broadcasted_slim_matrix<float> allocated_b;
+  allocated_b.broadcast_data(n, width, 0, base_b, dr::mp::default_comm());
+
+  dr::mp::gemv(c, a, allocated_b);
+
+  std::vector<float> c_ref(m * width, 0.f);
+
+  for (auto &&[index, v] : a) {
+    auto &&[i, k] = index;
+
+    for (auto j = 0; j < width; j++) {
+      c_ref[i + j * m] += v * base_b[k + j * n];
+    }
+  }
+
+  EXPECT_TRUE(fp_equal(c_ref, c))
+      << fmt::format("Reference:\n  {}\nActual:\n  {}\n", c_ref, c);
+}
+
+TEST(SparseMatrix, GemvRow) {
+  std::size_t m = 100;
+  std::size_t k = 100;
+  auto csr = dr::generate_random_csr({m, k}, 0.1f);
+  dr::mp::distributed_sparse_matrix<
+      float, unsigned long, dr::mp::MpiBackend,
+      dr::mp::csr_row_distribution<float, unsigned long, dr::mp::MpiBackend>>
+      a(csr, 0);
+  testMatrixGemv(m, k, a);
+}
+
+TEST(SparseMatrix, GemvEq) {
+  std::size_t m = 100;
+  std::size_t k = 100;
+  auto csr = dr::generate_random_csr({m, k}, 0.1f);
+  dr::mp::distributed_sparse_matrix<
+      float, unsigned long, dr::mp::MpiBackend,
+      dr::mp::csr_eq_distribution<float, unsigned long, dr::mp::MpiBackend>>
+      a(csr, 0);
+  testMatrixGemv(m, k, a);
+}
+
+TEST(SparseMatrix, GemvRowNotSquare) {
+  std::size_t m = 1000;
+  std::size_t k = 10;
+  auto csr = dr::generate_random_csr({m, k}, 0.1f);
+  dr::mp::distributed_sparse_matrix<
+      float, unsigned long, dr::mp::MpiBackend,
+      dr::mp::csr_row_distribution<float, unsigned long, dr::mp::MpiBackend>>
+      a(csr, 0);
+  testMatrixGemv(m, k, a);
+}
+
+TEST(SparseMatrix, GemvEqNotSquare) {
+  std::size_t m = 1000;
+  std::size_t k = 10;
+  auto csr = dr::generate_random_csr({m, k}, 0.1f);
+  dr::mp::distributed_sparse_matrix<
+      float, unsigned long, dr::mp::MpiBackend,
+      dr::mp::csr_eq_distribution<float, unsigned long, dr::mp::MpiBackend>>
+      a(csr, 0);
+  testMatrixGemv(m, k, a);
+}
+
+TEST(SparseMatrix, GemvRowNotSquareDifferentAxis) {
+  std::size_t m = 10;
+  std::size_t k = 1000;
+  auto csr = dr::generate_random_csr({m, k}, 0.1f);
+  dr::mp::distributed_sparse_matrix<
+      float, unsigned long, dr::mp::MpiBackend,
+      dr::mp::csr_row_distribution<float, unsigned long, dr::mp::MpiBackend>>
+      a(csr, 0);
+  testMatrixGemv(m, k, a);
+}
+
+TEST(SparseMatrix, GemvEqNotSquareDifferentAxis) {
+  std::size_t m = 10;
+  std::size_t k = 1000;
+  auto csr = dr::generate_random_csr({m, k}, 0.1f);
+  dr::mp::distributed_sparse_matrix<
+      float, unsigned long, dr::mp::MpiBackend,
+      dr::mp::csr_eq_distribution<float, unsigned long, dr::mp::MpiBackend>>
+      a(csr, 0);
+  testMatrixGemv(m, k, a);
+}
+
+TEST(SparseMatrix, GemmRow) {
+  std::size_t m = 100;
+  std::size_t k = 100;
+  auto csr = dr::generate_random_csr({m, k}, 0.1f);
+  dr::mp::distributed_sparse_matrix<
+      float, unsigned long, dr::mp::MpiBackend,
+      dr::mp::csr_row_distribution<float, unsigned long, dr::mp::MpiBackend>>
+      a(csr, 0);
+  testMatrixGemm(m, k, a, 20);
+}
+
+TEST(SparseMatrix, GemmEq) {
+  std::size_t m = 100;
+  std::size_t k = 100;
+  auto csr = dr::generate_random_csr({m, k}, 0.1f);
+  dr::mp::distributed_sparse_matrix<
+      float, unsigned long, dr::mp::MpiBackend,
+      dr::mp::csr_eq_distribution<float, unsigned long, dr::mp::MpiBackend>>
+      a(csr, 0);
+  testMatrixGemm(m, k, a, 20);
+}
+
+TEST(SparseMatrix, GemmRowNotSquare) {
+  std::size_t m = 1000;
+  std::size_t k = 10;
+  auto csr = dr::generate_random_csr({m, k}, 0.1f);
+  dr::mp::distributed_sparse_matrix<
+      float, unsigned long, dr::mp::MpiBackend,
+      dr::mp::csr_row_distribution<float, unsigned long, dr::mp::MpiBackend>>
+      a(csr, 0);
+  testMatrixGemm(m, k, a, 20);
+}
+
+TEST(SparseMatrix, GemmEqNotSquare) {
+  std::size_t m = 1000;
+  std::size_t k = 10;
+  auto csr = dr::generate_random_csr({m, k}, 0.1f);
+  dr::mp::distributed_sparse_matrix<
+      float, unsigned long, dr::mp::MpiBackend,
+      dr::mp::csr_eq_distribution<float, unsigned long, dr::mp::MpiBackend>>
+      a(csr, 0);
+  testMatrixGemm(m, k, a, 20);
+}
+
+TEST(SparseMatrix, GemmRowNotSquareDifferentAxis) {
+  std::size_t m = 10;
+  std::size_t k = 1000;
+  auto csr = dr::generate_random_csr({m, k}, 0.1f);
+  dr::mp::distributed_sparse_matrix<
+      float, unsigned long, dr::mp::MpiBackend,
+      dr::mp::csr_row_distribution<float, unsigned long, dr::mp::MpiBackend>>
+      a(csr, 0);
+  testMatrixGemm(m, k, a, 20);
+}
+
+TEST(SparseMatrix, GemmEqNotSquareDifferentAxis) {
+  std::size_t m = 10;
+  std::size_t k = 1000;
+  auto csr = dr::generate_random_csr({m, k}, 0.1f);
+  dr::mp::distributed_sparse_matrix<
+      float, unsigned long, dr::mp::MpiBackend,
+      dr::mp::csr_eq_distribution<float, unsigned long, dr::mp::MpiBackend>>
+      a(csr, 0);
+  testMatrixGemm(m, k, a, 20);
+}
diff --git a/test/gtest/mp/reduce.cpp b/test/gtest/mp/reduce.cpp
index e663188bbd..c7a00d7323 100644
--- a/test/gtest/mp/reduce.cpp
+++ b/test/gtest/mp/reduce.cpp
@@ -38,16 +38,15 @@ TYPED_TEST(ReduceMP, RootIterators) {
   }
 }
 
-// Example of code that should be compiling, but does not, described in issue
-// DRA-192 TYPED_TEST(ReduceMP, NotCompiling) {
-//   dr::mp::distributed_vector<int> r1(10);
-
-//   auto add = [](auto &&elem) {
-//     return elem + 1;
-//   };
-
-//   auto added = dr::mp::views::transform(r1, add);
-//   auto min = [](double x, double y) { return std::min(x, y); };
-//   auto result = dr::mp::reduce(root, added, 1, min);
-//   EXPECT_EQ(result, 1);
-// }
+TYPED_TEST(ReduceMP, TransformReduce) {
+  Ops1<TypeParam> ops(10);
+
+  auto add = [](auto &&elem) { return elem + 1; };
+
+  auto added = dr::mp::views::transform(ops.dist_vec, add);
+  auto min = [](double x, double y) { return std::min(x, y); };
+  auto result = dr::mp::reduce(root, added, 1, min);
+  if (comm_rank == root) {
+    EXPECT_EQ(result, 1);
+  }
+}
diff --git a/test/gtest/mp/sparse_matrix.cpp b/test/gtest/mp/sparse_matrix.cpp
new file mode 100644
index 0000000000..0f98b0cb48
--- /dev/null
+++ b/test/gtest/mp/sparse_matrix.cpp
@@ -0,0 +1,114 @@
+// SPDX-FileCopyrightText: Intel Corporation
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "xp-tests.hpp"
+auto testMatrixIter(auto &src, auto &matrix) {
+  EXPECT_TRUE(src.size() == matrix.size());
+  std::map<std::pair<long, long>, double> entries;
+  for (auto [index, val] : src) {
+    entries[{index.first, index.second}] = val;
+  }
+  for (auto [index, val] : matrix) {
+    EXPECT_TRUE((val == entries[{index.first, index.second}]));
+  }
+}
+
+auto testMatrixReduce(auto &src, auto &matrix) {
+  EXPECT_TRUE(src.size() == matrix.size());
+  long sum = 0;
+  for (auto [index, val] : src) {
+    auto [x, y] = index;
+    sum += (long)(val + x + y);
+  }
+  auto transformer = [](auto entry) {
+    auto [index, val] = entry;
+    auto [x, y] = index;
+    return (long)(val + x + y);
+  };
+  auto transformed = dr::transform_view(matrix, transformer);
+  long reduced = dr::mp::reduce(transformed, 0, std::plus<long>{});
+  EXPECT_TRUE((sum == reduced));
+}
+
+TEST(SparseMatrix, staticAssertEq) {
+  std::size_t m = 100;
+  std::size_t k = 100;
+  using Dist =
+      dr::mp::csr_eq_distribution<float, unsigned long, dr::mp::MpiBackend>;
+  static_assert(dr::mp::matrix_distibution<Dist>);
+  static_assert(dr::mp::vector_multiplicable<Dist>);
+  auto csr = dr::generate_random_csr({m, k}, 0.1f);
+  dr::mp::distributed_sparse_matrix<float, unsigned long, dr::mp::MpiBackend> a(
+      csr, 0);
+  static_assert(std::forward_iterator<decltype(a.begin())>);
+  static_assert(std::forward_iterator<decltype(a.end())>);
+  static_assert(std::forward_iterator<decltype(rng::begin(a))>);
+  static_assert(std::forward_iterator<decltype(rng::end(a))>);
+  using Matrix = decltype(a);
+  static_assert(rng::forward_range<Matrix>);
+  static_assert(dr::distributed_range<Matrix>);
+}
+
+TEST(SparseMatrix, staticAssertRow) {
+  std::size_t m = 100;
+  std::size_t k = 100;
+  using Dist =
+      dr::mp::csr_row_distribution<float, unsigned long, dr::mp::MpiBackend>;
+  static_assert(dr::mp::matrix_distibution<Dist>);
+  static_assert(dr::mp::vector_multiplicable<Dist>);
+  auto csr = dr::generate_random_csr({m, k}, 0.1f);
+  dr::mp::distributed_sparse_matrix<float, unsigned long, dr::mp::MpiBackend> a(
+      csr, 0);
+  static_assert(std::forward_iterator<decltype(a.begin())>);
+  static_assert(std::forward_iterator<decltype(a.end())>);
+  static_assert(std::forward_iterator<decltype(rng::begin(a))>);
+  static_assert(std::forward_iterator<decltype(rng::end(a))>);
+  using Matrix = decltype(a);
+  static_assert(rng::forward_range<Matrix>);
+  static_assert(dr::distributed_range<Matrix>);
+}
+
+TEST(SparseMatrix, IterRow) {
+  std::size_t m = 100;
+  std::size_t k = 100;
+  auto csr = dr::generate_random_csr({m, k}, 0.1f);
+  dr::mp::distributed_sparse_matrix<
+      float, unsigned long, dr::mp::MpiBackend,
+      dr::mp::csr_row_distribution<float, unsigned long, dr::mp::MpiBackend>>
+      a(csr, 0);
+  testMatrixIter(csr, a);
+}
+
+TEST(SparseMatrix, IterEq) {
+  std::size_t m = 100;
+  std::size_t k = 100;
+  auto csr = dr::generate_random_csr({m, k}, 0.1f);
+  dr::mp::distributed_sparse_matrix<
+      float, unsigned long, dr::mp::MpiBackend,
+      dr::mp::csr_eq_distribution<float, unsigned long, dr::mp::MpiBackend>>
+      a(csr, 0);
+  testMatrixIter(csr, a);
+}
+
+TEST(SparseMatrix, ReduceRow) {
+  std::size_t m = 100;
+  std::size_t k = 100;
+  auto csr = dr::generate_random_csr({m, k}, 0.1f);
+  dr::mp::distributed_sparse_matrix<
+      float, unsigned long, dr::mp::MpiBackend,
+      dr::mp::csr_row_distribution<float, unsigned long, dr::mp::MpiBackend>>
+      a(csr, 0);
+  testMatrixReduce(csr, a);
+}
+
+TEST(SparseMatrix, ReduceEq) {
+  std::size_t m = 100;
+  std::size_t k = 100;
+  auto csr = dr::generate_random_csr({m, k}, 0.1f);
+  dr::mp::distributed_sparse_matrix<
+      float, unsigned long, dr::mp::MpiBackend,
+      dr::mp::csr_eq_distribution<float, unsigned long, dr::mp::MpiBackend>>
+      a(csr, 0);
+  testMatrixReduce(csr, a);
+}
diff --git a/test/gtest/sp/gemv.cpp b/test/gtest/sp/gemv.cpp
index 2b207be926..f573d6bc0e 100644
--- a/test/gtest/sp/gemv.cpp
+++ b/test/gtest/sp/gemv.cpp
@@ -2,11 +2,11 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "dr/detail/coo_matrix.hpp"
 #include "xp-tests.hpp"
-
 TEST(SparseMatrix, Gemv) {
-  std::size_t m = 100;
-  std::size_t k = 100;
+  long m = 100;
+  long k = 100;
 
   dr::sp::sparse_matrix<float> a(
       {m, k}, 0.1f,
@@ -40,9 +40,9 @@ TEST(SparseMatrix, EmptyGemv) {
   using T = float;
   using I = int;
 
-  dr::sp::__detail::coo_matrix<T, I> base;
-  auto csr = dr::sp::__detail::convert_to_csr(base, {m, k}, base.size(),
-                                              std::allocator<T>{});
+  dr::__detail::coo_matrix<T, I> base;
+  auto csr = dr::__detail::convert_to_csr(base, {m, k}, base.size(),
+                                          std::allocator<T>{});
   dr::sp::sparse_matrix<T, I> a =
       dr::sp::create_distributed(csr, dr::sp::row_cyclic());
 
@@ -73,8 +73,8 @@ TEST(SparseMatrix, ZeroVector) {
     }
   }
 
-  auto csr = dr::sp::__detail::convert_to_csr(base, {m, k}, base.size(),
-                                              std::allocator<T>{});
+  auto csr = dr::__detail::convert_to_csr(base, {m, k}, base.size(),
+                                          std::allocator<T>{});
   dr::sp::sparse_matrix<T, I> a =
       dr::sp::create_distributed(csr, dr::sp::row_cyclic());
 
@@ -94,8 +94,8 @@ TEST(SparseMatrix, ZeroVector) {
 }
 
 TEST(SparseMatrix, NotSquareMatrix) {
-  std::size_t m = 10;
-  std::size_t k = 1000;
+  long m = 10;
+  long k = 1000;
 
   dr::sp::sparse_matrix<float> a(
       {m, k}, 0.1f,
@@ -124,8 +124,8 @@ TEST(SparseMatrix, NotSquareMatrix) {
 }
 
 TEST(SparseMatrix, NotSquareMatrixOtherAxis) {
-  std::size_t m = 1000;
-  std::size_t k = 10;
+  long m = 1000;
+  long k = 10;
 
   dr::sp::sparse_matrix<float> a(
       {m, k}, 0.1f,
@@ -154,8 +154,8 @@ TEST(SparseMatrix, NotSquareMatrixOtherAxis) {
 }
 
 TEST(SparseMatrix, VerySparseMatrix) {
-  std::size_t m = 100;
-  std::size_t k = 100;
+  long m = 100;
+  long k = 100;
 
   dr::sp::sparse_matrix<float> a(
       {m, k}, 0.001f,
diff --git a/test/gtest/sp/sparse.cpp b/test/gtest/sp/sparse.cpp
index 2e30fee2a0..bf0f1d7b17 100644
--- a/test/gtest/sp/sparse.cpp
+++ b/test/gtest/sp/sparse.cpp
@@ -16,8 +16,8 @@ TEST(SparseMatrix, IterationForward) {
   }
   std::vector<std::pair<std::pair<I, I>, T>> reference(base.size());
   std::copy(base.begin(), base.end(), reference.begin());
-  auto csr = dr::sp::__detail::convert_to_csr(base, {m, k}, base.size(),
-                                              std::allocator<T>{});
+  auto csr = dr::__detail::convert_to_csr(base, {m, k}, base.size(),
+                                          std::allocator<T>{});
   dr::sp::sparse_matrix<T, I> a =
       dr::sp::create_distributed(csr, dr::sp::row_cyclic());
   int i = 0;
@@ -48,8 +48,8 @@ TEST(SparseMatrix, IterationReverse) {
   }
   std::vector<std::pair<std::pair<I, I>, T>> reference(base.size());
   std::copy(base.begin(), base.end(), reference.begin());
-  auto csr = dr::sp::__detail::convert_to_csr(base, {m, k}, base.size(),
-                                              std::allocator<T>{});
+  auto csr = dr::__detail::convert_to_csr(base, {m, k}, base.size(),
+                                          std::allocator<T>{});
   dr::sp::sparse_matrix<T, I> a =
       dr::sp::create_distributed(csr, dr::sp::row_cyclic());
   int i = base.size();