diff --git a/include/graphblas/algorithms/hpcg/average_coarsener.hpp b/include/graphblas/algorithms/hpcg/average_coarsener.hpp
new file mode 100644
index 000000000..41abed9e2
--- /dev/null
+++ b/include/graphblas/algorithms/hpcg/average_coarsener.hpp
@@ -0,0 +1,412 @@
+
+/*
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file average_coarsener.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * Utilities to build the coarsening matrix for an HPCG simulation.
+ */
+
+#ifndef _H_GRB_ALGORITHMS_AVERAGE_COARSENER
+#define _H_GRB_ALGORITHMS_AVERAGE_COARSENER
+
+#include <array>
+#include <cmath>
+#include <cstddef>
+#include <iterator>
+#include <numeric>
+#include <stdexcept>
+
+#include <graphblas/utils/multigrid/array_vector_storage.hpp>
+#include <graphblas/utils/multigrid/linearized_ndim_system.hpp>
+
+
+namespace grb {
+
+	namespace algorithms {
+
+		namespace hpcg {
+
+			// forward declaration
+			template<
+				size_t DIMS,
+				typename CoordType,
+				typename ValueType
+			>
+			class AverageCoarsenerBuilder;
+
+			/**
+			 * Iterator class to generate the coarsening matrix that averages over the
+			 * elements of the finer domain corresponding to the element of the coarser
+			 * domain.
+			 *
+			 * The coarsening matrix averages \b all elements that are coarsened into
+			 * one.
+			 *
+			 * This coarsening method requires some computation but should be relatively
+			 * robust to noise or to partitioning strategies that parallelize the
+			 * smoother (usually run before coarsening).
+			 *
+			 * This iterator is random-access.
+			 *
+			 * @tparam DIMS number of dimensions
+			 * @tparam CoordType type storing the coordinates and the sizes
+			 * @tparam ValueType type of the nonzero: it must be able to represent 1 /
+			 *                   <number of finer elements per coarser elements>
+			 */
+			template<
+				size_t DIMS,
+				typename CoordType,
+				typename ValueType
+			>
+			struct AverageGeneratorIterator {
+
+				friend AverageCoarsenerBuilder< DIMS, CoordType, ValueType >;
+
+				/** Numeric type of rows */
+				typedef CoordType RowIndexType;
+
+				/** Numeric type of columns */
+				typedef CoordType ColumnIndexType;
+
+				typedef typename grb::utils::multigrid::LinearizedNDimSystem<
+						CoordType,
+						grb::utils::multigrid::ArrayVectorStorage< DIMS, CoordType >
+					> LinearSystemType;
+
+				typedef typename LinearSystemType::Iterator LinearSystemIterType;
+
+				typedef AverageGeneratorIterator< DIMS, CoordType, ValueType > SelfType;
+
+				typedef std::array< CoordType, DIMS > ArrayType;
+
+				class ValueGenerator {
+					// One (row, column, value) nonzero of the generated matrix; the enclosing
+					// iterator is a friend so that it can overwrite _i and _j in place.
+					friend SelfType;
+
+					private:
+
+						RowIndexType _i; // row index of the nonzero
+
+						ColumnIndexType _j; // column index of the nonzero
+
+						ValueType _value; // numerical value of the nonzero
+
+
+					public:
+
+						ValueGenerator(
+							RowIndexType i,
+							ColumnIndexType j,
+							ValueType value
+						) noexcept :
+							_i( i ),
+							_j( j ),
+							_value( value )
+						{}
+
+						ValueGenerator( const ValueGenerator & ) = default;
+
+						ValueGenerator & operator=( const ValueGenerator & ) = default;
+
+						inline RowIndexType i() const { // row index
+							return _i;
+						}
+
+						inline ColumnIndexType j() const { // column index
+							return _j;
+						}
+
+						inline ValueType v() const { // nonzero value
+							return _value;
+						}
+
+				};
+
+				// interface for std::random_access_iterator
+				typedef std::random_access_iterator_tag iterator_category;
+
+				typedef ValueGenerator value_type;
+				// must be a pointer type, as operator->() returns the address of the value
+				typedef const value_type * pointer;
+
+				typedef const value_type & reference;
+
+				typedef typename LinearSystemIterType::difference_type difference_type;
+
+				AverageGeneratorIterator( const SelfType &o ) = default;
+
+				AverageGeneratorIterator( SelfType && ) = default;
+
+				SelfType & operator=( const SelfType & ) = default;
+
+				SelfType & operator=( SelfType && ) = default;
+
+				/** Advances \c this by 1 in constant time: moves to the next position of
+				 * the finer subspace and, past its last one, to the next coarser element.
+				 */
+				SelfType & operator++() noexcept {
+					(void) ++_subspace_iter;
+					size_t subspace_position = _subspace_iter->get_linear_position();
+					if( subspace_position == _num_neighbors ) { // all finer positions visited
+						(void) ++_sys_iter;
+						_subspace_iter = _finer_subspace->begin();
+					}
+					update_coords(); // refresh the cached row and column
+					return *this;
+				}
+
+				/** Advances \c this by \p offset in constant time, splitting the offset
+				 * into whole coarser elements and a remainder within the finer subspace.
+				 */
+				SelfType & operator+=( size_t offset ) {
+					CoordType sub_offset = _subspace_iter->get_linear_position() + offset;
+					std::ldiv_t res = std::ldiv( sub_offset, _num_neighbors ); // note: computed on long
+					_sys_iter += res.quot; // whole coarser elements to skip
+					_subspace_iter = _finer_subspace->begin();
+					_subspace_iter += res.rem; // position left within the finer subspace
+					update_coords();
+					return *this;
+				}
+
+				/** Computes the number of nonzeroes between \p o and \c this, accounting for
+				 * the position inside the finer subspace, so that it agrees with operator+=. */
+				difference_type operator-( const SelfType &o ) const {
+					return ( this->_sys_iter - o._sys_iter ) * static_cast< difference_type >( _num_neighbors )
+						+ ( this->_subspace_iter - o._subspace_iter );
+				}
+
+				/** Returns whether \c this and \p o differ, either in the coarser element
+				 * or in the position within its finer subspace. */
+				bool operator!=( const SelfType &o ) const {
+					return this->_sys_iter != o._sys_iter
+						|| this->_subspace_iter != o._subspace_iter;
+				}
+
+				/**
+				 * Tells whether \c this and \p o are equal, i.e. whether they do not differ.
+				 */
+				bool operator==( const SelfType &o ) const {
+					return !( *this != o );
+				}
+
+				reference operator*() const { // the nonzero at the current position
+					return _val;
+				}
+
+				pointer operator->() const { // address of the nonzero at the current position
+					return &_val;
+				}
+
+				/**
+				 * Returns the current row, i.e. the linear position within the coarser system.
+				 */
+				inline RowIndexType i() const {
+					return _val.i();
+				}
+
+				/**
+				 * Returns the current column, i.e. the linear position within the finer system.
+				 */
+				inline ColumnIndexType j() const {
+					return _val.j();
+				}
+
+				/**
+				 * Returns the averaging weight 1 / <number of finer elements per coarser element>.
+				 */
+				inline ValueType v() const {
+					return _val.v();
+				}
+
+
+			private:
+
+				const LinearSystemType * _lin_sys;
+				const LinearSystemType * _finer_subspace;
+				const ArrayType * _steps;
+				CoordType _num_neighbors;
+				LinearSystemIterType _sys_iter;
+				LinearSystemIterType _subspace_iter;
+				value_type _val;
+
+				/**
+				 * Construct a new AverageGeneratorIterator object positioned on the first
+				 * nonzero, from the \b coarser \p system and the per-dimension \b ratios
+				 * \p steps between finer and coarser sizes. Only the \b addresses of the
+				 * three arguments are stored, so they must outlive the iterator.
+				 *
+				 * @param system LinearizedNDimSystem object describing the coarser system
+				 * @param finer_subspace LinearizedNDimSystem object describing the subspace
+				 *                       of each element in the finer system
+				 * @param steps Ratios per dimension between finer and coarser system
+				 */
+				AverageGeneratorIterator(
+					const LinearSystemType &system,
+					const LinearSystemType &finer_subspace,
+					const ArrayType &steps
+				) noexcept :
+					_lin_sys( &system ),
+					_finer_subspace( &finer_subspace ),
+					_steps( &steps ),
+					_num_neighbors( std::accumulate( steps.cbegin(), steps.cend(), 1UL,
+						std::multiplies< CoordType >() ) ), // product of all the ratios
+					_sys_iter( system.begin() ),
+					_subspace_iter( finer_subspace.begin() ),
+					_val( 0, 0, static_cast< ValueType >( 1 ) /
+						static_cast< ValueType >( _num_neighbors ) ) // weight shared by all nonzeroes
+				{
+					update_coords();
+				}
+
+				void update_coords() noexcept { // refreshes the cached nonzero after a move
+					_val._i = _sys_iter->get_linear_position(); // row: linear position of the coarser element
+					_val._j = coarse_rows_to_finer_col(); // column: linear index in the finer system
+				}
+
+				/**
+				 * Returns the linear index, in the finer system, of the element selected by the
+				 * current coarser coordinates and by the offset inside their finer subspace.
+				 */
+				ColumnIndexType coarse_rows_to_finer_col() const noexcept {
+					ColumnIndexType finer = 0;
+					ColumnIndexType s = 1; // stride of dimension i in the finer system
+					for( size_t i = 0; i < DIMS; i++ ) {
+						finer += s * _subspace_iter->get_position()[ i ]; // offset inside the block
+						s *= ( *_steps )[ i ]; // one coarser step spans steps[ i ] finer elements
+						finer += s * _sys_iter->get_position()[ i ]; // beginning of the block
+						s *= _lin_sys->get_sizes()[ i ]; // s becomes the stride of the next dimension
+					}
+					return finer;
+				}
+
+			};
+
+			/**
+			 * Builder object to create iterators that generate an averaging-coarsening
+			 * matrix.
+			 *
+			 * It is a facility to generate beginning and end iterators and abstract the
+			 * logic away from users.
+			 *
+			 * @tparam DIMS number of dimensions
+			 * @tparam CoordType type storing the coordinates and the sizes
+			 * @tparam ValueType type of the nonzero: it must be able to represent 1 /
+			 *                   <number of finer elements per coarser elements>
+			 */
+			template<
+				size_t DIMS,
+				typename CoordType,
+				typename ValueType
+			>
+			class AverageCoarsenerBuilder {
+
+				public:
+
+					typedef std::array< CoordType, DIMS > ArrayType;
+					typedef AverageGeneratorIterator< DIMS, CoordType, ValueType > Iterator;
+					typedef AverageCoarsenerBuilder< DIMS, CoordType, ValueType > SelfType;
+
+					/**
+					 * Construct a new AverageCoarsenerBuilder object from the sizes of the finer and
+					 * of the coarser system; each finer size must be an exact multiple (at least
+					 * twice) of the matching nonzero coarser size, or std::invalid_argument is thrown.
+					 */
+					AverageCoarsenerBuilder(
+						const ArrayType &_finer_sizes,
+						const ArrayType &_coarser_sizes
+					) :
+						system( _coarser_sizes.begin(), _coarser_sizes.end() ),
+						_finer_subspace( _coarser_sizes.cbegin(), _coarser_sizes.cend() ), // retargeted below
+						steps( DIMS )
+					{
+						for( size_t i = 0; i < DIMS; i++ ) {
+							// finer size MUST be an exact multiple (by at least 2) of coarser_size;
+							// the zero test comes first, so that the divisions are always defined
+							if( _coarser_sizes[ i ] == 0 || _finer_sizes[ i ] % _coarser_sizes[ i ] != 0
+								|| _finer_sizes[ i ] / _coarser_sizes[ i ] < 2 ) {
+								throw std::invalid_argument( std::string( "finer size of dimension " ) +
+									std::to_string( i ) + std::string( " is not an exact multiple of coarser size" ) );
+							}
+							steps[ i ] = _finer_sizes[ i ] / _coarser_sizes[ i ];
+						}
+						_finer_subspace.retarget( steps );
+					}
+
+				AverageCoarsenerBuilder( const SelfType & ) = delete;
+
+				AverageCoarsenerBuilder( SelfType && ) = delete;
+
+				SelfType & operator=( const SelfType & ) = delete;
+
+				SelfType & operator=( SelfType && ) = delete;
+
+				/**
+				 * Returns the size of the coarser system, i.e. its number of elements.
+				 */
+				size_t system_size() const {
+					return system.system_size();
+				}
+
+				/** Produces a beginning iterator to generate the coarsening matrix; it refers
+				 * to members of \c this builder, which must therefore outlive the iterator.
+				 */
+				Iterator make_begin_iterator() {
+					return Iterator( system, _finer_subspace, steps );
+				}
+
+				/**
+				 * Produces an end iterator to stop the generation of the coarsening
+				 * matrix; it refers to members of \c this builder, which must outlive it.
+				 */
+				Iterator make_end_iterator() {
+					Iterator result( system, _finer_subspace, steps );
+					// do not trigger boundary checks: jump by the total number of nonzeroes at once
+					result += ( system_size() * _finer_subspace.system_size() );
+					return result;
+				}
+
+
+			private:
+
+				const grb::utils::multigrid::LinearizedNDimSystem<
+					CoordType,
+					grb::utils::multigrid::ArrayVectorStorage< DIMS, CoordType >
+				> system;
+
+				grb::utils::multigrid::LinearizedNDimSystem<
+					CoordType,
+					grb::utils::multigrid::ArrayVectorStorage< DIMS, CoordType >
+				> _finer_subspace;
+
+				/**
+				 * Array of steps, i.e. how much each column coordinate (finer system) must
+				 * be incremented when incrementing the row coordinates; it is the ratio, per
+				 * dimension, between the finer sizes and the coarser sizes given at construction.
+				 */
+				grb::utils::multigrid::ArrayVectorStorage< DIMS, CoordType > steps;
+			};
+
+		} // namespace hpcg
+
+	} // namespace algorithms
+
+} // namespace grb
+
+#endif // _H_GRB_ALGORITHMS_AVERAGE_COARSENER
+
diff --git a/include/graphblas/algorithms/hpcg/greedy_coloring.hpp b/include/graphblas/algorithms/hpcg/greedy_coloring.hpp
new file mode 100644
index 000000000..366465c41
--- /dev/null
+++ b/include/graphblas/algorithms/hpcg/greedy_coloring.hpp
@@ -0,0 +1,191 @@
+
+/*
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file greedy_coloring.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * Utilities to partition the elements of a mesh via a simple, greedy coloring algorithm.
+ */
+
+#ifndef _H_GRB_ALGORITHMS_HPCG_GREEDY_COLORING
+#define _H_GRB_ALGORITHMS_HPCG_GREEDY_COLORING
+
+#include <cstddef>
+#include <vector>
+
+#include <graphblas/utils/multigrid/linearized_halo_ndim_system.hpp>
+
+namespace grb {
+	namespace algorithms {
+
+		/**
+		 * Coloring algorithm for matrix generated by a \p DIMS - dimensional system.
+		 *
+		 * This function implements a <b>greedy heuristic</b> to color the rows of a matrix generated by
+		 * a \p DIMS - dimensional generator \p system, so that no two connected elements \a i,j
+		 * in the system (corresponding to a nonzero \a (i,j) entry in the matrix) have the same color.
+		 * If \p reorder_rows_per_color is false (as per default), the coloring information is stored into
+		 * \p row_colors, while \p color_counters stores the number of rows for each color.
+		 *
+		 * If \p reorder_rows_per_color is true, the function performs the additional step of \b re-ordering
+		 * the rows depending on their color: rows of color \a 0 are moved first, then rows of color \a 1
+		 * are moved to the following positions and so on. In this case, \p row_colors stores the new row number
+		 * while \p color_counters stores at each position \a i the new position of the first row of color \a i.
+		 *
+		 * In both cases, \a color_counters.size() gives the number of found colors.
+		 *
+		 * This algorithm performs a \a global coloring of the input system, i.e. it must run on the entire system
+		 * \a before any partitioning occurs. Although this is not scalable, it should not be a problem for
+		 * most sizes, as the constants in front of this algorithms are very small. Implementing a distributed
+		 * coloring algorithm is anyway out of the scope of this prototype.
+		 *
+		 * Colors are by default assigned in a greedy way from the lowest one up, making this coloring scheme very
+		 * regular: close elements tend to have similar colors. This can be changed with \p lowest_color_first
+		 * \c = \c false , which assigns colors from the highest one. This may avoid "destructive interference"
+		 * with following coarsening schemes.
+		 *
+		 * @tparam DIMS dimensions of the system
+		 * @tparam CoordType type of the coordinates
+		 * @tparam lowest_color_first start greedy assignment of colors from lowest first
+		 *
+		 * @param[in] system generator for an \p DIMS - dimensional system with halo
+		 * @param[out] row_colors if \p reorder_rows_per_color is false, stores the color of each row;
+		 * 	if \p reorder_rows_per_color is true, stores the new position of each row, so that rows
+		 * 	of the same color are grouped together; the initial content of the vector is destroyed
+		 * @param[out] color_counters if \p reorder_rows_per_color is false, stores the number of rows per color;
+		 * 	if \p reorder_rows_per_color is true, stores at each position \a i the offset in \p color_counters
+		 * 	where the (clustered) rows of color \a i start from; the initial content of the vector is destroyed
+		 * @param[in] reorder_rows_per_color whether to do the clustering after the coloring
+		 */
+		template<
+			size_t DIMS,
+			typename CoordType,
+			bool lowest_color_first = true
+		> void hpcg_greedy_color_ndim_system(
+			const grb::utils::multigrid::LinearizedHaloNDimSystem< DIMS, CoordType > & system,
+			std::vector< CoordType > & row_colors,
+			std::vector< CoordType > & color_counters,
+			bool reorder_rows_per_color = false
+		) {
+			CoordType nrows = system.system_size();
+			// value `nrows' means `uninitialized'; initialized colors go from 0 to nrow-1
+			row_colors.assign( nrows, nrows ); // assign(), unlike insert(), discards the previous content
+			if( nrows == 0 ) {
+				color_counters.clear(); // no rows, hence no colors: row_colors[ 0 ] does not even exist
+				return;
+			}
+			CoordType totalColors = 1;
+			row_colors[ 0 ] = 0; // first point gets color 0
+
+			// Finds colors in a greedy (a likely non-optimal) fashion.
+			typename grb::utils::multigrid::LinearizedHaloNDimSystem< DIMS, CoordType >::Iterator begin = system.begin();
+			begin.next_element(); // skip first row
+
+			std::vector< bool > assigned( totalColors );
+			while( begin.has_more_elements() ) {
+				CoordType curRow = begin->get_element_linear();
+				if( row_colors[ curRow ] != nrows ) {
+					begin.next_element(); // color already assigned to curRow: advance, or this loop never ends
+					continue;
+				}
+				assigned.assign( totalColors, false );
+				CoordType currentlyAssigned = 0;
+
+				while( begin.has_more_neighbours() ) {
+					CoordType curCol = begin->get_neighbor_linear();
+					if( curCol < curRow ) {
+						assert( row_colors[ curCol ] < nrows ); // if curCol < curRow, curCol has already a color assigned
+						std::vector< bool >::reference color_is_assigned = assigned[ row_colors[ curCol ] ];
+						if( ! color_is_assigned ) {
+							// count how many colors are already assigned
+							(void)currentlyAssigned++;
+						}
+						// track which colors are assigned
+						color_is_assigned = true;
+					} // else // could take advantage of indices being sorted
+					begin.next_neighbour();
+				}
+
+				if( currentlyAssigned < totalColors ) {
+					// at least one existing color is unused by the neighbors: look for it
+					if( lowest_color_first ) {
+						// here, assign colors greedily starting from the lowest available one
+						for( CoordType j = 0; j < totalColors; ++j ) {
+							if( ! assigned[ j ] ) {
+								// if no neighbor with this color, use it for this row
+								row_colors[ curRow ] = j;
+								break;
+							}
+						}
+					} else {
+						// here, assign colors greedily starting from the highest available one
+						for( CoordType j = totalColors; j > 0; --j ) {
+							CoordType color = j - 1;
+							if( ! assigned[ color ] ) {
+								// if no neighbor with this color, use it for this row
+								row_colors[ curRow ] = color;
+								break;
+							}
+						}
+					}
+				} else {
+					// every existing color is used by some neighbor: open a new one
+					assert( row_colors[ curRow ] == nrows ); // guaranteed by the check at the top of the loop
+					row_colors[ curRow ] = totalColors;
+					(void)totalColors++;
+				}
+				begin.next_element();
+			}
+
+#ifdef _DEBUG
+			std::cout << "assigned colors: " << totalColors << " [ <row> -> <color>]\n";
+			for( size_t i = 0; i < row_colors.size(); i++ ) {
+				std::cout << i << " -> " << row_colors[ i ] << ", ";
+			}
+			std::cout << std::endl;
+#endif
+
+			// count number of vertices per color
+			color_counters.assign( totalColors, 0 ); // assign() discards the previous content
+			for( CoordType i = 0; i < nrows; ++i ) {
+				(void)color_counters[ row_colors[ i ] ]++;
+			}
+
+			if( ! reorder_rows_per_color ) {
+				return;
+			}
+
+			// form in-place prefix scan
+			CoordType old = 0, old0;
+			for( CoordType i = 1; i < totalColors; ++i ) {
+				old0 = color_counters[ i ];
+				color_counters[ i ] = color_counters[ i - 1 ] + old;
+				old = old0;
+			}
+			color_counters[ 0 ] = 0;
+
+			// translate `colors' into a permutation
+			// NOTE(review): the post-increment leaves color_counters[ c ] one past the last position of color c, not at its first one -- confirm with callers
+			for( CoordType i = 0; i < nrows; ++i ) {
+				row_colors[ i ] = color_counters[ row_colors[ i ] ]++;
+			}
+		}
+
+	} // namespace algorithms
+} // namespace grb
+
+#endif // _H_GRB_ALGORITHMS_HPCG_GREEDY_COLORING
diff --git a/include/graphblas/algorithms/hpcg/hpcg.hpp b/include/graphblas/algorithms/hpcg/hpcg.hpp
deleted file mode 100644
index 6caf22a1c..000000000
--- a/include/graphblas/algorithms/hpcg/hpcg.hpp
+++ /dev/null
@@ -1,228 +0,0 @@
-
-/*
- *   Copyright 2021 Huawei Technologies Co., Ltd.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * @file hpcg.hpp
- * @author Alberto Scolari (alberto.scolari@huawei.com)
- * @brief File with the main routine to run a full HPCG simulation, comprising multi-grid runs
- *        with Red-Black Gauss-Seidel smoothing.
- * @date 2021-04-30
- */
-
-#ifndef _H_GRB_ALGORITHMS_HPCG
-#define _H_GRB_ALGORITHMS_HPCG
-
-#include <graphblas.hpp>
-
-#include "hpcg_data.hpp"
-#include "multigrid_v_cycle.hpp"
-
-
-namespace grb {
-	namespace algorithms {
-
-		/**
-		 * @brief High-Performance Conjugate Gradient algorithm implementation running entirely on GraphBLAS.
-		 *
-		 * Finds the solution x of an \f$ A x = b \f$ algebraic system by running the HPCG algorithm.
-		 * The implementation here closely follows the reference HPCG benchmark used for the HPCG500 rank,
-		 * visible at https://github.com/hpcg-benchmark/hpcg.
-		 * The only difference is the usage of a Red-Black Gauss-Seidel smoother instead of the standard one
-		 * for performance reasons, as the standard Gauss-Seidel algorithm is inherently sequential and not
-		 * expressible in terms of standard linear algebra operations.
-		 * In particular, this implementation (as the standard one) couples a standard CG algorithm with a V-cycle
-		 * multi-grid solver to initially refine the tentative solution. This refinement step depends on the
-		 * availability of coarsening information, which should be stored inside \p data; otherwise,
-		 * the refinement is not performed and only the CG algorithm is run. For more information on inputs
-		 * and on coarsening information, you may consult the \ref hpcg_data class documentation.
-		 *
-		 * This implementation assumes that the vectors and matrices inside \p data are all correctly initialized
-		 * and populated with the proper values; in particular
-		 * - hpcg_data#x with the initial tentative solution (iterative solutions are also stored here)
-		 * - hpcg_data#A with the system matrix
-		 * - hpcg_data#b with the right-hand side vector \f$ b \f$
-		 * - hpcg_data#A_diagonal with the diagonal values of the matrix
-		 * - hpcg_data#color_masks with the color masks for this level
-		 * - hpcg_data#coarser_level with the information for the coarser multi-grid run (if any)
-		 * The other vectors are assumed to be inizialized (via the usual grb::Vector#Vector(size_t) constructor)
-		 * but not necessarily populated with values, as they are internally populated when needed; hence,
-		 * any previous values are overwritten.
-		 *
-		 * Failuers of GraphBLAS operations are handled by immediately stopping the execution and by returning
-		 * the failure code.
-		 *
-		 * @tparam IOType type of result and intermediate vectors used during computation
-		 * @tparam ResidualType type of the residual norm
-		 * @tparam NonzeroType type of matrix values
-		 * @tparam InputType type of values of the right-hand side vector b
-		 * @tparam Ring the ring of algebraic operators zero-values
-		 * @tparam Minus the minus operator for subtractions
-		 *
-		 * @param[in,out] data \ref hpcg_data object storing inputs, outputs and temporary vectors used for the computation,
-		 *                     as long as the information for the recursive multi-grid runs
-		 * @param[in] with_preconditioning whether to use pre-conditioning, i.e. to perform multi-grid runs
-		 * @param[in] presmoother_steps number of pre-smoother steps, for multi-grid runs
-		 * @param[in] postsmoother_steps nomber of post-smoother steps, for multi-grid runs
-		 * @param[in] max_iterations maximum number if iterations the simulation may run for; once reached,
-		 *                           the simulation stops even if the residual norm is above \p tolerance
-		 * @param[in] tolerance the tolerance over the residual norm, i.e. the value of the residual norm to stop
-		 *                      the simulation at
-		 * @param[out] iterations numbers of iterations performed
-		 * @param[out] norm_residual norm of the final residual
-		 * @param[in] ring the ring to perform the operations on
-		 * @param[in] minus the \f$ - \f$ operator for vector subtractions
-		 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
-		 *                          unsuccessful operation otherwise
-		 */
-		template< typename IOType,
-			typename ResidualType,
-			typename NonzeroType,
-			typename InputType,
-			class Ring = Semiring< grb::operators::add< IOType >, grb::operators::mul< IOType >, grb::identities::zero, grb::identities::one >,
-			class Minus = operators::subtract< IOType > >
-		grb::RC hpcg( hpcg_data< IOType, NonzeroType, InputType > &data,
-			bool with_preconditioning,
-			const size_t presmoother_steps,
-			const size_t postsmoother_steps,
-			const size_t max_iterations,
-			const ResidualType tolerance,
-			size_t &iterations,
-			ResidualType &norm_residual,
-			const Ring &ring = Ring(),
-			const Minus &minus = Minus()
-		) {
-			ResidualType alpha;
-
-			const grb::Matrix< NonzeroType > &A { data.A };
-			grb::Vector< IOType > &x { data.x };
-			const grb::Vector< InputType > &b { data.b };
-			grb::Vector< IOType > &r { data.r };  // residual vector
-			grb::Vector< IOType > &p { data.p };  // direction vector
-			grb::Vector< IOType > &Ap { data.u }; // temp vector
-			grb::Vector< IOType > &z { data.z };  // pre-conditioned residual vector
-			grb::RC ret { SUCCESS };
-
-			ret = ret ? ret : grb::set( Ap, 0 );
-			ret = ret ? ret : grb::set( r, 0 );
-			ret = ret ? ret : grb::set( p, 0 );
-
-			ret = ret ? ret : grb::set( p, x );
-			ret = ret ? ret : grb::mxv( Ap, A, x, ring ); // Ap = A * x
-			assert( ret == SUCCESS );
-
-			ret = ret ? ret : grb::eWiseApply( r, b, Ap, minus ); // r = b - Ap;
-			assert( ret == SUCCESS );
-
-			norm_residual = ring.template getZero< ResidualType >();
-			ret = ret ? ret : grb::dot( norm_residual, r, r, ring ); // norm_residual = r' * r;
-			assert( ret == SUCCESS );
-
-			// compute sqrt to avoid underflow
-			norm_residual = std::sqrt( norm_residual );
-
-			// initial norm of residual
-			const ResidualType norm_residual_initial { norm_residual };
-			ResidualType old_r_dot_z { 0.0 }, r_dot_z { 0.0 }, beta { 0.0 };
-			size_t iter { 0 };
-
-#ifdef HPCG_PRINT_STEPS
-			DBG_print_norm( p, "start p" );
-			DBG_print_norm( Ap, "start Ap" );
-			DBG_print_norm( r, "start r" );
-#endif
-
-			do {
-#ifdef HPCG_PRINT_STEPS
-				DBG_println( "========= iteration " << iter << " =========" );
-#endif
-				if( with_preconditioning ) {
-					ret = ret ? ret : internal::multi_grid( data, data.coarser_level, presmoother_steps, postsmoother_steps, ring, minus );
-					assert( ret == SUCCESS );
-				} else {
-					ret = ret ? ret : grb::set( z, r ); // z = r;
-					assert( ret == SUCCESS );
-				}
-#ifdef HPCG_PRINT_STEPS
-				DBG_print_norm( z, "initial z" );
-#endif
-
-				ResidualType pAp;
-
-				if( iter == 0 ) {
-					ret = ret ? ret : grb::set( p, z ); //  p = z;
-					assert( ret == SUCCESS );
-
-					ret = ret ? ret : grb::dot( r_dot_z, r, z, ring ); // r_dot_z = r' * z;
-					assert( ret == SUCCESS );
-				} else {
-					old_r_dot_z = r_dot_z;
-
-					r_dot_z = ring.template getZero< ResidualType >();
-					ret = ret ? ret : grb::dot( r_dot_z, r, z, ring ); // r_dot_z = r' * z;
-					assert( ret == SUCCESS );
-
-					beta = r_dot_z / old_r_dot_z;
-					ret = ret ? ret : grb::clear( Ap );                         // Ap  = 0;
-					ret = ret ? ret : grb::eWiseMulAdd( Ap, beta, p, z, ring ); // Ap += beta * p + z;
-					std::swap( Ap, p );                                         // p = Ap;
-					assert( ret == SUCCESS );
-				}
-#ifdef HPCG_PRINT_STEPS
-				DBG_print_norm( p, "middle p" );
-#endif
-
-				ret = ret ? ret : grb::set( Ap, 0 );
-				ret = ret ? ret : grb::mxv( Ap, A, p, ring ); // Ap = A * p;
-				assert( ret == SUCCESS );
-#ifdef HPCG_PRINT_STEPS
-				DBG_print_norm( Ap, "middle Ap" );
-#endif
-				pAp = static_cast< ResidualType >( 0.0 );
-				ret = ret ? ret : grb::dot( pAp, Ap, p, ring ); // pAp = p' * Ap
-				assert( ret == SUCCESS );
-
-				alpha = r_dot_z / pAp;
-
-				ret = ret ? ret : grb::eWiseMul( x, alpha, p, ring ); // x += alpha * p;
-				assert( ret == SUCCESS );
-#ifdef HPCG_PRINT_STEPS
-				DBG_print_norm( x, "end x" );
-#endif
-
-				ret = ret ? ret : grb::eWiseMul( r, -alpha, Ap, ring ); // r += - alpha * Ap;
-				assert( ret == SUCCESS );
-#ifdef HPCG_PRINT_STEPS
-				DBG_print_norm( r, "end r" );
-#endif
-
-				norm_residual = static_cast< ResidualType >( 0.0 );
-				ret = ret ? ret : grb::dot( norm_residual, r, r, ring ); // residual = r' * r;
-				assert( ret == SUCCESS );
-
-				norm_residual = std::sqrt( norm_residual );
-
-				++iter;
-			} while( iter < max_iterations && norm_residual / norm_residual_initial > tolerance && ret == SUCCESS );
-
-			iterations = iter;
-			return ret;
-		}
-
-	} // namespace algorithms
-} // namespace grb
-
-#endif // _H_GRB_ALGORITHMS_HPCG
diff --git a/include/graphblas/algorithms/hpcg/hpcg_data.hpp b/include/graphblas/algorithms/hpcg/hpcg_data.hpp
deleted file mode 100644
index 96b39856d..000000000
--- a/include/graphblas/algorithms/hpcg/hpcg_data.hpp
+++ /dev/null
@@ -1,195 +0,0 @@
-
-/*
- *   Copyright 2021 Huawei Technologies Co., Ltd.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * @file hpcg_data.hpp
- * @author Alberto Scolari (alberto.scolari@huawei.com)
- * @brief Data structures to store HPCG input/output data.
- * @date 2021-04-30
- */
-
-#ifndef _H_GRB_ALGORITHMS_HPCG_DATA
-#define _H_GRB_ALGORITHMS_HPCG_DATA
-
-#include <vector>
-#include <cstddef>
-
-#include <graphblas.hpp>
-
-
-namespace grb {
-
-	namespace algorithms {
-
-		/**
-		 * @brief basic data container for the HPCG algorithm, storing \b only the
-		 * data in common between the full CG run and the V-cycle multi-grid solver.
-		 * Additional data are stored in inheriting daata structures.
-		 *
-		 * @tparam IOType type of values of the vectors for intermediate results
-		 * @tparam NonzeroType type of the values stored inside the system matrix #A
-		 */
-		template< typename IOType, typename NonzeroType >
-		struct system_data {
-
-			const std::size_t system_size; ///< size of the system, i.e. side of the #A
-
-			grb::Matrix< NonzeroType > A;                   ///< system matrix
-			grb::Vector< IOType > A_diagonal;               ///< vector with the diagonal of #A
-			grb::Vector< IOType > z;                        ///< multi-grid solution
-			grb::Vector< IOType > r;                        ///< residual
-			grb::Vector< IOType > smoother_temp;            ///< for smoother's intermediate results
-			std::vector< grb::Vector< bool > > color_masks; ///< for color masks
-
-			/**
-			 * @brief Constructor building all the stored vectors and matrices.
-			 *
-			 * Stored vectors and matrices are constructed according to \p sys_size but \b not initialized
-			 * to any value internally, as initialization is up to users's code.
-			 *
-			 * @param[in] sys_size the size of the underlying physical system, i.e. the size of vectors and the number
-			 * of rows and columns of the #A matrix.
-			 */
-			system_data( std::size_t sys_size ) :
-				system_size( sys_size ), A( sys_size, sys_size ), A_diagonal( sys_size ), z( sys_size ), r( sys_size ),
-				// temp(sys_size),
-				smoother_temp( sys_size ) {}
-
-			// for safety, disable copy semantics
-			system_data( const system_data & o ) = delete;
-
-			system_data & operator=( const system_data & ) = delete;
-		};
-
-		/**
-		 * @brief Data container for all multi-grid inputs and outputs.
-		 *
-		 * @tparam IOType Type of values of the vectors for intermediate results
-		 * @tparam NonzeroType Type of the values stored inside the system matrix \p A
-		 *                     and the coarsening matrix #Ax_finer
-		 *
-		 * This data structure stores information for a full multi-grid V cycle, i.e.
-		 * - input and output vectors for solution, residual and temporary vectors
-		 * - coarsening information, in particular the #coarsening_matrix that
-		 *   coarsens a larger system of size #finer_size to the current system
-		 *   of size #system_size
-		 * - the next level of coarsening, pointed to by #coarser_level, possibly being \c nullptr
-		 *   if no further coarsening is desired; note that this information is automatically
-		 *   destructed on object destruction (if any)
-		 *
-		 * Vectors stored here refer to the \b coarsened system (with the exception of #Ax_finer),
-		 * thus having size #system_size; this also holds for the system matrix #A,
-		 * while #coarsening_matrix has size #system_size \f$ \times \f$ #finer_size.
-		 * Hence, the typical usage of this data structure is to coarsen \b external vectors, e.g. vectors
-		 * coming from another \code multi_grid_data<IOType, NonzeroType> \endcode object whose #system_size equals
-		 * \code this-> \endcode #fines_size, via \code this-> \endcode #coarsening_matrix and store the coarsened
-		 * vectors internally. Mimicing the recursive behavior of standard multi-grid simulations,
-		 * the information for a further coarsening is stored inside #coarser_level, so that the
-		 * hierarchy of coarsened levels is reflected inside this data structure.
-		 *
-		 * As for \ref system_data, internal vectors and matrices are initialized to the proper size,
-		 * but their values are \b not initialized.
-		 */
-		template< typename IOType, typename NonzeroType >
-		struct multi_grid_data : public system_data< IOType, NonzeroType > {
-
-			const std::size_t finer_size; ///< ssize of the finer system to coarse from;
-			///< typically \c finer_size \code == 8 * \endcode #system_size
-
-			grb::Vector< IOType > Ax_finer; ///< finer vector for intermediate computations, of size #finer_size
-
-			grb::Matrix< NonzeroType > coarsening_matrix; ///< matrix of size #system_size \f$ \times \f$ #finer_size
-			///< to coarsen an input vector of size #finer_size into a vector of size #system_size
-
-			struct multi_grid_data< IOType, NonzeroType > * coarser_level; ///< pointer to next coarsening level, for recursive
-			                                                               ///< multi-grid V cycle implementations
-
-			/**
-			 * @brief Construct a new \c multi_grid_data_object by initializing internal data structures and setting
-			 *        #coarser_level to \c nullptr.
-			 * @param[in] coarser_size size of the current system, i.e. size \b after coarsening
-			 * @param[in] _finer_size  size of the finer system, i.e. size of external objects \b before coarsening
-			 */
-			multi_grid_data( std::size_t coarser_size, std::size_t _finer_size ) :
-				system_data< IOType, NonzeroType >( coarser_size ), finer_size( _finer_size ), Ax_finer( finer_size ), coarsening_matrix( coarser_size, finer_size ) {
-				coarser_level = nullptr;
-			}
-
-			/**
-			 * @brief Destroys the \c multi_grid_data_object object by destroying #coarser_level.
-			 */
-			virtual ~multi_grid_data() {
-				if( coarser_level != nullptr ) {
-					delete coarser_level;
-				}
-			}
-		};
-
-		/**
-		 * @brief Data stucture to store the data for a full HPCG run: system vectors and matrix,
-		 * coarsening information and temporary vectors.
-		 *
-		 * This data structures contains all the needed vectors and matrices to solve a linear system
-		 * \f$ A x = b \f$. As for \ref system_data, internal elements are built and their sizes properly initialized
-		 * to #system_size, but internal values are \b not initialized, as they are left to user's logic.
-		 * Similarly, the coarsening information in #coarser_level is to be initialized by users by properly
-		 * building a \code multi_grid_data<IOType, NonzeroType> \endcode object and storing its pointer into
-		 * #coarser_level; on destruction, #coarser_level will also be properly destroyed without
-		 * user's intervention.
-		 *
-		 * @tparam IOType type of values of the vectors for intermediate results
-		 * @tparam NonzeroType type of the values stored inside the system matrix #A
-		 * @tparam InputType type of the values of the right-hand side vector #b
-		 */
-		template< typename IOType, typename NonzeroType, typename InputType >
-		struct hpcg_data : public system_data< IOType, NonzeroType > {
-
-			grb::Vector< InputType > b; ///< right-side vector of known values
-			grb::Vector< IOType > u;    ///< temporary vectors (typically for CG exploration directions)
-			grb::Vector< IOType > p;    ///< temporary vector (typically for x refinements coming from the multi-grid run)
-			grb::Vector< IOType > x;    // system solution being refined over the iterations: it us up to the user
-			///< to set the initial solution value
-
-			struct multi_grid_data< IOType, NonzeroType > * coarser_level; ///< information about the coarser system, for
-			                                                               ///< the multi-grid run
-
-			/**
-			 * @brief Construct a new \c hpcg_data object by building vectors and matrices and by setting
-			 * #coarser_level to \c nullptr (i.e. no coarser level is assumed).
-			 *
-			 * @param[in] sys_size the size of the simulated system, i.e. of all the internal vectors and matrices
-			 */
-			hpcg_data( std::size_t sys_size ) : system_data< IOType, NonzeroType >( sys_size ), b( sys_size ), u( sys_size ), p( sys_size ), x( sys_size ) {
-				coarser_level = nullptr;
-			}
-
-			/**
-			 * @brief Destroy the \c hpcg_data object by destroying the #coarser_level informartion, if any.
-			 */
-			virtual ~hpcg_data() {
-				if( coarser_level != nullptr ) {
-					delete coarser_level;
-				}
-			}
-		};
-
-	} // namespace algorithms
-
-} // namespace grb
-
-#endif // _H_GRB_ALGORITHMS_HPCG_DATA
-
diff --git a/include/graphblas/algorithms/hpcg/matrix_building_utils.hpp b/include/graphblas/algorithms/hpcg/matrix_building_utils.hpp
deleted file mode 100644
index 1facabe49..000000000
--- a/include/graphblas/algorithms/hpcg/matrix_building_utils.hpp
+++ /dev/null
@@ -1,170 +0,0 @@
-
-/*
- *   Copyright 2021 Huawei Technologies Co., Ltd.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * @file hpcg_matrix_building_utils.hpp
- * @author Alberto Scolari (alberto.scolari@huawei.com)
- * @brief Utilities to build the matrices for HPCG simulations in an arbitrary number of dimensions.
- * @date 2021-04-30
- */
-
-#ifndef _H_GRB_ALGORITHMS_MATRIX_BUILDING_UTILS
-#define _H_GRB_ALGORITHMS_MATRIX_BUILDING_UTILS
-
-#include <algorithm>
-#include <array>
-#include <cassert>
-#include <numeric>
-#include <stdexcept>
-#include <utility>
-
-#include <graphblas.hpp>
-
-#include "ndim_matrix_builders.hpp"
-
-
-namespace grb {
-	namespace algorithms {
-
-		/**
-		 * @brief Builds a \p DIMS -dimensional system matrix for HPCG simulation.
-		 *
-		 * This routine initializes \p M to a matrix representing a \p DIMS -dimensions system of sizes
-		 * \p sys_sizes, with an iteration halo of size \p halo_size . The matrix diagonal values are initialized
-		 * to \p diag_value while the other non-zero values are initialized to \p non_diag_value .
-		 *
-		 * @tparam DIMS system dimensions
-		 * @tparam T type of matrix values
-		 * @tparam B matrix GraphBLAS backend
-		 * @param M the matrix to be initialized; it must be already constructed
-		 * @param sys_sizes the sizes of the physical system
-		 * @param halo_size the size of the halo of point to iterate in
-		 * @param diag_value diagonal value
-		 * @param non_diag_value value outside of the diagonal
-		 * @return grb::RC the success value returned when trying to build the matrix
-		 */
-		template< std::size_t DIMS, typename T, enum grb::Backend B >
-		grb::RC build_ndims_system_matrix( grb::Matrix< T, B > & M, const std::array< std::size_t, DIMS > & sys_sizes, std::size_t halo_size, T diag_value, T non_diag_value ) {
-			static_assert( DIMS > 0, "DIMS must be > 0" );
-			std::size_t n { std::accumulate( sys_sizes.cbegin(), sys_sizes.cend(), 1UL, std::multiplies< std::size_t >() ) };
-			if( grb::nrows( M ) != n || grb::nrows( M ) != grb::ncols( M ) ) {
-				throw std::invalid_argument( "wrong matrix dimensions: matrix should "
-											"be square"
-											" and in accordance with given system "
-											"sizes" );
-			}
-			grb::algorithms::matrix_generator_iterator< DIMS, T > begin( sys_sizes, 0UL, halo_size, diag_value, non_diag_value );
-			grb::algorithms::matrix_generator_iterator< DIMS, T > end( sys_sizes, n, halo_size, diag_value, non_diag_value );
-			return buildMatrixUnique( M, begin, end, grb::IOMode::SEQUENTIAL );
-		}
-
-		/**
-		 * @brief Builds a coarsener matrix for an HPCG simulation.
-		 *
-		 * It initializes \p M as a rectangular matrix, with rows corresponding to the coarser system
-		 * (of dimensions \p coarser_sizes - output) and columns corresponding to the finer system
-		 * (of dimensions \p finer_sizes - input). The resulting coarsening matrix takes in input the finer system
-		 * and coarsens it by keeping one element every \a S , where \a S is the ratio between the finer and
-		 * the coarser dimension (computed for each dimension). In this way each \p DIMS -dimensional finer element
-		 * corresponds to its bounding coarser element.
-		 *
-		 * For the coarsening to be feasible, the sizes of the finer system \b must be a multiple of those of the
-		 * coarser system. If this condition is not met, an exception is thrown.
-		 *
-		 * @tparam DIMS system dimensions
-		 * @tparam T type of matrix values
-		 * @tparam B matrix GraphBLAS backend
-		 * @param M the matrix to be initialized; it must be already constructed with proper dimensions
-		 * @param coarser_sizes sizes of the coarser system
-		 * @param finer_sizes sizes of the finer system; each one \b must be a multiple of the corresponding value
-		 *                    in \p coarser_size , otherwise an exception is thrown
-		 * @return grb::RC the success value returned when trying to build the matrix
-		 */
-		template< std::size_t DIMS, typename T, enum grb::Backend B >
-		grb::RC build_ndims_coarsener_matrix( grb::Matrix< T, B > & M, const std::array< std::size_t, DIMS > & coarser_sizes, const std::array< std::size_t, DIMS > & finer_sizes ) {
-			static_assert( DIMS > 0, "DIMS must be > 0" );
-			std::size_t const rows { std::accumulate( coarser_sizes.cbegin(), coarser_sizes.cend(), 1UL, std::multiplies< std::size_t >() ) };
-			for( std::size_t i { 0 }; i < coarser_sizes.size(); i++ ) {
-				std::size_t step = finer_sizes[ i ] / coarser_sizes[ i ];
-				if( step * coarser_sizes[ i ] != finer_sizes[ i ] ) {
-					throw std::invalid_argument( "finer sizes should be a multiple of "
-												"coarser sizes" );
-				}
-			}
-			std::size_t const cols { std::accumulate( finer_sizes.cbegin(), finer_sizes.cend(), 1UL, std::multiplies< std::size_t >() ) };
-			if( grb::nrows( M ) != rows || grb::ncols( M ) != cols ) {
-				throw std::invalid_argument( "wrong matrix dimensions: matrix should "
-											"be rectangular"
-											" with rows == <product of coarser sizes> "
-											"and cols == <product of finer sizes>" );
-			}
-
-			grb::algorithms::coarsener_generator_iterator< DIMS, T > begin( coarser_sizes, finer_sizes, 0 );
-			grb::algorithms::coarsener_generator_iterator< DIMS, T > end( coarser_sizes, finer_sizes, rows );
-			return buildMatrixUnique( M, begin, end, grb::IOMode::SEQUENTIAL );
-		}
-
-		/**
-		 * @brief Populates \p masks with static color mask generated for a squared matrix of size \p matrix_size .
-		 *
-		 * Colors are built in the range [0, \p colors ), with the mask for color 0 being the array
-		 * of values true in the positions \f$ [0, colors, 2*colors, ..., floor((system_size - 1)/colors) * color] \f$,
-		 * for color 1 in the positions \f$ [1, 1+colors, 1+2*colors, ..., floor((system_size - 2)/colors) * color] \f$,
-		 * etc.; the mask for color 0 is in \c masks[0], for color 1 in \c masks[1] and so on.
-		 *
-		 * The vectors stored in \p masks (assumed empty at the beginning) are built inside the function and populated
-		 * only with the \c true values, leading to sparse vectors. This saves on storage space and allows
-		 * GraphBLAS routines (like \c eWiseLambda() ) to iterate only on true values.
-		 *
-		 * @tparam B GraphBLAS backend for the vector
-		 * @param masks output vector of color masks
-		 * @param matrix_size size of the system matrix
-		 * @param colors numbers of colors masks to build; it must be < \p matrix_size
-		 * @return grb::RC the success value returned when trying to build the vector
-		 */
-		template< enum grb::Backend B >
-		grb::RC build_static_color_masks( std::vector< grb::Vector< bool, B > > & masks, std::size_t matrix_size, std::size_t colors ) {
-			if( ! masks.empty() ) {
-				throw std::invalid_argument( "vector of masks is expected to be "
-											"empty" );
-			}
-			if( matrix_size < colors ) {
-				throw std::invalid_argument( "syztem size is < number of colors: too "
-											"small" );
-			}
-			grb::RC rc { grb::SUCCESS };
-			masks.reserve( colors );
-			for( std::size_t i { 0U }; i < colors; i++ ) {
-				// build in-place, assuming the compiler deduces the right constructor according to B
-				masks.emplace_back( matrix_size );
-				grb::Vector< bool > & mask = masks.back();
-				// grb::set(mask, false); // DO NOT initialize false's explicitly, otherwise
-				// RBGS will touch them too and the runtime will increase!
-				for( std::size_t j = i; j < matrix_size; j += colors ) {
-					rc = grb::setElement( mask, true, j );
-					assert( rc == grb::SUCCESS );
-					if( rc != grb::SUCCESS )
-						return rc;
-				}
-			}
-			return rc;
-		}
-
-	} // namespace algorithms
-} // namespace grb
-
-#endif // _H_GRB_ALGORITHMS_MATRIX_BUILDING_UTILS
diff --git a/include/graphblas/algorithms/hpcg/multigrid_v_cycle.hpp b/include/graphblas/algorithms/hpcg/multigrid_v_cycle.hpp
deleted file mode 100644
index f40296f91..000000000
--- a/include/graphblas/algorithms/hpcg/multigrid_v_cycle.hpp
+++ /dev/null
@@ -1,252 +0,0 @@
-
-/*
- *   Copyright 2021 Huawei Technologies Co., Ltd.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * @file multigrid_v_cycle.hpp
- * @author Alberto Scolari (alberto.scolari@huawei.com)
- * @brief This file contains the routines for multi-grid solution refinement, including the main routine
- *        and those for coarsening and refinement of the tentative solution.
- * @date 2021-04-30
- */
-
-#ifndef _H_GRB_ALGORITHMS_MULTIGRID_V_CYCLE
-#define _H_GRB_ALGORITHMS_MULTIGRID_V_CYCLE
-
-#include <cassert>
-#include <vector>
-
-#include <graphblas.hpp>
-
-#include "hpcg_data.hpp"
-#include "red_black_gauss_seidel.hpp"
-
-
-namespace grb {
-	namespace algorithms {
-		/**
-		 * @brief Namespace for interfaces that should not be used outside of the algorithm namespace.
-		 */
-		namespace internal {
-
-			/**
-			 * @brief computes the coarser residual vector \p coarsening_data.r by coarsening
-			 *        \p coarsening_data.Ax_finer - \p r_fine via \p coarsening_data.coarsening_matrix.
-			 *
-			 * The coarsening information are stored inside \p coarsening_data.
-			 *
-			 * @tparam IOType type of result and intermediate vectors used during computation
-			 * @tparam NonzeroType type of matrix values
-			 * @tparam Ring the ring of algebraic operators zero-values
-			 * @tparam Minus the minus operator for subtractions
-			 *
-			 * @param[in] r_fine fine residual vector
-			 * @param[in,out] coarsening_data \ref multi_grid_data data structure storing the information for coarsening
-			 * @param[in] ring the ring to perform the operations on
-			 * @param[in] minus the \f$ - \f$ operator for vector subtractions
-			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
-			 *                          unsuccessful operation otherwise
-			 */
-			template< typename IOType,
-				typename NonzeroType,
-				class Ring,
-				class Minus >
-			grb::RC compute_coarsening( const grb::Vector< IOType > & r_fine, // fine residual
-				struct multi_grid_data< IOType, NonzeroType > & coarsening_data,
-				const Ring & ring,
-				const Minus & minus ) {
-				RC ret { SUCCESS };
-				ret = ret ? ret : grb::eWiseApply( coarsening_data.Ax_finer, r_fine, coarsening_data.Ax_finer,
-									  minus ); // Ax_finer = r_fine - Ax_finer
-				assert( ret == SUCCESS );
-
-				// actual coarsening, from  ncols(*coarsening_data->A) == *coarsening_data->system_size * 8
-				// to *coarsening_data->system_size
-				ret = ret ? ret : grb::set( coarsening_data.r, 0 );
-				ret = ret ? ret : grb::mxv( coarsening_data.r, coarsening_data.coarsening_matrix, coarsening_data.Ax_finer,
-									  ring ); // r = coarsening_matrix * Ax_finer
-				return ret;
-			}
-
-			/**
-			 * @brief computes the prolongation of the coarser solution \p coarsening_data.z and stores it into
-			 * \p x_fine.
-			 *
-			 * For prolongation, this function uses the matrix \p coarsening_data.coarsening_matrix by transposing it.
-			 *
-			 * @tparam IOType type of result and intermediate vectors used during computation
-			 * @tparam NonzeroType type of matrix values
-			 * @tparam Ring the ring of algebraic operators zero-values
-			 *
-			 * @param[out] x_fine the solution vector to store the prolonged solution into
-			 * @param[in,out] coarsening_data information for coarsening
-			 * @param[in] ring the ring to perform the operations on
-			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
-			 * unsuccessful operation otherwise
-			 */
-			template< typename IOType,
-				typename NonzeroType,
-				class Ring >
-			grb::RC compute_prolongation( grb::Vector< IOType > & x_fine, // fine residual
-				struct multi_grid_data< IOType, NonzeroType > & coarsening_data,
-				const Ring & ring ) {
-				RC ret { SUCCESS };
-				// actual refining, from  *coarsening_data->syztem_size == nrows(*coarsening_data->A) / 8
-				// to nrows(x_fine)
-				ret = ret ? ret : set( coarsening_data.Ax_finer, 0 );
-
-				ret = ret ? ret : grb::mxv< grb::descriptors::transpose_matrix >( coarsening_data.Ax_finer, coarsening_data.coarsening_matrix, coarsening_data.z, ring );
-				assert( ret == SUCCESS );
-
-				ret = ret ? ret : grb::foldl( x_fine, coarsening_data.Ax_finer, ring.getAdditiveMonoid() ); // x_fine += Ax_finer;
-				assert( ret == SUCCESS );
-				return ret;
-			}
-
-			/**
-			 * @brief Runs \p smoother_steps iteration of the Red-Black Gauss-Seidel smoother, with inputs and outputs stored
-			 * inside \p data.
-			 *
-			 * @tparam IOType type of result and intermediate vectors used during computation
-			 * @tparam NonzeroType type of matrix values
-			 * @tparam Ring the ring of algebraic operators zero-values
-			 *
-			 * @param[in,out] data \ref system_data data structure with relevant inpus and outputs: system matrix, initial solution,
-			 *                     residual, system matrix colors, temporary vectors
-			 * @param[in] smoother_steps how many smoothing steps to run
-			 * @param[in] ring the ring to perform the operations on
-			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
-			 *                          unsuccessful operation otherwise
-			 */
-			template< typename IOType, typename NonzeroType, class Ring >
-			grb::RC run_smoother( system_data< IOType, NonzeroType > & data, const std::size_t smoother_steps, const Ring & ring ) {
-				RC ret { SUCCESS };
-
-				for( std::size_t i { 0 }; i < smoother_steps && ret == SUCCESS; i++ ) {
-					ret = ret ? ret : red_black_gauss_seidel( data, ring );
-					assert( ret == SUCCESS );
-				}
-				return ret;
-			}
-
-			/**
-			 * @brief Multi-grid V cycle implementation to refine a given solution.
-			 *
-			 * A full multi-grid run goes through the following steps:
-			 * -# if \p presmoother_steps \f$ > 0 \f$, \p presmoother_steps of the Red-Black Gauss-Seidel smoother are run
-			 *    to improve on the initial solution stored into \p data.z
-			 * -# the coarsening of \f$ r - A*z \f$ is computed to find the coarser residual vector
-			 * -# a multi-grid run is recursively performed on the coarser system
-			 * -# the tentative solution from the coarser multi-grid run is prolonged and added to the current tentative solution
-			 *    into \p data.z
-			 * -# this solution is further smoothed for \p postsmoother_steps steps
-			 *
-			 * If coarsening information is not available, the multi-grid run consists in a single smmothing run.
-			 *
-			 * Failuers of GraphBLAS operations are handled by immediately stopping the execution and by returning
-			 * the failure code.
-			 *
-			 * @tparam IOType type of result and intermediate vectors used during computation
-			 * @tparam NonzeroType type of matrix values
-			 * @tparam Ring the ring of algebraic operators zero-values
-			 * @tparam Minus the minus operator for subtractions
-			 *
-			 * @param[in,out] data \ref multi_grid_data object storing the relevant data for the multi-grid run of the current
-			 *                     clevel
-			 * @param[in,out] coarsening_data pointer to information for the coarsening/refinement operations and for the
-			 *                recursive multi-grid run on the coarsened system; if \c nullptr, no coarsening/refinement occurs
-			 *                and only smoothing occurs on the current solution
-			 * @param[in] presmoother_steps number of pre-smoother steps
-			 * @param[in] postsmoother_steps number of post-smoother steps
-			 * @param[in] ring the ring to perform the operations on
-			 * @param[in] minus the \f$ - \f$ operator for vector subtractions
-			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
-			 *                          unsuccessful operation otherwise
-			 */
-			template< typename IOType, typename NonzeroType, class Ring, class Minus >
-			grb::RC multi_grid( system_data< IOType, NonzeroType > & data,
-				struct multi_grid_data< IOType, NonzeroType > * const coarsening_data,
-				const size_t presmoother_steps,
-				const size_t postsmoother_steps,
-				const Ring & ring,
-				const Minus & minus ) {
-				RC ret { SUCCESS };
-#ifdef HPCG_PRINT_STEPS
-				DBG_println( "mg BEGINNING {" );
-#endif
-
-				// clean destination vector
-				ret = ret ? ret : grb::set( data.z, 0 );
-#ifdef HPCG_PRINT_STEPS
-				DBG_print_norm( data.r, "initial r" );
-#endif
-				if( coarsening_data == nullptr ) {
-					// compute one round of Gauss Seidel and return
-					ret = ret ? ret : run_smoother( data, 1, ring );
-					assert( ret == SUCCESS );
-#ifdef HPCG_PRINT_STEPS
-					DBG_print_norm( data.z, "smoothed z" );
-					DBG_println( "} mg END" );
-#endif
-					return ret;
-				}
-
-				struct multi_grid_data< IOType, NonzeroType > & cd {
-					*coarsening_data
-				};
-
-				// pre-smoother
-				ret = ret ? ret : run_smoother( data, presmoother_steps, ring );
-				assert( ret == SUCCESS );
-#ifdef HPCG_PRINT_STEPS
-				DBG_print_norm( data.z, "pre-smoothed z" );
-#endif
-
-				ret = ret ? ret : grb::set( cd.Ax_finer, 0 );
-				ret = ret ? ret : grb::mxv( cd.Ax_finer, data.A, data.z, ring );
-				assert( ret == SUCCESS );
-
-				ret = ret ? ret : compute_coarsening( data.r, cd, ring, minus );
-				assert( ret == SUCCESS );
-#ifdef HPCG_PRINT_STEPS
-				DBG_print_norm( cd.r, "coarse r" );
-#endif
-
-				ret = ret ? ret : multi_grid( cd, cd.coarser_level, presmoother_steps, postsmoother_steps, ring, minus );
-				assert( ret == SUCCESS );
-
-				ret = ret ? ret : compute_prolongation( data.z, cd, ring );
-				assert( ret == SUCCESS );
-#ifdef HPCG_PRINT_STEPS
-				DBG_print_norm( data.z, "prolonged z" );
-#endif
-
-				// post-smoother
-				ret = ret ? ret : run_smoother( data, postsmoother_steps, ring );
-				assert( ret == SUCCESS );
-#ifdef HPCG_PRINT_STEPS
-				DBG_print_norm( data.z, "post-smoothed z" );
-				DBG_println( "} mg END" );
-#endif
-
-				return ret;
-			}
-
-		} // namespace internal
-	}     // namespace algorithms
-} // namespace grb
-
-#endif // _H_GRB_ALGORITHMS_MULTIGRID_V_CYCLE
diff --git a/include/graphblas/algorithms/hpcg/ndim_matrix_builders.hpp b/include/graphblas/algorithms/hpcg/ndim_matrix_builders.hpp
deleted file mode 100644
index c00eb65b2..000000000
--- a/include/graphblas/algorithms/hpcg/ndim_matrix_builders.hpp
+++ /dev/null
@@ -1,596 +0,0 @@
-
-/*
- *   Copyright 2021 Huawei Technologies Co., Ltd.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * @file ndim_matrix_builders.hpp
- * @author Alberto Scolari (alberto.scolari@huawei.com)
- * @brief Utilities to build matrices for an HPCG simulation in a generic number of dimensions
- *
- * In particular, the main matrices are:
- * - a system matrix, generated from an N-dimenional space of coordinates by iterating along
- *   each dimension in priority order, where the first dimension has highest priority and the last
- *   dimension least priority; for each point (row), all its N-dimensional neighbours within
- *   a given distance are generated for the column
- * - a coarsening matrix, generated by iterating on a coarser system of N dimensions (row) and projecting
- *   each point to a corresponding system of finer sizes
- *
- * @date 2021-04-30
- */
-
-#ifndef _H_GRB_ALGORITHMS_NDIM_MATRIX_BUILDERS
-#define _H_GRB_ALGORITHMS_NDIM_MATRIX_BUILDERS
-
-#include <algorithm>
-#include <array>
-#include <cstddef>
-#include <initializer_list>
-#include <numeric>
-#include <stdexcept>
-#include <string>
-#include <type_traits>
-#include <utility>
-#include <vector>
-
-
-namespace grb {
-
-	namespace algorithms {
-
-		/**
-		 * @brief Base class that iterates on DIMS dimensions starting from the first one.
-		 *
-		 * The coordinates are assumed to generate the row number in a matrix whose number of rows is
-		 * the product of all sizes. This class generates row numbers for physical problems described as
-		 * systems of linear equations in an n-dimensional space.
-		 *
-		 * Example of iterations in a 3D (x, y, z) system of size (4,3,2), with generated row numbers
-		 * reported as '=> ROW':
-		 * - z[0]
-		 * - y[0]
-		 * - x[0] => 0, x[1] => 1, x[2] => 2, x[3] => 3
-		 * - y[1]
-		 * - x[0] => 4, x[1] => 5, x[2] => 6, x[3] => 7
-		 * - y[2]
-		 * - x[0] => 8, x[1] => 9, x[2] => 10, x[3] => 11
-		 * - z[1]
-		 * - y[0]
-		 * - x[0] => 12, x[1] => 13, x[2] => 14, x[3] => 15
-		 * - y[1]
-		 * - x[0] => 16, x[1] => 17, x[2] => 18, x[3] => 19
-		 * - y[2]
-		 * - x[0] => 20, x[1] => 21, x[2] => 22, x[3] => 23
-		 *
-		 * The main goal of this class is to be derived by other classes to generate matrices in an
-		 * STL-iterator-fashion; hence, this class contains all the code for basic coordinate-to-row-column
-		 * conversion in \p DIM dimensions and the basic logic to increment the row number.
-		 *
-		 * @tparam DIMS number os dimensions of the system
-		 */
-		template< std::size_t DIMS >
-		struct row_generator {
-
-			using RowIndexType = std::size_t; ///< numeric type of rows
-			using array_t = std::array< RowIndexType,
-				DIMS >; ///< type for the array storing the coordinates.
-
-			const array_t physical_sizes; ///< size of each dimension, starting from the one to be explored first
-
-			/**
-			 * @brief Construct a new row generator object
-			 * @param[in] _sizes array of sizes of each dimension; no dimension should be 0, otherwise an exception
-			 *                   is thrown
-			 * @param[in] first_row first row to iterate from; it is allowed to be beyond the matrix size, e.g. to create
-			 *                      an end iterator (no check occurs)
-			 */
-			row_generator( const array_t & _sizes, RowIndexType first_row ) : physical_sizes( _sizes ) {
-				static_assert( DIMS > 0, "DIMS should be higher than 0" );
-				for( const auto i : _sizes ) {
-					if( i == static_cast< RowIndexType >( 0U ) ) {
-						throw std::invalid_argument( "All dimension sizes must "
-													 "be > 0" );
-					}
-				}
-				row_to_coords( first_row );
-			}
-
-			row_generator( const row_generator & o ) = default;
-
-			row_generator( row_generator && o ) = default;
-
-		protected:
-			// x: row_coords[0], y: row_coords[1], z: row_coords[2], ...
-			array_t row_coords; ///< n-D coordinates from which to compute the row
-
-			/**
-			 * @brief converts a row number into a n-D coordinates according to the sizes in #physical_sizes
-			 *
-			 * In case the input is higher than the nunber of rows, the last coordinate is allowed to
-			 * go beyond its physical size. E.g., if the system has size (4,3,2) and \p rowcol is 24,
-			 * the coordinates are (0,0,3).
-			 *
-			 * @param[in] rowcol row number to convert; it can be any number
-			 */
-			void row_to_coords( RowIndexType rowcol ) {
-				std::size_t s = 1;
-				for( std::size_t i { 0 }; i < row_coords.size() - 1; i++ )
-					s *= physical_sizes[ i ];
-
-				for( typename array_t::size_type i { row_coords.size() - 1 }; i > 0; i-- ) {
-					row_coords[ i ] = rowcol / s;
-					rowcol -= row_coords[ i ] * s;
-					s /= physical_sizes[ i ];
-				}
-				row_coords[ 0 ] = rowcol % physical_sizes[ 0 ];
-			}
-
-			/**
-			 * @brief Pure function converting an array of coordinates into a row number, based on #physical_sizes.
-			 * @param a the #array_t array of coordinates to convert
-			 * @return #RowIndexType the row corresponding to the coordinates in \p a
-			 */
-			RowIndexType coords_to_rowcol( const array_t & a ) const {
-				RowIndexType row { 0 };
-				RowIndexType s { 1 };
-				for( typename array_t::size_type i { 0 }; i < a.size(); i++ ) {
-					row += s * a[ i ];
-					s *= physical_sizes[ i ];
-				}
-				return row;
-			}
-
-			/**
-			 * @brief Increment #row_coords in order to move to the next coordinate (according to the
-			 * n-dimensional iteration order) and update #current_row accordingly.
-			 *
-			 * To be used by derived classes in order to generate the matrix, e.g. via the \c operator()++
-			 * operator prescribed for STL-like iterators.
-			 */
-			void increment_row() {
-				bool rewind;
-				typename array_t::size_type i { 0 };
-				do {
-					typename array_t::value_type & coord = row_coords[ i ];
-					// must rewind dimension if we wrap-around
-					typename array_t::value_type new_coord = ( coord + 1 ) % physical_sizes[ i ];
-					rewind = new_coord < coord;
-					coord = new_coord;
-					++i;
-				} while( rewind && i < row_coords.size() - 1 ); // rewind only the first N-1 coordinates
-
-				// if we still have to rewind, increment the last coordinate, which is unbounded
-				if( rewind ) {
-					row_coords.back()++;
-				}
-			}
-		};
-
-		// ===============================================================
-
-		/**
-		 * @brief STL-like iterable class to generate the values for a matrix by iterating in an n-dimensional
-		 * space along the coordinates.
-		 *
-		 * For each \f$ X=(x0, x1, ...,xn) \f$ point of the underlying (n+1)-dimensional space,
-		 * this class iterates through the points of the n-dimensional halo of radius \p halo around \f$ X \f$,
-		 * generating the row number corresponding to \f$ X \f$ and the column number corresponding to
-		 * each halo point. At each coordinate \code (row, col) \endcode generated this way, the corresponding matrix value
-		 * being generated depends on whether \code row == col \endcode.
-		 *
-		 * @tparam DIMS number of dimensions of the system
-		 * @tparam HALO halo size, determining the number of points to iterate around and thus the column coordinates
-		 * @tparam T type of matrix values
-		 */
-		template< std::size_t DIMS, typename T = double >
-		struct matrix_generator_iterator : public row_generator< DIMS > {
-
-			using RowIndexType = typename row_generator< DIMS >::RowIndexType;
-			using ColumnIndexType = typename row_generator< DIMS >::RowIndexType;
-			using ValueType = T;
-			using array_t = typename row_generator< DIMS >::array_t;
-			using value_type = std::pair< std::pair< RowIndexType, ColumnIndexType >, T >;
-
-			// halo may in future become a DIM-size array to iterate in arbitrary shapes
-			const RowIndexType halo;              ///< number of points per dimension to iterate around
-			const ValueType diagonal_value;     ///< value to be emitted when the object has moved to the diagonal
-			const ValueType non_diagonal_value; ///< value to emit outside of the diagonal
-
-			/**
-			 * @brief Construct a new \c matrix_generator_iterator object, setting the current row as \p row
-			 * and emitting \p diag if the iterator has moved on the diagonal, \p non_diag otherwise.
-			 *
-			 * @param sizes array with the sizes along the dimensions
-			 * @param row current row to initialize the matrix on
-			 * @param _halo halo of points to iterate around; must be > 0
-			 * @param diag value to emit when on the diagonal
-			 * @param non_diag value to emit outside the diagonal
-			 */
-			matrix_generator_iterator( const array_t & sizes, RowIndexType row, RowIndexType _halo, ValueType diag, ValueType non_diag ) :
-				row_generator< DIMS >( sizes, row ), halo( _halo ), diagonal_value( diag ), non_diagonal_value( non_diag ) {
-				if( halo <= 0 ) {
-					throw std::invalid_argument( "halo should be higher than 0" );
-				}
-				for( const auto i : sizes ) {
-					if( i < static_cast< RowIndexType >( 2 * halo + 1 ) ) {
-						throw std::invalid_argument( "Iteration halo goes beyond system sizes" );
-					}
-				}
-				current_values.first.first = row;
-				update_column_max_values();
-				reset_all_columns();
-				current_values.first.second = this->coords_to_rowcol( col_coords );
-				current_values.second = v();
-			}
-
-			matrix_generator_iterator( const matrix_generator_iterator & o ) = default;
-
-			matrix_generator_iterator( matrix_generator_iterator && o ) = default;
-
-			/**
-			 * @brief Increments the iterator by moving coordinates to the next (row, column) to iterate on.
-			 *
-			 * This operator internally increments the columns coordinates until wrap-around, when it increments
-			 * the row coordinates and resets the column coordinates to the first possible columns; this column coordinate
-			 * depends on the row coordinates according to the dimensions iteration order and on the parameter \p halo.
-			 *
-			 * @return matrix_generator_iterator<DIMS, T>& \c this object, with the updated state
-			 */
-			matrix_generator_iterator< DIMS, T > & operator++() {
-				bool must_rewind = increment_column();
-				if( must_rewind ) {
-					this->increment_row();
-					// after changing row, we must find the first non-zero column
-					reset_all_columns();
-					current_values.first.first = this->coords_to_rowcol( this->row_coords );
-					update_column_max_values();
-				}
-				// trigger column update after row update, as a row update
-				// triggers a column update
-				current_values.first.second = this->coords_to_rowcol( col_coords );
-				current_values.second = this->v();
-				return *this;
-			}
-
-			/**
-			 * @brief Operator to compare \c this against \p o  and return whether they differ.
-			 *
-			 * @param o object to compare \c this against
-			 * @return true of the row or the column is different between \p o and \c this
-			 * @return false if both row and column of \p o and \c this are equal
-			 */
-			bool operator!=( const matrix_generator_iterator< DIMS, T > & o ) const {
-				if( o.i() != this->i() ) {
-					return true;
-				}
-				return o.j() != this->j();
-			}
-
-			/**
-			 * @brief Operator to compare \c this against \p o  and return whether they are equal.
-			 *
-			 * @param o object to compare \c this against
-			 * @return true of the row or the column is different between \p o and \c this
-			 * @return false if both row and column of \p o and \c this are equal
-			 */
-			bool operator==( const matrix_generator_iterator< DIMS, T > & o ) const {
-				return o.i() == this->i() && o.j() == this->j();
-			}
-
-			/**
-			 * @brief Operator returning the triple to directly access row, column and element values.
-			 *
-			 * Useful when building the matrix by copying the triple of coordinates and value,
-			 * like for the BSP1D backend.
-			 */
-			const value_type & operator*() const {
-				return current_values;
-			}
-
-			/**
-			 * @brief Returns current row.
-			 */
-			inline RowIndexType i() const {
-				return current_values.first.first;
-			}
-
-			/**
-			 * @brief Returns current column.
-			 */
-			inline ColumnIndexType j() const {
-				return current_values.first.second;
-			}
-
-			/**
-			 * @brief Returns the current matrix value.
-			 *
-			 * @return ValueType #diagonal_value if \code row == column \endcode (i.e. if \code this-> \endcode
-			 * #i() \code == \endcode \code this-> \endcode #j()), #non_diagonal_value otherwise
-			 */
-			inline ValueType v() const {
-				return j() == i() ? diagonal_value : non_diagonal_value;
-			}
-
-		private:
-			// offsets w.r.t. rows
-			array_t col_coords;        ///< coordinates corresponding to current column
-			array_t column_max_values; ///< maximum values for the column coordinates, to stop column increment
-			//// and reset the column coordinates
-			value_type current_values; ///< triple storing the current value for row, column and matrix element
-
-			/**
-			 * @brief Updates the maximum values each column coordinate can reach, according to the row coordinates.
-			 *
-			 * To be called after each row coordinates update.
-			 */
-			void update_column_max_values() {
-				for( std::size_t i { 0 }; i < column_max_values.size(); i++ ) {
-					column_max_values[ i ] = std::min( this->physical_sizes[ i ] - 1, this->row_coords[ i ] + halo );
-				}
-			}
-
-			/**
-			 * @brief Resets the value of column dimension \p dim to the first possible value.
-			 *
-			 * The final value of #col_coords[dim] depends on the current row (#row_coords) and on the \p halo
-			 * and is \f$ max(0, \f$ #row_coords \f$[dim])\f$.
-			 *
-			 * @param dim the dimension to reset
-			 */
-			void reset_column_coords( std::size_t dim ) {
-				// cannot use std::max because row_coords is unsigned and can wrap-around
-				col_coords[ dim ] = this->row_coords[ dim ] <= halo ? 0 : ( this->row_coords[ dim ] - halo );
-			}
-
-			/**
-			 * @brief resets all values in #col_coords to the initial coordinates,
-			 * iterating from on the current row.
-			 */
-			void reset_all_columns() {
-				for( std::size_t i { 0 }; i < col_coords.size(); i++ ) {
-					reset_column_coords( i );
-				}
-			}
-
-			/**
-			 * @brief Increment the column according to the iteration order, thus resetting the column coordinates
-			 * when the last possible column value for the current row has been reached.
-			 *
-			 * @return true if the column coordinates have been reset, and thus also the row must be incremented
-			 * @return false if the column coordinates
-			 */
-			bool increment_column() {
-				bool rewind;
-				typename array_t::size_type i { 0 };
-				do {
-					typename array_t::value_type & col = col_coords[ i ];
-					// must rewind dimension if the column offset is already at the max value
-					// or if the column coordinates are already at the max value
-					rewind = ( col == column_max_values[ i ] );
-					if( rewind ) {
-						// col = this->row_coords[i] == 0 ? 0 : this->row_coords[i] - (halo);
-						reset_column_coords( i );
-					} else {
-						++col;
-					}
-					++i;
-				} while( rewind && i < col_coords.size() );
-
-				// if we change z, then we also must reset x and y; if only y, we must reset x, and so on
-				return rewind;
-			}
-		};
-
-		// ===============================================================
-
-		/**
-		 * @brief Class to generate the coarsening matrix of an underlying \p DIMS -dimensional system.
-		 *
-		 * This class coarsens a finer system to a coarser system by projecting each input value (column),
-		 * espressed in finer coordinates, to an output (row) value espressed in coarser coordinates.
-		 * The coarser sizes are assumed to be row_generator#physical_sizes, while the finer sizes are here
-		 * stored inside #finer_sizes.
-		 *
-		 * The corresponding refinement matrix is obtained by transposing the coarsening matrix.
-		 *
-		 * @tparam DIMS number of dimensions of the system
-		 * @tparam T type of matrix values
-		 */
-		template< std::size_t DIMS, typename T = double >
-		struct coarsener_generator_iterator : public row_generator< DIMS > {
-
-			using RowIndexType = typename row_generator< DIMS >::RowIndexType;
-			using ColumnIndexType = typename row_generator< DIMS >::RowIndexType;
-			using ValueType = T;
-			using array_t = typename row_generator< DIMS >::array_t;
-			using value_type = std::pair< std::pair< RowIndexType, ColumnIndexType >, T >;
-
-			// the sizes to project from
-			const array_t finer_sizes; ///< the size of the finer system (columns)
-			array_t steps;             ///< array of steps, i.e. how much each column coordinate (finer system) must be
-			//// incremented when incrementing the row coordinates; is is the ration between
-			//// #finer_sizes and row_generator#physical_sizes
-
-			/**
-			 * @brief Construct a new \c coarsener_generator_iterator object from the coarser and finer sizes,
-			 * setting its row at \p _current_row and the column at the corresponding value.
-			 *
-			 * Each finer size <b>must be an exact multiple of the corresponding coarser size</b>, otherwise the
-			 * construction will throw an exception.
-			 *
-			 * @param _coarser_sizes sizes of the coarser system (rows)
-			 * @param _finer_sizes sizes of the finer system (columns)
-			 * @param _current_row row (in the coarser system) to set the iterator on
-			 */
-			coarsener_generator_iterator( const array_t & _coarser_sizes, const array_t & _finer_sizes, RowIndexType _current_row ) :
-				row_generator< DIMS >( _coarser_sizes, _current_row ), finer_sizes( _finer_sizes ), steps( { 0 } ) {
-				for( std::size_t i { 0 }; i < DIMS; i++ ) {
-					// finer size MUST be an exact multiple of coarser_size
-					typename array_t::value_type step { _finer_sizes[ i ] / _coarser_sizes[ i ] };
-					if( step == 0 || finer_sizes[ i ] / step != this->physical_sizes[ i ] ) {
-						throw std::invalid_argument( std::string( "finer size "
-																  "of "
-																  "dimension"
-																  " " ) +
-							std::to_string( i ) +
-							std::string( "is not an exact multiple of coarser "
-										 "size" ) );
-					}
-					steps[ i ] = step;
-				}
-				current_values.first.first = _current_row;
-				current_values.first.second = coords_to_finer_col();
-				current_values.second = v();
-			}
-
-			coarsener_generator_iterator( const coarsener_generator_iterator & o ) = default;
-
-			coarsener_generator_iterator( coarsener_generator_iterator && o ) = default;
-
-			/**
-			 * @brief Increments the row and the column according to the respective physical sizes,
-			 * thus iterating onto the coarsening matrix coordinates.
-			 *
-			 * @return \code *this \endcode, i.e. the same object with the updates row and column
-			 */
-			coarsener_generator_iterator< DIMS, T > & operator++() {
-				this->increment_row();
-				current_values.first.first = this->coords_to_rowcol( this->row_coords );
-				current_values.first.second = coords_to_finer_col();
-				current_values.second = v();
-				return *this;
-			}
-
-			/**
-			 * @brief Returns whether \c this and \p o differ.
-			 */
-			bool operator!=( const coarsener_generator_iterator< DIMS, T > & o ) const {
-				if( this->i() != o.i() ) {
-					return true;
-				}
-				return this->j() != o.j();
-			}
-
-			/**
-			 * @brief Returns whether \c this and \p o are equal.
-			 */
-			bool operator==( const coarsener_generator_iterator< DIMS, T > & o ) const {
-				return this->i() == o.i() && this->j() == o.j();
-			}
-
-			/**
-			 * @brief Operator returning the triple to directly access row, column and element values.
-			 *
-			 * Useful when building the matrix by copying the triple of coordinates and value,
-			 * like for the BSP1D backend.
-			 */
-			const value_type & operator*() const {
-				return current_values;
-			}
-
-			/**
-			 * @brief Returns the current row, according to the coarser system.
-			 */
-			inline RowIndexType i() const {
-				return current_values.first.first;
-			}
-
-			/**
-			 * @brief Returns the current column, according to the finer system.
-			 */
-			inline ColumnIndexType j() const {
-				return current_values.first.second;
-			}
-
-			/**
-			 * @brief Returns always 1, as the coarsening keeps the same value.
-			 */
-			inline ValueType v() const {
-				return static_cast< ValueType >( 1 );
-			}
-
-		private:
-			value_type current_values; ///< triple storing the current value for row, column and matrix element
-
-			/**
-			 * @brief Returns the row coordinates converted to the finer system, to compute
-			 * the column value.
-			 */
-			ColumnIndexType coords_to_finer_col() const {
-				ColumnIndexType row { 0 };
-				ColumnIndexType s { 1 };
-				for( typename array_t::size_type i { 0 }; i < this->row_coords.size(); i++ ) {
-					s *= steps[ i ];
-					row += s * this->row_coords[ i ];
-					s *= this->physical_sizes[ i ];
-				}
-				return row;
-			}
-		};
-
-	} // end namespace algorithms
-
-} // end namespace grb
-
-namespace std {
-
-	/**
-	 * Specialises the standard STL iterator traits for
-	 * #grb::algorithms::matrix_generator_iterator
-	 */
-	template< size_t DIMS, typename T >
-	class iterator_traits<
-		grb::algorithms::matrix_generator_iterator< DIMS, T >
-	> {
-
-		private:
-
-			typedef grb::algorithms::matrix_generator_iterator< DIMS, T > SelfType;
-
-
-		public:
-
-			typedef typename SelfType::ValueType value_type;
-			typedef const value_type * pointer;
-			typedef const value_type & reference;
-			typedef size_t difference_type;
-			typedef forward_iterator_tag iterator_category;
-
-	};
-
-	template< size_t DIMS, typename T >
-	class iterator_traits<
-		grb::algorithms::coarsener_generator_iterator< DIMS, T >
-	> {
-
-		private:
-
-			typedef grb::algorithms::coarsener_generator_iterator< DIMS, T > SelfType;
-
-
-		public:
-
-			typedef typename SelfType::ValueType value_type;
-			typedef const value_type * pointer;
-			typedef const value_type & reference;
-			typedef size_t difference_type;
-			typedef forward_iterator_tag iterator_category;
-
-	};
-
-} // end namespace std
-
-#endif // _H_GRB_ALGORITHMS_NDIM_MATRIX_BUILDERS
-
diff --git a/include/graphblas/algorithms/hpcg/red_black_gauss_seidel.hpp b/include/graphblas/algorithms/hpcg/red_black_gauss_seidel.hpp
deleted file mode 100644
index 718e5015c..000000000
--- a/include/graphblas/algorithms/hpcg/red_black_gauss_seidel.hpp
+++ /dev/null
@@ -1,129 +0,0 @@
-
-/*
- *   Copyright 2021 Huawei Technologies Co., Ltd.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * @file red_black_gauss_seidel.hpp
- * @author Alberto Scolari (alberto.scolari@huawei.com)
- * @brief Contains the routines to perform a forward-backward pass of a Red-Black Gauss-Seidel smoother.
- * @date 2021-04-30
- */
-
-#ifndef _H_GRB_ALGORITHMS_RED_BLACK_GAUSS_SEIDEL
-#define _H_GRB_ALGORITHMS_RED_BLACK_GAUSS_SEIDEL
-
-#include <cassert>
-
-#include <graphblas.hpp>
-
-
-namespace grb {
-	namespace algorithms {
-		namespace internal {
-
-			/**
-			 * @brief Runs a single step of Red-Black Gauss-Seidel for a specific color.
-			 *
-			 * @tparam IOType type of result and intermediate vectors used during computation
-			 * @tparam NonzeroType type of matrix values
-			 * @tparam Ring the ring of algebraic operators zero-values
-			 *
-			 * @param[in] A the system matrix
-			 * @param[in] A_diagonal a vector storing the diagonal elements of \p A
-			 * @param[in] r the residual
-			 * @param[in,out] x the initial solution to start from, and where the smoothed solution is stored to
-			 * @param[out] smoother_temp a vector for temporary values
-			 * @param[in] color_mask the mask of colors to filter the rows to smooth
-			 * @param[in] ring the ring to perform the operations on
-			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
-			 *                          unsuccessful operation otherwise
-			 */
-			template< typename IOType, typename NonzeroType, class Ring >
-			grb::RC __rbgs_single_step( const grb::Matrix< NonzeroType > & A,
-				const grb::Vector< IOType > & A_diagonal,
-				const grb::Vector< IOType > & r,
-				grb::Vector< IOType > & x,
-				grb::Vector< IOType > & smoother_temp,
-				const grb::Vector< bool > & color_mask,
-				const Ring & ring ) {
-				RC ret { SUCCESS };
-				ret = ret ? ret : grb::set( smoother_temp, 0 );
-
-				// acc_temp[mask] = A[mask] * x[mask]
-				ret = ret ? ret : grb::mxv< grb::descriptors::safe_overlap >( smoother_temp, color_mask, A, x, ring );
-				assert( ret == SUCCESS );
-
-				// TODO internal issue #201
-				// Replace below with masked calls:
-				// x[mask] = r[mask] - smoother_temp[mask] + x[mask] .* diagonal[mask]
-				// x[mask] = x[maks] ./ diagonal[mask]
-				ret = ret ? ret :
-                            grb::eWiseLambda(
-								[ &x, &r, &smoother_temp, &color_mask, &A_diagonal ]( const size_t i ) {
-									// if the mask was properly initialized, the check on the mask value is unnecessary;
-					                // nonetheless, it is left not to violate the semantics of RBGS in case also the false values
-					                // had been initialized (in which case the check is fundamental); if only true values were initialized,
-					                // we expect CPU branch prediction to neutralize the branch cost
-									if( color_mask[ i ] ) {
-										IOType d = A_diagonal[ i ];
-										IOType v = r[ i ] - smoother_temp[ i ] + x[ i ] * d;
-										x[ i ] = v / d;
-									}
-								},
-								color_mask, x, r, smoother_temp, A_diagonal );
-				assert( ret == SUCCESS );
-				return ret;
-			}
-
-			/**
-			 * @brief Runs a single forward and backward pass of Red-Black Gauss-Seidel smoothing on the system stored in \p data.
-			 *
-			 * This routine performs a forward and a backward step of Red-Black Gauss-Seidel for each color stored in \p data.color_masks.
-			 * Color stored inside this container <b>are assumed to be mutually exclusive and to cover all rows of the solution vector<\b>,
-			 * and no check is performed to ensure these assumptions hold. Hence, it is up to user logic to generate and pass correct
-			 * coloring information. Otherwise, \b no guarantees hold on the result.
-			 *
-			 * @tparam IOType type of result and intermediate vectors used during computation
-			 * @tparam NonzeroType type of matrix values
-			 * @tparam Ring the ring of algebraic operators zero-values
-			 *
-			 * @param data \ref system_data data structure with relevant inpus and outputs: system matrix, initial solution,
-			 *             residual, system matrix colors, temporary vectors
-			 * @param[in] ring the ring to perform the operations on
-			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
-			 *                          unsuccessful operation otherwise
-			 */
-			template< typename IOType, typename NonzeroType, class Ring >
-			grb::RC red_black_gauss_seidel( system_data< IOType, NonzeroType > & data, const Ring & ring ) {
-				RC ret { SUCCESS };
-				// forward step
-				std::vector< grb::Vector< bool > >::const_iterator end { data.color_masks.cend() };
-				for( std::vector< grb::Vector< bool > >::const_iterator it { data.color_masks.cbegin() }; it != end && ret == SUCCESS; ++it ) {
-					ret = ret ? ret : __rbgs_single_step( data.A, data.A_diagonal, data.r, data.z, data.smoother_temp, *it, ring );
-				}
-				// backward step
-				std::vector< grb::Vector< bool > >::const_reverse_iterator rend { data.color_masks.crend() };
-				for( std::vector< grb::Vector< bool > >::const_reverse_iterator rit { data.color_masks.crbegin() }; rit != rend && ret == SUCCESS; ++rit ) {
-					ret = ret ? ret : __rbgs_single_step( data.A, data.A_diagonal, data.r, data.z, data.smoother_temp, *rit, ring );
-				}
-				return ret;
-			}
-
-		} // namespace internal
-	}     // namespace algorithms
-} // namespace grb
-
-#endif // H_GRB_ALGORITHMS_RED_BLACK_GAUSS_SEIDEL
diff --git a/include/graphblas/algorithms/hpcg/single_point_coarsener.hpp b/include/graphblas/algorithms/hpcg/single_point_coarsener.hpp
new file mode 100644
index 000000000..e412a630c
--- /dev/null
+++ b/include/graphblas/algorithms/hpcg/single_point_coarsener.hpp
@@ -0,0 +1,324 @@
+
+/*
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file single_point_coarsener.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * Utilities to build the coarsening matrix for an HPCG simulation.
+ */
+
+#ifndef _H_GRB_ALGORITHMS_HPCG_SINGLE_POINT_COARSENER
+#define _H_GRB_ALGORITHMS_HPCG_SINGLE_POINT_COARSENER
+
+#include <array>
+#include <cmath>
+#include <cstddef>
+#include <iterator>
+#include <stdexcept>
+
+#include <graphblas/utils/multigrid/array_vector_storage.hpp>
+#include <graphblas/utils/multigrid/linearized_ndim_system.hpp>
+
+namespace grb {
+	namespace algorithms {
+
+		// forward declaration
+		template<
+			size_t DIMS,
+			typename CoordType,
+			typename ValueType
+		> class SinglePointCoarsenerBuilder;
+
+		/**
+		 * Iterator class to generate the coarsening matrix for an HPCG simulation.
+		 *
+		 * The coarsening matrix samples a single value from the finer space for every element
+		 * of the coarser space; this value is the first one (i.e. the one with smallest coordinates)
+		 * in the finer sub-space corresponding to each coarser element.
+		 *
+		 * This coarsening method is simple but can lead to unstable results, especially with certain combinations
+		 * of smoothers and partitioning methods.
+		 *
+		 * This iterator is random-access.
+		 *
+		 * @tparam DIMS number of dimensions
+		 * @tparam CoordType type storing the coordinates and the sizes
+		 * @tparam ValueType type of the nonzero: it must be able to represent 1 (the value to sample
+		 *  the finer value)
+		 */
+		template<
+			size_t DIMS,
+			typename CoordType,
+			typename ValueType
+		> struct SinglePointCoarsenerIterator {
+
+			friend SinglePointCoarsenerBuilder< DIMS, CoordType, ValueType >;
+
+			using RowIndexType = CoordType; ///< numeric type of rows
+			using ColumnIndexType = CoordType; ///< numeric type of columns
+			using LinearSystemType = grb::utils::multigrid::LinearizedNDimSystem< CoordType, grb::utils::multigrid::ArrayVectorStorage< DIMS, CoordType > >;
+			using LinearSystemIterType = typename LinearSystemType::Iterator;
+			using SelfType = SinglePointCoarsenerIterator< DIMS, CoordType, ValueType >;
+			using ArrayType = std::array< CoordType, DIMS >;
+
+			struct _HPCGValueGenerator { // (row, column) pair of the current nonzero; its value is always 1
+
+				friend SelfType;
+
+				_HPCGValueGenerator(
+					RowIndexType i,
+					ColumnIndexType j
+				) noexcept :
+					_i( i ),
+					_j( j ) {}
+
+				_HPCGValueGenerator( const _HPCGValueGenerator & ) = default;
+
+				_HPCGValueGenerator & operator=( const _HPCGValueGenerator & ) = default;
+
+				inline RowIndexType i() const { // row of the nonzero
+					return _i;
+				}
+				inline ColumnIndexType j() const { // column of the nonzero
+					return _j;
+				}
+				inline ValueType v() const { // value of the nonzero: always 1
+					return static_cast< ValueType >( 1 );
+				}
+
+			private:
+				RowIndexType _i;
+				ColumnIndexType _j;
+			};
+
+			// interface for std::random_access_iterator
+			using iterator_category = std::random_access_iterator_tag;
+			using value_type = _HPCGValueGenerator;
+			using pointer = const value_type *; // must be a pointer type: operator->() returns &_val
+			using reference = const value_type &;
+			using difference_type = typename LinearSystemIterType::difference_type;
+
+			SinglePointCoarsenerIterator( const SelfType & o ) = default;
+
+			SinglePointCoarsenerIterator( SelfType && o ) = default;
+
+			SelfType & operator=( const SelfType & ) = default;
+
+			SelfType & operator=( SelfType && ) = default;
+
+			/**
+			 * Advances \c this by 1 in constant time.
+			 */
+			SelfType & operator++() noexcept {
+				(void)++_sys_iter;
+				update_coords();
+				return *this;
+			}
+
+			/**
+			 * Advances \c this by \p offset in constant time.
+			 */
+			SelfType & operator+=( size_t offset ) {
+				_sys_iter += offset;
+				update_coords();
+				return *this;
+			}
+
+			/**
+			 * Computes the difference between \c this and \p o as integer.
+			 */
+			difference_type operator-( const SelfType & o ) const {
+				return this->_sys_iter - o._sys_iter;
+			}
+
+			/**
+			 * Returns whether \c this and \p o differ.
+			 */
+			bool operator!=( const SelfType & o ) const {
+				return this->_sys_iter != o._sys_iter;
+			}
+
+			/**
+			 * Returns whether \c this and \p o are equal.
+			 */
+			bool operator==( const SelfType & o ) const {
+				return ! this->operator!=( o );
+			}
+
+			reference operator*() const { // the current nonzero, stored inside the iterator
+				return _val;
+			}
+
+			pointer operator->() const { // address of the current nonzero, stored inside the iterator
+				return &_val;
+			}
+
+			/**
+			 * Returns the current row, within the coarser system.
+			 */
+			inline RowIndexType i() const {
+				return _val.i();
+			}
+
+			/**
+			 * Returns the current column, within the finer system.
+			 */
+			inline ColumnIndexType j() const {
+				return _val.j();
+			}
+
+			/**
+			 * Returns always 1, as the coarsening keeps the same value.
+			 */
+			inline ValueType v() const {
+				return _val.v();
+			}
+
+		private:
+			const LinearSystemType * _lin_sys; // coarser system, not owned
+			const ArrayType * _steps; // per-dimension finer/coarser ratios, not owned
+			LinearSystemIterType _sys_iter; // position within the coarser system
+			value_type _val; // nonzero for the current position, refreshed by update_coords()
+
+			/**
+			 * Construct a new SinglePointCoarsenerIterator object starting from the LinearizedNDimSystem
+			 * object \p system describing the \b coarser system and the \b ratios \p steps between each finer and
+			 * the corresponding coarser dimension; both are stored by address and must outlive the iterator.
+			 *
+			 * @param system LinearizedNDimSystem object describing the coarser system
+			 * @param steps ratios per dimension between finer and coarser system
+			 */
+			SinglePointCoarsenerIterator(
+				const LinearSystemType & system,
+				const ArrayType & steps
+			) noexcept :
+				_lin_sys( &system ),
+				_steps( &steps ),
+				_sys_iter( _lin_sys->begin() ),
+				_val( 0, 0 )
+			{
+				update_coords();
+			}
+
+			void update_coords() noexcept { // recompute row and column from the current position
+				_val._i = _sys_iter->get_linear_position();
+				_val._j = coarse_rows_to_finer_col();
+			}
+
+			/**
+			 * Returns the row coordinates converted to the finer system, to compute
+			 * the column value.
+			 */
+			ColumnIndexType coarse_rows_to_finer_col() const noexcept {
+				ColumnIndexType finer = 0;
+				ColumnIndexType s = 1;
+				for( size_t i = 0; i < DIMS; i++ ) {
+					s *= ( *_steps )[ i ]; // s: stride in the finer system of one coarse step along dimension i
+					finer += s * _sys_iter->get_position()[ i ];
+					s *= _lin_sys->get_sizes()[ i ]; // s: product of steps[k] * sizes[k] for k <= i
+				}
+				return finer;
+			}
+		};
+
+		/**
+		 * Builder object to create iterators that generate a coarsening matrix.
+		 *
+		 * It is a facility to generate beginning and end iterators and abstract the logic away from users.
+		 *
+		 * @tparam DIMS number of dimensions
+		 * @tparam CoordType type storing the coordinates and the sizes
+		 * @tparam ValueType type of the nonzero: it must be able to represent 1 (the value to sample
+		 *  the finer value)
+		 */
+		template<
+			size_t DIMS,
+			typename CoordType,
+			typename ValueType
+		> class SinglePointCoarsenerBuilder {
+		public:
+			using ArrayType = std::array< CoordType, DIMS >;
+			using Iterator = SinglePointCoarsenerIterator< DIMS, CoordType, ValueType >;
+			using SelfType = SinglePointCoarsenerBuilder< DIMS, CoordType, ValueType >;
+
+			/**
+			 * Construct a new SinglePointCoarsenerBuilder object from the sizes of finer system
+			 * and those of the coarser system; each finer size must be an exact multiple of the coarser
+			 * size by a factor of at least 2, otherwise std::invalid_argument is thrown.
+			 */
+			SinglePointCoarsenerBuilder(
+				const ArrayType & _finer_sizes,
+				const ArrayType & _coarser_sizes
+			) :
+				system( _coarser_sizes.begin(),
+				_coarser_sizes.end() )
+			{
+				for( size_t i = 0; i < DIMS; i++ ) {
+					// finer size MUST be an exact multiple (by a factor >= 2) of coarser_size
+					std::ldiv_t ratio = std::ldiv( _finer_sizes[ i ], _coarser_sizes[ i ] );
+					if( ratio.quot < 2 || ratio.rem != 0 ) {
+						throw std::invalid_argument( std::string( "finer size of dimension " ) + std::to_string( i ) + std::string( " is not an exact multiple (by a factor of at least 2) of coarser size" ) );
+					}
+					steps[ i ] = ratio.quot;
+				}
+			}
+
+			SinglePointCoarsenerBuilder( const SelfType & ) = delete; // iterators keep the address of members
+
+			SinglePointCoarsenerBuilder( SelfType && ) = delete;
+
+			SelfType & operator=( const SelfType & ) = delete;
+
+			SelfType & operator=( SelfType && ) = delete;
+
+			/**
+			 * Returns the size of the coarser system (the one built from the coarser sizes), i.e. its number of elements.
+			 */
+			size_t system_size() const {
+				return system.system_size();
+			}
+
+			/**
+			 * Produces a beginning iterator to generate the coarsening matrix.
+			 */
+			Iterator make_begin_iterator() {
+				return Iterator( system, steps );
+			}
+
+			/**
+			 * Produces an end iterator to stop the generation of the coarsening matrix.
+			 */
+			Iterator make_end_iterator() {
+				Iterator result( system, steps );
+				result += system_size(); // do not trigger boundary checks
+				return result;
+			}
+
+		private:
+			const grb::utils::multigrid::LinearizedNDimSystem< CoordType,
+				grb::utils::multigrid::ArrayVectorStorage< DIMS, CoordType > > system;
+
+			///
+			/// array of steps, i.e. how much each column coordinate (finer system) must be
+			/// incremented when incrementing the row coordinates; it is the per-dimension ratio
+			/// between the finer sizes and the coarser sizes passed to the constructor
+			ArrayType steps;
+		};
+
+	} // namespace algorithms
+} // namespace grb
+#endif // _H_GRB_ALGORITHMS_HPCG_SINGLE_POINT_COARSENER
diff --git a/include/graphblas/algorithms/hpcg/system_builder.hpp b/include/graphblas/algorithms/hpcg/system_builder.hpp
new file mode 100644
index 000000000..84600414c
--- /dev/null
+++ b/include/graphblas/algorithms/hpcg/system_builder.hpp
@@ -0,0 +1,181 @@
+
+/*
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @dir include/graphblas/algorithms/hpcg
+ * This folder contains the code specific to the HPCG benchmark implementation: generation of the physical system,
+ * generation of the single point coarsener and coloring algorithm.
+ */
+
+/**
+ * @file system_builder.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * Utilities to build the system matrix for an HPCG simulation in a generic number of dimensions.
+ */
+
+#ifndef _H_GRB_ALGORITHMS_HPCG_SYSTEM_BUILDER
+#define _H_GRB_ALGORITHMS_HPCG_SYSTEM_BUILDER
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <initializer_list>
+#include <iterator>
+#include <numeric>
+#include <stdexcept>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include <graphblas/utils/multigrid/halo_matrix_generator_iterator.hpp>
+
+namespace grb {
+	namespace algorithms {
+
+		/**
+		 * Builder class to build the iterators that generate an HPCG system matrix, describing a
+		 * \p DIMS -dimensional simulation mesh for Fourier-like heat propagation.
+		 *
+		 * @tparam DIMS dimensions of the mesh
+		 * @tparam CoordType type storing the coordinates and sizes of the matrix
+		 * @tparam ValueType nonzero type
+		 */
+		template<
+			size_t DIMS,
+			typename CoordType,
+			typename ValueType
+		> class HPCGSystemBuilder {
+		public:
+			struct HPCGDiagGenerator { // functor mapping matrix coordinates (i, j) to a nonzero value
+
+				HPCGDiagGenerator(
+					ValueType diag,
+					ValueType non_diag
+				) noexcept :
+					_diag( diag ),
+					_non_diag( non_diag ) {}
+
+				HPCGDiagGenerator & operator=( const HPCGDiagGenerator & ) = default;
+
+				inline ValueType operator()(
+					const CoordType & i,
+					const CoordType & j
+				) const noexcept {
+					return j == i ? _diag : _non_diag; // diagonal entries get _diag, all others _non_diag
+				}
+
+				ValueType _diag; ///< value returned when row and column coincide
+				ValueType _non_diag; ///< value returned for any other coordinate pair
+			};
+
+			using HaloSystemType = grb::utils::multigrid::LinearizedHaloNDimSystem< DIMS, CoordType >;
+			using Iterator = grb::utils::multigrid::HaloMatrixGeneratorIterator< DIMS,
+				CoordType, ValueType, HPCGDiagGenerator >;
+
+			/**
+			 * Construct a new HPCGSystemBuilder object from the data of the physical system.
+			 *
+			 * @param sizes sizes along each dimension; each must be at least \p halo + 1
+			 * @param halo halo size; must be higher than 0
+			 * @param diag value along the diagonal, for self-interactions
+			 * @param non_diag value outside the diagonal, for element-element interaction
+			 */
+			HPCGSystemBuilder(
+				const std::array< CoordType, DIMS > & sizes,
+				CoordType halo,
+				ValueType diag,
+				ValueType non_diag
+			) :
+				_system( sizes, halo ),
+				_diag_generator( diag, non_diag )
+			{
+				if( halo <= 0 ) { // checked after the members above have been constructed
+					throw std::invalid_argument( "halo should be higher than 0" );
+				}
+				for( const auto i : sizes ) {
+					if( i < halo + 1 ) { // every dimension must be larger than the halo
+						throw std::invalid_argument( "Iteration halo goes beyond system sizes" );
+					}
+				}
+			}
+
+			HPCGSystemBuilder( const HPCGSystemBuilder< DIMS, CoordType, ValueType > & ) = default;
+
+			HPCGSystemBuilder( HPCGSystemBuilder< DIMS, CoordType, ValueType > && ) = default;
+
+			HPCGSystemBuilder< DIMS, CoordType, ValueType > & operator=(
+				const HPCGSystemBuilder< DIMS, CoordType, ValueType > & ) = default;
+
+			HPCGSystemBuilder< DIMS, CoordType, ValueType > & operator=(
+				HPCGSystemBuilder< DIMS, CoordType, ValueType > && ) = default;
+
+			/**
+			 * Number of elements of the mesh.
+			 */
+			size_t system_size() const {
+				return _system.base_system_size();
+			}
+
+			/**
+			 * Total number of neighbors for all elements of the mesh.
+			 */
+			size_t num_neighbors() const {
+				return _system.halo_system_size();
+			}
+
+			/**
+			 * Get the generator object, i.e. the HaloSystemType object that describes the geometry
+			 * of the simulation mesh.
+			 */
+			const HaloSystemType & get_generator() const {
+				return _system;
+			}
+
+			/**
+			 * Builds the beginning iterator to generate the system matrix.
+			 */
+			Iterator make_begin_iterator() const {
+				return Iterator( _system, _diag_generator );
+			}
+
+			/**
+			 * Builds the end iterator to generate the system matrix.
+			 */
+			Iterator make_end_iterator() const {
+				Iterator result( _system, _diag_generator );
+				result += num_neighbors(); // a begin iterator advanced by num_neighbors() positions
+				return result;
+			}
+
+			ValueType get_diag_value() const { // value given as diag at construction
+				return _diag_generator._diag;
+			}
+
+			ValueType get_non_diag_value() const { // value given as non_diag at construction
+				return _diag_generator._non_diag;
+			}
+
+		private:
+			HaloSystemType _system; // geometry of the mesh, built from sizes and halo
+			HPCGDiagGenerator _diag_generator; // value generator handed to every Iterator
+		};
+
+	} // namespace algorithms
+} // namespace grb
+
+#endif // _H_GRB_ALGORITHMS_HPCG_SYSTEM_BUILDER
diff --git a/include/graphblas/algorithms/hpcg/system_building_utils.hpp b/include/graphblas/algorithms/hpcg/system_building_utils.hpp
index 11adf82c1..37e6da311 100644
--- a/include/graphblas/algorithms/hpcg/system_building_utils.hpp
+++ b/include/graphblas/algorithms/hpcg/system_building_utils.hpp
@@ -1,6 +1,6 @@
 
 /*
- *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,140 +16,430 @@
  */
 
 /**
- * @file hpcg_system_building_utils.hpp
+ * @file system_building_utils.hpp
  * @author Alberto Scolari (alberto.scolari@huawei.com)
- * @brief Utilities to build an antire system for HPCG simulations in an arbitrary number of dimensions.
- * @date 2021-04-30
+ * Utilities to build an entire system for HPCG simulations in an arbitrary number of dimensions.
  */
 
-#ifndef _H_GRB_ALGORITHMS_SYSTEM_BUILDING_UTILS
-#define _H_GRB_ALGORITHMS_SYSTEM_BUILDING_UTILS
+#ifndef _H_GRB_ALGORITHMS_HPCG_SYSTEM_BUILDING_UTILS
+#define _H_GRB_ALGORITHMS_HPCG_SYSTEM_BUILDING_UTILS
 
+#include <algorithm>
 #include <array>
 #include <cassert>
+#include <cmath>
 #include <cstddef>
+#include <cstdlib>
 #include <memory>
+#include <stdexcept>
+#include <string>
 
 #include <graphblas.hpp>
+#include <graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp>
+#include <graphblas/algorithms/multigrid/single_matrix_coarsener.hpp>
+#include <graphblas/utils/iterators/partition_range.hpp>
 
-#include "hpcg_data.hpp"
-#include "matrix_building_utils.hpp"
-
+#include "average_coarsener.hpp"
+#include "greedy_coloring.hpp"
+#include "single_point_coarsener.hpp"
+#include "system_builder.hpp"
 
 namespace grb {
 	namespace algorithms {
 
 		/**
-		 * @brief Divide each value of \p source by \p step and store the result into \p destination.
+		 * Container of the parameter for HPCG simulation generation: physical system characteristics and
+		 * coarsening information.
+		 *
+		 * @tparam DIMS dimensions of the physical system
+		 * @tparam NonzeroType type of matrix values
+		 */
+		template<
+			size_t DIMS,
+			typename NonzeroType
+		> struct HPCGSystemParams {
+			std::array< size_t, DIMS > physical_sys_sizes; ///< sizes of the finest system, one per dimension
+			size_t halo_size; ///< halo passed to the HPCGSystemBuilder of every level
+			NonzeroType diag_value; ///< value passed as diagonal value to every HPCGSystemBuilder
+			NonzeroType non_diag_value; ///< value passed as non-diagonal value to every HPCGSystemBuilder
+			size_t min_phys_size; ///< levels are generated while the smallest dimension is at least this large
+			size_t max_levels; ///< highest coarsening level generated (level 0 is the finest system)
+			size_t coarsening_step; ///< divisor applied to every dimension to obtain the next level
+		};
+
+		/**
+		 * Builds all required system generators for an entire multi-grid simulation; each generator
+		 * corresponds to a level of the HPCG system multi-grid, with increasingly coarser sizes, and can
+		 * generate the system matrix of that level. All the pieces of information required to build
+		 * the levels are stored in \p params.
 		 *
-		 * @tparam DIMS size of passed arrays
+		 * @tparam DIMS number of dimensions
+		 * @tparam CoordType type storing the coordinates and the sizes
+		 * @tparam NonzeroType type of the nonzero
+		 * @param[in] params structure with the parameters to build an entire HPCG simulation
+		 * @param[out] mg_generators std::vector of HPCGSystemBuilder, one per layer of the multi-grid
 		 */
-		template< std::size_t DIMS >
-		void divide_array( std::array< std::size_t, DIMS > & destination, const std::array< std::size_t, DIMS > & source, std::size_t step ) {
-			for( std::size_t i { 0 }; i < destination.size(); i++ ) {
-				destination[ i ] = source[ i ] / step;
+		template<
+			size_t DIMS,
+			typename CoordType,
+			typename NonzeroType
+		> void hpcg_build_multigrid_generators(
+			const HPCGSystemParams< DIMS, NonzeroType > & params,
+			std::vector< grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > > & mg_generators
+		) {
+			static_assert( DIMS > 0, "DIMS must be > 0" );
+			if( params.coarsening_step == 0 ) { throw std::invalid_argument( "coarsening step must be > 0" ); }
+			size_t const current_size = std::accumulate( params.physical_sys_sizes.cbegin(),
+				params.physical_sys_sizes.cend(), 1UL, std::multiplies< size_t >() );
+			if( current_size > std::numeric_limits< CoordType >::max() ) {
+				throw std::domain_error( "CoordType cannot store the matrix coordinates" );
+			}
+			size_t min_physical_size = *std::min_element( params.physical_sys_sizes.cbegin(),
+				params.physical_sys_sizes.cend() );
+			if( min_physical_size < params.min_phys_size ) {
+				throw std::domain_error( "the initial system is too small" );
+			}
+
+			std::array< CoordType, DIMS > coord_sizes;
+			// type-translate coordinates
+			std::copy( params.physical_sys_sizes.cbegin(), params.physical_sys_sizes.cend(),
+				coord_sizes.begin() );
+
+			// generate one builder per level, from the finest (input sizes) to the coarsest
+			for( size_t coarsening_level = 0UL; min_physical_size >= params.min_phys_size
+				&& coarsening_level <= params.max_levels; coarsening_level++ ) {
+				// build the generator of the current level
+				mg_generators.emplace_back( coord_sizes, params.halo_size, params.diag_value,
+					params.non_diag_value );
+				// prepare for new iteration; stop first if no further level follows, so that
+				// sizes that are never used to build a level are not checked for divisibility
+				min_physical_size /= params.coarsening_step;
+				if( min_physical_size < params.min_phys_size || coarsening_level == params.max_levels ) { break; }
+				std::for_each( coord_sizes.begin(), coord_sizes.end(), [ &params ]( CoordType & v ) {
+					std::ldiv_t ratio = std::ldiv( v, params.coarsening_step );
+					if( ratio.rem != 0 ) {
+						throw std::invalid_argument( std::string( "system size " ) + std::to_string( v )
+							+ std::string( " is not divisible by " ) + std::to_string( params.coarsening_step ) );
+					}
+					v = ratio.quot;
+				} );
 			}
 		}
 
 		/**
-		 * @brief Container of the parameter for HPCG simulation generation: physical system characteristics and
-		 * coarsening information.
+		 * Populates the system matrix \p M out of the builder \p system_generator.
 		 *
-		 * @tparam DIMS dimensions of the physical system
-		 * @tparam T type of matrix values
+		 * The matrix \p M must have been previously allocated and initialized with the proper sizes,
+		 * as this procedure only populates it with the nonzeroes generated by \p system_generator.
+		 *
+		 * This function takes care of the parallelism by employing random-access iterators and by
+		 * \b parallelizing the generation across multiple processes in case of distributed execution.
 		 */
-		template< std::size_t DIMS, typename T >
-		struct hpcg_system_params {
-			const std::array< std::size_t, DIMS > & physical_sys_sizes;
-			const std::size_t halo_size;
-			const std::size_t num_colors;
-			const T diag_value;
-			const T non_diag_value;
-			const std::size_t min_phys_size;
-			const std::size_t max_levels;
-			const std::size_t coarsening_step;
-		};
+		template<
+			size_t DIMS,
+			typename CoordType,
+			typename NonzeroType,
+			typename Logger
+		> grb::RC hpcg_populate_system_matrix(
+			const grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > & system_generator,
+			grb::Matrix< NonzeroType > & M, Logger & logger
+		) {
+			logger << "- generating system matrix..."; // Logger only needs to support operator<< on a string
+			typename grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType >::Iterator
+				begin( system_generator.make_begin_iterator() );
+			typename grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType >::Iterator
+				end( system_generator.make_end_iterator() );
+			grb::utils::partition_iteration_range_on_procs( spmd<>::nprocs(), spmd<>::pid(),
+				system_generator.num_neighbors(), begin, end ); // NOTE(review): presumably narrows [begin, end) to this process' share -- confirm in partition_range.hpp
+			return buildMatrixUnique( M, begin, end, grb::IOMode::PARALLEL ); // its return code is forwarded to the caller
+		}
 
 		/**
-		 * @brief Generates an entire HPCG problem according to the parameters in \p params , storing it in \p holder .
+		 * Populates the coarsening data \p coarsener (in particular the coarsening matrix) from the
+		 * builder of the finer system \p finer_system_generator and that of the coarser system
+		 * \p coarser_system_generator.
 		 *
-		 * @tparam DIMS dimensions of the system
-		 * @tparam T type of matrix values
-		 * @param holder std::unique_ptr to store the HPCG problem into
-		 * @param params parameters container to build the HPCG problem
-		 * @return grb::SUCCESS if every GraphBLAS operation (to generate vectors and matrices) succeeded,
-		 * otherwise the first unsuccessful return value
+		 * This function takes care of parallelizing the generation by using a random-access iterator
+		 * to generate the coarsening matrix and by distributing the generation across nodes
+		 * of a distributed system (if any).
+		 * @tparam IterBuilderType type of the matrix builder, either SinglePointCoarsenerBuilder
+		 *  or AverageCoarsenerBuilder
+		 * @tparam DIMS number of dimensions
+		 * @tparam CoordType type storing the coordinates and the sizes
+		 * @tparam NonzeroType type of the nonzero
+		 *
+		 * @param finer_system_generator object generating the finer system
+		 * @param coarser_system_generator object generating the coarser system
+		 * @param coarsener structure with the matrix to populate
 		 */
-		template< std::size_t DIMS, typename T = double >
-		grb::RC build_hpcg_system( std::unique_ptr< grb::algorithms::hpcg_data< T, T, T > > & holder, hpcg_system_params< DIMS, T > & params ) {
-			// n is the system matrix size
-			const std::size_t n { std::accumulate( params.physical_sys_sizes.cbegin(), params.physical_sys_sizes.cend(), 1UL, std::multiplies< std::size_t >() ) };
+		template<
+			typename IterBuilderType,
+			size_t DIMS,
+			typename CoordType,
+			typename IOType,
+			typename NonzeroType
+		> grb::RC hpcg_populate_coarsener_any_builder(
+			const grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > & finer_system_generator,
+			const grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > & coarser_system_generator,
+			CoarseningData< IOType, NonzeroType > & coarsener
+		) {
+			static_assert( DIMS > 0, "DIMS must be > 0" );
 
-			grb::algorithms::hpcg_data< T, T, T > * data { new grb::algorithms::hpcg_data< T, T, T >( n ) };
+			const std::array< CoordType, DIMS > & finer_sizes = finer_system_generator.get_generator().get_sizes();
+			const std::array< CoordType, DIMS > & coarser_sizes = coarser_system_generator.get_generator().get_sizes();
+			const size_t finer_size = finer_system_generator.system_size();
+			const size_t coarser_size = coarser_system_generator.system_size();
 
-			assert( ! holder ); // should be empty
-			holder = std::unique_ptr< grb::algorithms::hpcg_data< T, T, T > >( data );
+			if( coarser_size >= finer_size ) { // the coarser system must have strictly fewer elements
+				throw std::invalid_argument( "wrong sizes" );
+			}
 
-			// initialize the main (=uncoarsened) system matrix
-			grb::RC rc { grb::SUCCESS };
-			rc = build_ndims_system_matrix< DIMS, T >( data->A, params.physical_sys_sizes, params.halo_size, params.diag_value, params.non_diag_value );
+			size_t const rows = coarser_size; // one matrix row per element of the coarser system
+			size_t const cols = finer_size; // one matrix column per element of the finer system
 
-			if( rc != grb::SUCCESS ) {
-				std::cerr << "Failure to generate the initial system (" << toString( rc ) << ") of size " << n << std::endl;
-				return rc;
+			assert( finer_sizes.size() == coarser_sizes.size() );
+
+			grb::Matrix< NonzeroType > & M = coarsener.coarsening_matrix;
+			if( grb::nrows( M ) != rows || grb::ncols( M ) != cols ) {
+				throw std::invalid_argument( "wrong matrix dimensions: matrix should be rectangular"
+											 " with rows == <coarser size> and cols == <finer size>" );
 			}
 
-			// set values of diagonal vector
-			set( data->A_diagonal, params.diag_value );
-
-			build_static_color_masks( data->color_masks, n, params.num_colors );
-
-			// initialize coarsening with additional pointers and dimensions copies to iterate and divide
-			grb::algorithms::multi_grid_data< T, T > ** coarser = &data->coarser_level;
-			assert( *coarser == nullptr );
-			std::array< std::size_t, DIMS > coarser_sizes;
-			std::array< std::size_t, DIMS > previous_sizes( params.physical_sys_sizes );
-			std::size_t min_physical_coarsened_size { *std::min_element( previous_sizes.cbegin(), previous_sizes.cend() ) / params.coarsening_step };
-			// coarsen system sizes into coarser_sizes
-			divide_array( coarser_sizes, previous_sizes, params.coarsening_step );
-			std::size_t coarsening_level = 0UL;
-
-			// generate linked list of hierarchical coarseners
-			while( min_physical_coarsened_size >= params.min_phys_size && coarsening_level < params.max_levels ) {
-				assert( *coarser == nullptr );
-				// compute size of finer and coarser matrices
-				std::size_t coarser_size { std::accumulate( coarser_sizes.cbegin(), coarser_sizes.cend(), 1UL, std::multiplies< std::size_t >() ) };
-				std::size_t previous_size { std::accumulate( previous_sizes.cbegin(), previous_sizes.cend(), 1UL, std::multiplies< std::size_t >() ) };
-				// build data structures for new level
-				grb::algorithms::multi_grid_data< double, double > * new_coarser { new grb::algorithms::multi_grid_data< double, double >( coarser_size, previous_size ) };
-				// install coarser level immediately to cleanup in case of build error
-				*coarser = new_coarser;
-				// initialize coarsener matrix, system matrix and diagonal vector for the coarser level
-				rc = build_ndims_coarsener_matrix< DIMS >( new_coarser->coarsening_matrix, coarser_sizes, previous_sizes );
-				if( rc != grb::SUCCESS ) {
-					std::cerr << "Failure to generate coarsening matrix (" << toString( rc ) << ")." << std::endl;
-					return rc;
+			IterBuilderType coarsener_builder( finer_sizes, coarser_sizes ); // must outlive begin and end, built from it
+			typename IterBuilderType::Iterator begin( coarsener_builder.make_begin_iterator() ), end( coarsener_builder.make_end_iterator() );
+			grb::utils::partition_iteration_range_on_procs( spmd<>::nprocs(), spmd<>::pid(), coarsener_builder.system_size(), begin, end ); // NOTE(review): presumably narrows [begin, end) to this process' share -- confirm
+			return buildMatrixUnique( M, begin, end, grb::IOMode::PARALLEL ); // its return code is forwarded to the caller
+		}
+
+		/**
+		 * Populates a coarsener that samples one element every \a 2^DIMS .
+		 */
+		template<
+			size_t DIMS,
+			typename CoordType,
+			typename IOType,
+			typename NonzeroType
+		> grb::RC hpcg_populate_coarsener(
+			const grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > & finer_system_generator,
+			const grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > & coarser_system_generator,
+			CoarseningData< IOType, NonzeroType > & coarsener
+		) {
+			return hpcg_populate_coarsener_any_builder< // only the builder type is given, the rest is deduced
+				grb::algorithms::SinglePointCoarsenerBuilder< DIMS, CoordType, NonzeroType > >(
+					finer_system_generator, coarser_system_generator, coarsener );
+		}
+
+		/**
+		 * Populates a coarsener that averages over \a 2^DIMS elements.
+		 */
+		template<
+			size_t DIMS,
+			typename CoordType,
+			typename IOType,
+			typename NonzeroType
+		> grb::RC hpcg_populate_coarsener_avg(
+			const grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > & finer_system_generator,
+			const grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > & coarser_system_generator,
+			CoarseningData< IOType, NonzeroType > & coarsener
+		) {
+			return hpcg_populate_coarsener_any_builder< // only the builder type is given, the rest is deduced
+			grb::algorithms::hpcg::AverageCoarsenerBuilder< DIMS, CoordType, NonzeroType > >(
+				finer_system_generator, coarser_system_generator, coarsener );
+		}
+
+		namespace internal {
+
+			/**
+			 * Store row values based on their color into separate vectors.
+			 *
+			 * @param[in] row_colors for each row (corresponding to a vector position) its color
+			 * @param[in] num_colors number of colors, i.e. max across all values in \p row_colors + 1
+			 * @param[out] per_color_rows for each position \a i it stores an std::vector with all rows
+			 *  of color \a i inside \p row_colors
+			 */
+			template< typename CoordType > void hpcg_split_rows_by_color(
+				const std::vector< CoordType > & row_colors,
+				size_t num_colors, std::vector< std::vector< CoordType > > & per_color_rows
+			) {
+				per_color_rows.resize( num_colors ); // resize only: vectors already stored in per_color_rows keep their content
+				for( CoordType i = 0; i < row_colors.size(); i++ ) {
+					per_color_rows[ row_colors[ i ] ].push_back( i ); // no bounds check: assumes every color is < num_colors
 				}
-				rc = build_ndims_system_matrix< DIMS, T >( new_coarser->A, coarser_sizes, params.halo_size, params.diag_value, params.non_diag_value );
-				if( rc != grb::SUCCESS ) {
-					std::cerr << "Failure to generate system matrix (" << toString( rc ) << ")for size " << coarser_size << std::endl;
-					return rc;
+			}
+
+			/**
+			 * Utility class implementing a random-access iterator that always returns a
+			 * \c true value.
+			 *
+			 * It is used in the following to build mask vectors via buildVectorUnique(), where
+			 * all the non-zero positions are \c true.
+			 *
+			 * @tparam CoordType type of the internal coordinate
+			 */
+			template< typename CoordType > struct true_iter {
+
+				using self_t = true_iter< CoordType >;
+				using iterator_category = std::random_access_iterator_tag;
+				using value_type = bool;
+				using pointer = const bool *;
+				using reference = const bool &;
+				using difference_type = long;
+
+				true_iter() = delete;
+
+				true_iter( CoordType first ) : index( first ) {} // iterator positioned at coordinate first
+
+				true_iter( const self_t & ) = default;
+
+				self_t & operator=( const self_t & ) = default; // usable: the class has no const data member
+
+				bool operator!=( const self_t & other ) const { // iterators differ iff their positions differ
+					return this->index != other.index;
 				}
-				set( new_coarser->A_diagonal, params.diag_value );
-				// build color masks for coarser level (same masks, but with coarser system size)
-				rc = build_static_color_masks( new_coarser->color_masks, coarser_size, params.num_colors );
 
-				// prepare for new iteration
-				coarser = &new_coarser->coarser_level;
-				min_physical_coarsened_size /= params.coarsening_step;
-				previous_sizes = coarser_sizes;
-				divide_array( coarser_sizes, coarser_sizes, params.coarsening_step );
-				coarsening_level++;
+				self_t & operator++() noexcept { // advances the position by 1
+					++index;
+					return *this;
+				}
+
+				self_t & operator+=( size_t increment ) noexcept { // advances the position by increment
+					index += increment;
+					return *this;
+				}
+
+				difference_type operator-( const self_t & other ) const noexcept { // distance between the positions
+					return static_cast< difference_type >( this->index - other.index );
+				}
+
+				pointer operator->() const { // address of the stored true value
+					return &_true_value;
+				}
+
+				reference operator*() const { // always refers to a true value
+					return *( this->operator->() );
+				}
+
+			private:
+				CoordType index; // current position
+				bool _true_value = true; // always true; a member so that its address can be passed outside
+			};
+
+			/**
+			 * Populates \p masks with one color mask per entry of \p per_color_rows, each mask being a
+			 * sparse boolean vector of size \p matrix_size .
+			 *
+			 * The mask for color \a i is stored in \c masks[i] and has value \c true in the positions
+			 * listed in \c per_color_rows[i]: these positions are passed as indices to buildVectorUnique(),
+			 * together with an iterator producing only \c true values.
+			 *
+			 * The vectors stored in \p masks (which must be empty at the beginning) are built inside the
+			 * function and populated only with the \c true values, leading to sparse vectors. This saves on
+			 * storage space and allows GraphBLAS routines (like \c eWiseLambda() ) to iterate only on true values.
+			 *
+			 * @param[in] matrix_size size of the system matrix, i.e., size of each mask
+			 * @param[in] per_color_rows for each color, the rows (mask positions) with that color
+			 * @param[out] masks output vector of color masks; std::invalid_argument is thrown if not empty
+			 * @return grb::RC SUCCESS, or the first error returned by buildVectorUnique()
+			 */
+			inline grb::RC hpcg_build_static_color_masks( // inline: non-template function defined in a header
+				size_t matrix_size,
+				const std::vector< std::vector< size_t > > & per_color_rows,
+				std::vector< grb::Vector< bool > > & masks
+			) {
+				if( ! masks.empty() ) {
+					throw std::invalid_argument( "vector of masks is expected to be empty" );
+				}
+				for( size_t i = 0; i < per_color_rows.size(); i++ ) {
+					const std::vector< size_t > & rows = per_color_rows[ i ];
+#ifdef _DEBUG
+					{
+						std::cout << "\ncolor " << i << std::endl;
+						for( size_t row : rows ) {
+							std::cout << row << " ";
+						}
+						std::cout << std::endl;
+					}
+#endif
+					masks.emplace_back( matrix_size );
+					grb::Vector< bool > & output_mask = masks.back();
+					std::vector< size_t >::const_iterator begin = rows.cbegin();
+					std::vector< size_t >::const_iterator end = rows.cend();
+					// partition_iteration_range( rows.size(), begin, end );
+					grb::RC rc = grb::buildVectorUnique( output_mask, begin, end,
+						true_iter< size_t >( 0 ), true_iter< size_t >( rows.size() ), IOMode::SEQUENTIAL );
+					if( rc != SUCCESS ) {
+						std::cerr << "error while creating output mask for color " << i << ": " << toString( rc ) << std::endl;
+						return rc;
+					}
+#ifdef _DEBUG
+					{
+						std::cout << "mask color " << i << std::endl;
+						size_t count = 0;
+						for( const auto & v : output_mask ) {
+							std::cout << v.first << " ";
+							count++;
+							if( count > 20 )
+								break;
+						}
+						std::cout << std::endl;
+					}
+#endif
+				}
+				return grb::SUCCESS;
+			}
+
+		} // namespace internal
+
+		/**
+		 * Populates the smoothing information \p smoothing_info for a Red-Black Gauss-Seidel smoother
+		 * to be used for an HPCG simulation. The information about the mesh to smooth is passed
+		 * via \p system_generator, while progress messages are written to \p logger.
+		 *
+		 * Steps for the smoother generation:
+		 * 0. \c smoothing_info.A_diagonal is set to the diagonal value of \p system_generator
+		 * 1. the mesh elements (the system matrix rows) are colored via a greedy algorithm, so that
+		 *  no two neighboring elements have the same color; this phase colors the \b entire system
+		 *  and cannot be parallelized, even in a distributed system, since the current coloring algorithm
+		 *  is \b not distributed
+		 * 2. rows are split according to their color
+		 * 3. for each color \a c the color mask with the corresponding rows is generated:
+		 *  a dedicated sparse grb::Vector<bool> signals the rows of color \a c (by marking them as \c true
+		 *  ); such a vector allows updating all rows of color \a c in \b parallel when used as a mask
+		 *  to an mxv() operation (as done during smoothing)
+		 */
+		template<
+			size_t DIMS,
+			typename CoordType,
+			typename NonzeroType,
+			typename Logger
+		> grb::RC hpcg_populate_smoothing_data(
+			const grb::algorithms::HPCGSystemBuilder< DIMS, CoordType, NonzeroType > & system_generator,
+			SmootherData< NonzeroType > & smoothing_info, Logger & logger
+		) {
+			grb::RC rc = set( smoothing_info.A_diagonal, system_generator.get_diag_value() );
+			if( rc != grb::SUCCESS ) {
+				logger << "error: " << __LINE__ << std::endl;
+				return rc;
+			}
+
+			logger << "- running coloring heuristics...";
+			std::vector< CoordType > colors, color_counters;
+			hpcg_greedy_color_ndim_system( system_generator.get_generator(), colors, color_counters ); // result not stored in rc
+			std::vector< std::vector< CoordType > > per_color_rows;
+			internal::hpcg_split_rows_by_color( colors, color_counters.size(), per_color_rows );
+			colors.clear(); // the per-row colors are not read anymore after the split
+			colors.shrink_to_fit();
+			if( rc != grb::SUCCESS ) { // NOTE(review): rc is unchanged since the check above, so this looks unreachable
+				logger << "error: " << __LINE__ << std::endl;
+				return rc;
 			}
-			return rc;
+			logger << "- found " << color_counters.size() << " colors,"
+				   << " generating color masks...";
+			return internal::hpcg_build_static_color_masks( system_generator.system_size(),
+				per_color_rows, smoothing_info.color_masks );
 		}
 
 	} // namespace algorithms
 } // namespace grb
 
-#endif // _H_GRB_ALGORITHMS_SYSTEM_BUILDING_UTILS
+#endif // _H_GRB_ALGORITHMS_HPCG_SYSTEM_BUILDING_UTILS
diff --git a/include/graphblas/algorithms/multigrid/multigrid_building_utils.hpp b/include/graphblas/algorithms/multigrid/multigrid_building_utils.hpp
new file mode 100644
index 000000000..9c95b50cc
--- /dev/null
+++ b/include/graphblas/algorithms/multigrid/multigrid_building_utils.hpp
@@ -0,0 +1,101 @@
+
+/*
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file multigrid_building_utils.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * Utilities to allocate data for an entire multi-grid simulation.
+ */
+
+#ifndef _H_GRB_ALGORITHMS_MULTIGRID_BUILDING_UTILS
+#define _H_GRB_ALGORITHMS_MULTIGRID_BUILDING_UTILS
+
+#include <cstddef>
+#include <memory>
+#include <vector>
+
+namespace grb {
+	namespace algorithms {
+
+		/**
+		 * Allocates all the levels for an entire multi-grid simulation for the multi-grid v-cycle,
+		 * the coarsener and the smoother. This routine just allocates and initializes the data structures,
+		 * but does \b not populate them, which depends on the specific algorithms.
+		 *
+		 * Thanks to the templating, this routine is meant to be independent from the specific algorithm
+		 * chosen for the simulation, but simply implements the logic to move from one level (finer)
+		 * to the next one (coarser). To be used with any data structure, the constructor of each
+		 * structure must meet a certain interface, as explained in the following.
+		 *
+		 * Note: structures are allocated on the heap and managed via an std::unique_ptr for efficiency
+		 * and convenience: since they may store large data amounts, moving them via their move (copy)
+		 * constructor (as required for the growth of an std::vector) may be costly, and forces the user
+		 * to implement the move constructor for each type (which may be annoying).
+		 * Furthermore, avoiding movement (copy) entirely protects against possible bugs
+		 * in move (copy)-constructor logic (not uncommon in prototypes).
+		 *
+		 * @tparam MGInfoType type holding the information to run the chosen multi-grid algorithm:
+		 *  its constructor must take in input the telemetry controller, the coarsening level
+		 *  (0 to \p mg_sizes.size() - 1) and the size of the system matrix for that level (in this order)
+		 * @tparam CoarsenerInfoType type holding the information for the coarsener;
+		 *  its constructor must take in input the size of the finer system matrix and that of
+		 *  the coarser system matrix (in this order)
+		 * @tparam SmootherInfoType type holding the information for the smoother;
+		 *  its constructor must take in input the size of the system matrix for that level
+		 * @tparam TelControllerType telemetry controller type, to (de)activate time measurement at compile-time
+		 *
+		 * @param system_levels system data (system matrix, residual, solution, ...) for each level
+		 * @param coarsener_levels at position \a i of this vector, data to coarsen from level \a i
+		 *  (system size \p mg_sizes [i] ) to level \a i+1 (system size \p mg_sizes [i+1] )
+		 * @param smoother_levels smoother data for each level
+		 * @param mg_sizes system matrix size per level: non-empty and strictly decreasing, or std::invalid_argument is thrown
+		 * @param tt telemetry controller to control time tracing
+		 */
+		template<
+			typename MGInfoType,
+			typename CoarsenerInfoType,
+			typename SmootherInfoType,
+			typename TelControllerType
+		> void multigrid_allocate_data(
+			std::vector< std::unique_ptr< MGInfoType > > & system_levels,
+			std::vector< std::unique_ptr< CoarsenerInfoType > > & coarsener_levels,
+			std::vector< std::unique_ptr< SmootherInfoType > > & smoother_levels,
+			const std::vector< size_t > & mg_sizes,
+			const TelControllerType & tt
+		) {
+			if( mg_sizes.empty() ) {
+				throw std::invalid_argument( "at least one size should be available" );
+			}
+			size_t finer_size = mg_sizes[ 0 ]; // objects below are owned before insertion: no leak if the vector growth throws
+			system_levels.emplace_back( std::unique_ptr< MGInfoType >( new MGInfoType( tt, 0, finer_size ) ) ); // create main system
+			smoother_levels.emplace_back( std::unique_ptr< SmootherInfoType >( new SmootherInfoType( finer_size ) ) ); // create smoother for main
+			for( size_t i = 1; i < mg_sizes.size(); i++ ) {
+				size_t coarser_size = mg_sizes[ i ];
+				if( coarser_size >= finer_size ) {
+					throw std::invalid_argument( "system sizes not monotonically decreasing" );
+				}
+				coarsener_levels.emplace_back( std::unique_ptr< CoarsenerInfoType >( new CoarsenerInfoType( finer_size, coarser_size ) ) );
+				system_levels.emplace_back( std::unique_ptr< MGInfoType >( new MGInfoType( tt, i, coarser_size ) ) );
+				smoother_levels.emplace_back( std::unique_ptr< SmootherInfoType >( new SmootherInfoType( coarser_size ) ) );
+				finer_size = coarser_size;
+			}
+		}
+
+	} // namespace algorithms
+} // namespace grb
+
+#endif // _H_GRB_ALGORITHMS_MULTIGRID_BUILDING_UTILS
diff --git a/include/graphblas/algorithms/multigrid/multigrid_cg.hpp b/include/graphblas/algorithms/multigrid/multigrid_cg.hpp
new file mode 100644
index 000000000..5fa1a3772
--- /dev/null
+++ b/include/graphblas/algorithms/multigrid/multigrid_cg.hpp
@@ -0,0 +1,343 @@
+
+/*
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @dir include/graphblas/algorithms/multigrid
+ * This folder contains the implementation of the algorithms for a basic multi-grid V-cycle solver:
+ * Conjugate Gradient with multi-grid, a basic V-cycle multi-grid implementation, a single-matrix coarsener/
+ * prolonger, an implementation of a Red-Black Gauss-Seidel smoother. These algorithms can be composed
+ * via their specific runners, as in the example HPCG benchmark.
+ */
+
+/**
+ * @file multigrid_cg.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * Algorithm and runner for a Conjugate Gradient solver augmented with a multi-grid solver.
+ */
+
+#ifndef _H_GRB_ALGORITHMS_MULTIGRID_CG
+#define _H_GRB_ALGORITHMS_MULTIGRID_CG
+
+#include <type_traits>
+#include <utility>
+
+#include <graphblas.hpp>
+#include <graphblas/utils/telemetry/OutputStream.hpp>
+#include <graphblas/utils/telemetry/Timeable.hpp>
+
+#include "multigrid_data.hpp"
+
+namespace grb {
+	namespace algorithms {
+
+		/**
+		 * Data structure to store the vectors specific to the Conjugate Gradient algorithm,
+		 * including inputs, outputs and temporary vectors.
+		 *
+		 * Input and output vectors use the same naming scheme as for the corresponding mathematics,
+		 * where the equation to solve is conventionally written as \f$ A x = b \f$.
+		 *
+		 * @tparam IOType type of values of the vectors for intermediate results
+		 * @tparam NonzeroType type of the values of the system matrix (unused by the members declared here)
+		 * @tparam InputType type of the values of the right-hand side vector #b
+		 */
+		template<
+			typename IOType,
+			typename NonzeroType,
+			typename InputType
+		> struct MultiGridCGData {
+
+			grb::Vector< InputType > b; ///< Right-side vector of known values.
+			grb::Vector< IOType > u;    ///< temporary vector (the CG solver stores the product A * p here)
+			grb::Vector< IOType > p;    ///< temporary vector (the CG solver uses it as direction vector)
+			grb::Vector< IOType > x;    ///< system solution being refined over the iterations: it is up to the user
+			///< to set the initial solution value to something meaningful
+
+			/**
+			 * Construct a new \c MultiGridCGData object by building its vectors with size \p sys_size.
+			 */
+			MultiGridCGData( size_t sys_size ) :
+				b( sys_size ),
+				u( sys_size ),
+				p( sys_size ),
+				x( sys_size ) {}
+
+			grb::RC init_vectors( IOType zero ) { // sets only the temporaries u and p; b and x are left untouched
+				grb::RC rc = grb::set( u, zero );
+				rc = rc ? rc : grb::set( p, zero ); // skipped if the previous set() failed
+				return rc;
+			}
+		};
+
+		/**
+		 * Structure for the output information of a CG run; its members have no default initializer.
+		 */
+		template< typename ResidualType > struct CGOutInfo {
+			size_t iterations;          ///< number of iterations performed
+			ResidualType norm_residual; ///< norm of the final residual
+		};
+
+		/**
+		 * Runner object encapsulating all information to run a Conjugate Gradient solver
+		 * with multi-grid.
+		 *
+		 * The multi-grid runner must be constructed separately (depending on the chosen algorithm)
+		 * and is stored by reference inside this runner, hence it must outlive this runner.
+		 * The \p MultiGridRunnerType must implement a functional interface whose input (from CG)
+		 * is the structure with the system information for one level of the grid.
+		 *
+		 * @tparam MGCGTypes types container for algebraic information (IOType, NonzeroType, InputType, ResidualType, Ring, Minus)
+		 * @tparam MultiGridRunnerType type for the multi-grid runner object
+		 * @tparam TelControllerType telemetry controller type, passed to the Timeable base class
+		 * @tparam descr descriptors with statically-known data for computation and containers
+		 * @tparam DbgOutputStreamType type for the debugging stream, i.e. the stream to trace simulation
+		 * 	results alongside execution; the default type #grb::utils::telemetry::OutputStreamOff disables
+		 * 	all output at compile time
+		 */
+		template<
+			typename MGCGTypes,
+			typename MultiGridRunnerType,
+			typename TelControllerType,
+			Descriptor descr = descriptors::no_operation,
+			typename DbgOutputStreamType = grb::utils::telemetry::OutputStreamOff
+		> struct MultiGridCGRunner : public grb::utils::telemetry::Timeable< TelControllerType > {
+
+			// algebraic types
+			using IOType = typename MGCGTypes::IOType;
+			using NonzeroType = typename MGCGTypes::NonzeroType;
+			using InputType = typename MGCGTypes::InputType;
+			using ResidualType = typename MGCGTypes::ResidualType;
+			using Ring = typename MGCGTypes::Ring;
+			using Minus = typename MGCGTypes::Minus;
+			// input types for simulation (CG and MG)
+			using HPCGInputType = MultiGridCGData< IOType, NonzeroType, InputType >;
+			using MGRunnerType = MultiGridRunnerType;
+
+			static_assert( std::is_default_constructible< Ring >::value,
+				"cannot construct the Ring with default values" );
+			static_assert( std::is_default_constructible< Minus >::value,
+				"cannot construct the Minus operator with default values" );
+			static_assert( std::is_move_constructible< MultiGridRunnerType >::value,
+				"cannot construct the Multi-Grid runner by move" );
+
+			Ring ring; ///< algebraic ring to be used
+			Minus minus; ///< minus operator to be used
+			bool with_preconditioning = true; ///<  whether preconditioning is enabled
+			size_t max_iterations = 10; ///< max number of allowed iterations for CG:
+			///< after that, the solver is halted and the result achieved so far returned
+			ResidualType tolerance = ring.template getZero< ResidualType >(); ///< ratio
+			///< between initial residual and current residual that halts the solver
+			///< if reached, for the solution is to be considered "good enough"
+			MultiGridRunnerType & mg_runner; ///< runner object for MG
+			DbgOutputStreamType dbg_logger; ///< logger to trace execution
+
+			/**
+			 * Construct a new MultiGridCGRunner object with the required MG runner (stored by reference);
+			 * the debug logger is default-constructed, thus its type must be default-constructible.
+			 */
+			MultiGridCGRunner(
+				const TelControllerType & tt,
+				MultiGridRunnerType & _mg_runner
+			) :
+				grb::utils::telemetry::Timeable< TelControllerType >( tt ),
+				mg_runner( _mg_runner ),
+				dbg_logger()
+			{
+				static_assert( std::is_default_constructible< DbgOutputStreamType >::value, // message-less form needs C++17
+					"cannot construct the debug logger with default values" );
+			}
+
+			/**
+			 * Construct a new MultiGridCGRunner object with the required MG runner (stored by reference)
+			 * 	and the user-given debug logger, which is copied into the #dbg_logger member.
+			 */
+			MultiGridCGRunner(
+				const TelControllerType & tt,
+				MultiGridRunnerType & _mg_runner,
+				DbgOutputStreamType & _dbg_logger
+			) :
+				grb::utils::telemetry::Timeable< TelControllerType >( tt ),
+				mg_runner( _mg_runner ),
+				dbg_logger( _dbg_logger ) {}
+
+			/**
+			 * Functional operator to invoke a full CG-MG computation.
+			 * It calls multigrid_conjugate_gradient() between \c this->start() and \c this->stop().
+			 * @param grid_base base level of the grid
+			 * @param cg_data data for CG
+			 * @param out_info output information from CG
+			 * @return grb::RC indicating the success or the error occurred
+			 */
+			inline grb::RC operator()(
+				typename MultiGridRunnerType::MultiGridInputType & grid_base,
+				MultiGridCGData< IOType, NonzeroType, InputType > & cg_data,
+				CGOutInfo< ResidualType > & out_info
+			) {
+				this->start(); // presumably inherited from Timeable to begin the time measurement - TODO confirm
+				grb::RC ret = multigrid_conjugate_gradient( cg_data, grid_base, out_info ); // note the different argument order
+				this->stop(); // not reached if the call above throws
+				return ret;
+			}
+
+			/**
+			 * Conjugate Gradient algorithm implementation augmented by a Multi-Grid solver,
+			 * inspired by the High Performance Conjugate Gradient benchmark.
+			 *
+			 * This CG solver calls the MG solver at the beginning of each iteration to improve
+			 * the initial solution via the residual (thanks to the smoother) and then proceeds with
+			 * the standard CG iteration. If the initial residual is exactly zero, no iteration is run.
+			 *
+			 * Once a GraphBLAS operation fails, the subsequent GraphBLAS operations are skipped and
+			 * the failure code is returned.
+			 *
+			 * @param cg_data data for the CG solver only
+			 * @param grid_base base (i.e., finer) level of the multi-grid, with the information of the physical system
+			 * @param out_info solver output information
+			 * @return grb::RC SUCCESS in case of successful run
+			 */
+			grb::RC multigrid_conjugate_gradient(
+				HPCGInputType & cg_data,
+				typename MultiGridRunnerType::MultiGridInputType & grid_base,
+				CGOutInfo< ResidualType > & out_info
+			) {
+				const grb::Matrix< NonzeroType > & A = grid_base.A; // system matrix
+				grb::Vector< IOType > & r = grid_base.r;            // residual vector
+				grb::Vector< IOType > & z = grid_base.z;            // pre-conditioned residual vector
+				grb::Vector< IOType > & x = cg_data.x;              // initial (and final) solution
+				const grb::Vector< InputType > & b = cg_data.b;     // right-side value
+				grb::Vector< IOType > & p = cg_data.p;              // direction vector
+				grb::Vector< IOType > & Ap = cg_data.u;             // temp vector
+				grb::RC ret = SUCCESS;
+
+				const IOType io_zero = ring.template getZero< IOType >();
+				ret = ret ? ret : grb::set( Ap, io_zero );
+				ret = ret ? ret : grb::set( r, io_zero );
+				ret = ret ? ret : grb::set( p, io_zero );
+
+				ret = ret ? ret : grb::set( p, x );
+				// Ap = A * x
+				ret = ret ? ret : grb::mxv< descr >( Ap, A, x, ring );
+				assert( ret == SUCCESS );
+				// r = b - Ap
+				ret = ret ? ret : grb::eWiseApply< descr >( r, b, Ap, minus );
+				assert( ret == SUCCESS );
+
+				const ResidualType residual_zero = ring.template getZero< ResidualType >();
+				ResidualType norm_residual = residual_zero;
+				ret = ret ? ret : grb::dot< descr >( norm_residual, r, r, ring ); // norm_residual = r' * r
+				assert( ret == SUCCESS );
+
+				norm_residual = std::sqrt( norm_residual ); // compute sqrt to avoid underflow
+
+				out_info.norm_residual = norm_residual; // initial norm of residual
+				if( ret == SUCCESS && norm_residual == residual_zero ) { // x is already exact: iterating would compute 0 / 0
+					out_info.iterations = 0;
+					return ret;
+				}
+				const ResidualType norm_residual_initial = norm_residual;
+				ResidualType old_r_dot_z = residual_zero, r_dot_z = residual_zero, beta = residual_zero;
+				size_t iter = 0;
+
+				dbg_logger << ">>> start p: " << p << std::endl;
+				dbg_logger << ">>> start Ap: " << Ap << std::endl;
+				dbg_logger << ">>> start r: " << r << std::endl;
+
+				do {
+					dbg_logger << "========= iteration " << iter << " =========" << std::endl;
+
+					if( with_preconditioning ) {
+						ret = ret ? ret : mg_runner( grid_base );
+						assert( ret == SUCCESS );
+					} else {
+						// z = r
+						ret = ret ? ret : grb::set( z, r );
+						assert( ret == SUCCESS );
+					}
+					dbg_logger << ">>> initial z: " << z << std::endl;
+
+					if( iter == 0 ) {
+						//  p = z
+						ret = ret ? ret : grb::set< descr >( p, z );
+						assert( ret == SUCCESS );
+						// r_dot_z = r' * z
+						ret = ret ? ret : grb::dot< descr >( r_dot_z, r, z, ring );
+						assert( ret == SUCCESS );
+					} else {
+						old_r_dot_z = r_dot_z;
+						// r_dot_z = r' * z
+						r_dot_z = ring.template getZero< ResidualType >();
+						ret = ret ? ret : grb::dot< descr >( r_dot_z, r, z, ring );
+						assert( ret == SUCCESS );
+
+						beta = r_dot_z / old_r_dot_z;
+						// Ap  = 0
+						ret = ret ? ret : grb::set< descr >( Ap, io_zero );
+						assert( ret == SUCCESS );
+						// Ap += beta * p
+						ret = ret ? ret : grb::eWiseMul< descr >( Ap, beta, p, ring );
+						assert( ret == SUCCESS );
+						// Ap = Ap + z
+						ret = ret ? ret : grb::eWiseApply< descr >( Ap, Ap, z, ring.getAdditiveOperator() );
+						assert( ret == SUCCESS );
+						// p = Ap
+						std::swap( Ap, p );
+					}
+					dbg_logger << ">>> middle p: " << p << std::endl;
+
+					// Ap = A * p
+					ret = ret ? ret : grb::set< descr >( Ap, io_zero );
+					ret = ret ? ret : grb::mxv< descr >( Ap, A, p, ring );
+					assert( ret == SUCCESS );
+					dbg_logger << ">>> middle Ap: " << Ap << std::endl;
+
+					// pAp = p' * Ap
+					ResidualType pAp = ring.template getZero< ResidualType >();
+					ret = ret ? ret : grb::dot< descr >( pAp, Ap, p, ring );
+					assert( ret == SUCCESS );
+
+					ResidualType alpha = r_dot_z / pAp;
+					// x += alpha * p
+					ret = ret ? ret : grb::eWiseMul< descr >( x, alpha, p, ring );
+					assert( ret == SUCCESS );
+					dbg_logger << ">>> end x: " << x << std::endl;
+
+					// r += - alpha * Ap
+					ret = ret ? ret : grb::eWiseMul< descr >( r, -alpha, Ap, ring );
+					assert( ret == SUCCESS );
+					dbg_logger << ">>> end r: " << r << std::endl;
+
+					// residual = r' * r
+					norm_residual = ring.template getZero< ResidualType >();
+					ret = ret ? ret : grb::dot< descr >( norm_residual, r, r, ring );
+					assert( ret == SUCCESS );
+
+					norm_residual = std::sqrt( norm_residual );
+
+					++iter;
+					out_info.iterations = iter;
+					out_info.norm_residual = norm_residual;
+				} while( iter < max_iterations && norm_residual / norm_residual_initial > tolerance
+					&& ret == SUCCESS );
+
+				return ret;
+			}
+		};
+
+	} // namespace algorithms
+} // namespace grb
+
+#endif // _H_GRB_ALGORITHMS_MULTIGRID_CG
diff --git a/include/graphblas/algorithms/multigrid/multigrid_data.hpp b/include/graphblas/algorithms/multigrid/multigrid_data.hpp
new file mode 100644
index 000000000..a0a76191e
--- /dev/null
+++ b/include/graphblas/algorithms/multigrid/multigrid_data.hpp
@@ -0,0 +1,101 @@
+
+/*
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file multigrid_data.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * Data structure definition to store the information of a single multi-grid level.
+ */
+
+#ifndef _H_GRB_ALGORITHMS_HPCG_DATA
+#define _H_GRB_ALGORITHMS_HPCG_DATA
+
+#include <cstddef>
+#include <vector>
+
+#include <graphblas.hpp>
+#include <graphblas/utils/telemetry/Stopwatch.hpp>
+
+namespace grb {
+
+	namespace algorithms {
+
+		/**
+		 * This data structure stores information for a \b single multi-grid level. This information
+		 * depends exclusively on the size of the underlying physical system.
+		 *
+		 * Internal ALP/GraphBLAS containers are initialized to the proper size,
+		 * but their values are \b not initialized as this depends on the specific algorithm chosen
+		 * for the multi-grid solver. Populating them is user's task.
+		 *
+		 * Objects of this type cannot be copied (copy constructor and copy assignment are deleted).
+		 *
+		 * @tparam IOType Type of values of the vectors for intermediate results
+		 * @tparam NonzeroType Type of the values stored inside the system matrix #A
+		 * @tparam TelControllerType type of the controller for telemetry, to compile-time (de)activate
+		 * 	the #mg_stopwatch and #sm_stopwatch stopwatches
+		 */
+		template<
+			typename IOType,
+			typename NonzeroType,
+			typename TelControllerType
+		> struct MultiGridData {
+
+			grb::utils::telemetry::Stopwatch< TelControllerType > mg_stopwatch; ///< stopwatch
+			///< to measure the execution time in MG
+			grb::utils::telemetry::Stopwatch< TelControllerType > sm_stopwatch; ///< stopwatch
+			///< to measure the execution time in the smoother
+			const size_t level; ///< level of the grid (0 for the finest physical system)
+			const size_t system_size; ///< size of the system, i.e. side of the #A system matrix
+			grb::Matrix< NonzeroType > A; ///< system matrix
+			grb::Vector< IOType > z; ///< multi-grid solution
+			grb::Vector< IOType > r; ///< residual
+
+			/**
+			 * Construct a new multigrid data object from level information and system size: the matrix #A
+			 */
+			MultiGridData( // (continued) is built with sys_size x sys_size, the vectors #z and #r with sys_size
+				const TelControllerType & _tt,
+				size_t _level,
+				size_t sys_size
+			) :
+				mg_stopwatch( _tt ),
+				sm_stopwatch( _tt ),
+				level( _level ),
+				system_size( sys_size ),
+				A( sys_size, sys_size ),
+				z( sys_size ),
+				r( sys_size ) {}
+
+			// for safety, disable copy semantics
+			MultiGridData( const MultiGridData< IOType, NonzeroType, TelControllerType > & o ) = delete;
+
+			MultiGridData< IOType, NonzeroType, TelControllerType > & operator=(
+					const MultiGridData< IOType, NonzeroType, TelControllerType > & ) = delete;
+
+			grb::RC init_vectors( IOType zero ) { // sets both z and r to the given value
+				grb::RC rc = grb::set( z, zero );
+				rc = rc ? rc : grb::set( r, zero ); // skipped if the previous set() failed
+				return rc;
+			}
+		};
+
+	} // namespace algorithms
+
+} // namespace grb
+
+#endif // _H_GRB_ALGORITHMS_HPCG_DATA
diff --git a/include/graphblas/algorithms/multigrid/multigrid_v_cycle.hpp b/include/graphblas/algorithms/multigrid/multigrid_v_cycle.hpp
new file mode 100644
index 000000000..bd9a393a4
--- /dev/null
+++ b/include/graphblas/algorithms/multigrid/multigrid_v_cycle.hpp
@@ -0,0 +1,240 @@
+
+/*
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file multigrid_v_cycle.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * This file contains the routines for multi-grid solution refinement, including the main routine
+ *        and those for coarsening and refinement of the tentative solution.
+ */
+
+#ifndef _H_GRB_ALGORITHMS_MULTIGRID_V_CYCLE
+#define _H_GRB_ALGORITHMS_MULTIGRID_V_CYCLE
+
+#include <cassert>
+#include <memory>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include <graphblas.hpp>
+#include <graphblas/utils/iterators/IteratorValueAdaptor.hpp>
+#include <graphblas/utils/telemetry/OutputStream.hpp>
+
+#include "multigrid_data.hpp"
+
+
+namespace grb {
+	namespace algorithms {
+
+		/**
+		 * Callable object to invoke the V-cycle multi-grid algorithm, which also requires
+		 * a smoother and a coarsener object.
+		 *
+		 * The runner does \b not take ownership of the smoother and of the coarsener: it only
+		 * stores references to them, hence both objects must outlive the runner. The grid
+		 * levels are instead owned by the runner via #system_levels, which no constructor
+		 * populates: it must be filled before the first run.
+		 *
+		 * @tparam MGTypes types container for algebraic information (IOType, NonzeroType, Ring, Minus)
+		 * @tparam MGSmootherType type of the smoother runner, with prescribed methods for the various
+		 *  smoothing steps
+		 * @tparam CoarsenerType type of the coarsener runner, with prescribed methods for coarsening
+		 * @tparam TelControllerType type of the telemetry controller, which selects the MultiGridData
+		 *  type storing each level
+		 * @tparam descr descriptors with statically-known data for computation and containers
+		 * @tparam DbgOutputStreamType type for the debugging stream, i.e. the stream to trace simulation
+		 * 	results alongside execution; the default type #grb::utils::telemetry::OutputStreamOff disables
+		 * 	all output at compile time
+		 */
+		template<
+			typename MGTypes,
+			typename MGSmootherType,
+			typename CoarsenerType,
+			typename TelControllerType,
+			Descriptor descr = descriptors::no_operation,
+			typename DbgOutputStreamType = grb::utils::telemetry::OutputStreamOff
+		> struct MultiGridRunner {
+
+			// type of this runner: all template parameters must be listed, or self_t would
+			// name a different type whenever a non-default DbgOutputStreamType is used
+			using self_t = MultiGridRunner< MGTypes, MGSmootherType, CoarsenerType, TelControllerType,
+				descr, DbgOutputStreamType >;
+			// algebraic types
+			using IOType = typename MGTypes::IOType;
+			using NonzeroType = typename MGTypes::NonzeroType;
+			using Ring = typename MGTypes::Ring;
+			using Minus = typename MGTypes::Minus;
+			using MultiGridInputType = MultiGridData< IOType, NonzeroType, TelControllerType >;
+			// runners
+			using SmootherRunnerType = MGSmootherType;
+			using CoarsenerRunnerType = CoarsenerType;
+
+			static_assert( std::is_default_constructible< Ring >::value,
+				"cannot construct the Ring with default values" );
+			static_assert( std::is_default_constructible< Minus >::value,
+				"cannot construct the Minus operator with default values" );
+
+			// check the interface between the smoother and MG match
+			static_assert( std::is_base_of< typename MGSmootherType::SmootherInputType, MultiGridInputType >::value,
+				"input type of the Smoother kernel must match the input from Multi-Grid" );
+
+			MGSmootherType & smoother_runner; ///< object to run the smoother (not owned)
+			CoarsenerType & coarsener_runner; ///< object to run the coarsener (not owned)
+			DbgOutputStreamType dbg_logger;   ///< logger to trace execution
+
+			std::vector< std::unique_ptr< MultiGridInputType > > system_levels; ///< levels of the grid (finest first)
+			Ring ring; ///< algebraic ring
+			Minus minus; ///< minus operator
+
+			// functor to extract the raw pointer out of an std::unique_ptr object
+			struct __extractor {
+				MultiGridInputType * operator()(
+					typename std::vector< std::unique_ptr< MultiGridInputType > >::reference & ref ) {
+					return ref.get();
+				}
+
+				const MultiGridInputType * operator()(
+					typename std::vector< std::unique_ptr< MultiGridInputType > >::const_reference & ref ) const {
+					return ref.get();
+				}
+			};
+
+			// iterator over #system_levels whose values are adapted via __extractor
+			using __unique_ptr_extractor = grb::utils::IteratorValueAdaptor<
+				typename std::vector< std::unique_ptr< MultiGridInputType > >::iterator, __extractor >;
+
+			/**
+			 * Construct a new MultiGridRunner object that references the pre-built
+			 * smoother and coarsener, which must outlive this object.
+			 *
+			 * The debug logger is default-constructed.
+			 */
+			MultiGridRunner(
+				MGSmootherType & _smoother_runner,
+				CoarsenerType & _coarsener_runner
+			) :
+				smoother_runner( _smoother_runner ),
+				coarsener_runner( _coarsener_runner )
+			{
+				// checked here, so that it triggers only if this constructor is actually used;
+				// the message is mandatory before C++17
+				static_assert( std::is_default_constructible< DbgOutputStreamType >::value,
+					"cannot construct the debug logger with default values" );
+			}
+
+			/**
+			 * Construct a new MultiGridRunner object that references the pre-built
+			 * smoother and coarsener, which must outlive this object, and stores
+			 * a copy of the user-given debug logger.
+			 */
+			MultiGridRunner(
+				MGSmootherType & _smoother_runner,
+				CoarsenerType & _coarsener_runner,
+				DbgOutputStreamType & _dbg_logger
+			) :
+				smoother_runner( _smoother_runner ),
+				coarsener_runner( _coarsener_runner ),
+				dbg_logger( _dbg_logger ) {}
+
+			/**
+			 * Operator to invoke a full multi-grid run starting from the given level.
+			 *
+			 * Only \p system.level is read from \p system: the run operates on the levels
+			 * stored inside #system_levels, from position \p system.level to the last one.
+			 */
+			inline grb::RC operator()( MultiGridInputType & system ) {
+				// the level must identify one of the stored levels
+				assert( system.level < system_levels.size() );
+				return this->operator()( __unique_ptr_extractor( system_levels.begin() + system.level ),
+					__unique_ptr_extractor( system_levels.end() ) );
+			}
+
+			/**
+			 * Operator to invoke a multi-grid run among given levels.
+			 *
+			 * The run is timed via the \a mg_stopwatch of the level pointed to by \p begin,
+			 * hence the range must not be empty.
+			 */
+			inline grb::RC operator()(
+				__unique_ptr_extractor begin,
+				const __unique_ptr_extractor end
+			) {
+				assert( begin != end );
+				begin->mg_stopwatch.start();
+				grb::RC ret = multi_grid( begin, end );
+				begin->mg_stopwatch.stop();
+				return ret;
+			}
+
+			/**
+			 * Multi-grid V cycle implementation to refine a given solution.
+			 *
+			 * A full multi-grid run goes through the following steps:
+			 *
+			 * 0. resets the solution \p mgiter_begin->z to zero: any previous content is discarded
+			 * 1. calls the pre-smoother to improve on the solution stored into \p mgiter_begin->z
+			 * 2. coarsens the residual vector
+			 * 3. recursively solves the coarser system
+			 * 4. prolongs the coarser solution into the \p mgiter_begin->z
+			 * 5. further smooths the solution with a post-smoother call
+			 *
+			 * The algorithm moves across grid levels via the STL-like iterators \p mgiter_begin
+			 * and \p mgiter_end and accesses the grid data via the former (using the operator \c * ):
+			 * when the level pointed to by \p mgiter_begin is the last one before \p mgiter_end,
+			 * only step 0 and a non-recursive smoothing round are performed and the recursion halts.
+			 *
+			 * Failures of GraphBLAS operations are handled by skipping all the following operations
+			 * and returning the failure code.
+			 *
+			 * @param mgiter_begin iterator pointing to the current level of the multi-grid; it must
+			 *  differ from \p mgiter_end
+			 * @param mgiter_end end iterator, indicating the end of the recursion
+			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code
+			 *  of the first unsuccessful operation otherwise
+			 */
+			grb::RC multi_grid(
+				__unique_ptr_extractor mgiter_begin,
+				const __unique_ptr_extractor mgiter_end
+			) {
+				RC ret = SUCCESS;
+				assert( mgiter_begin != mgiter_end );
+				MultiGridInputType & finer_system = *mgiter_begin;
+				// from here on, mgiter_begin points to the coarser level (if any)
+				++mgiter_begin;
+
+				dbg_logger << "mg BEGINNING {" << std::endl;
+
+				// clean destination vector
+				ret = ret ? ret : grb::set< descr >( finer_system.z, ring.template getZero< IOType >() );
+				dbg_logger << ">>> initial r: " << finer_system.r << std::endl;
+
+				if( ! ( mgiter_begin != mgiter_end ) ) {
+					// coarsest level: run the non-recursive smoother and return
+					ret = ret ? ret : smoother_runner.nonrecursive_smooth( finer_system );
+					assert( ret == SUCCESS );
+					dbg_logger << ">>> smoothed z: " << finer_system.z << std::endl;
+					dbg_logger << "} mg END" << std::endl;
+					return ret;
+				}
+				MultiGridInputType & coarser_system = *mgiter_begin;
+
+				// pre-smoother
+				ret = ret ? ret : smoother_runner.pre_smooth( finer_system );
+				assert( ret == SUCCESS );
+				dbg_logger << ">>> pre-smoothed z: " << finer_system.z << std::endl;
+
+				ret = ret ? ret : coarsener_runner.coarsen_residual( finer_system, coarser_system );
+				assert( ret == SUCCESS );
+				dbg_logger << ">>> coarse r: " << coarser_system.r << std::endl;
+
+				// recursion via operator(), so that the coarser level is timed as well
+				ret = ret ? ret : this->operator()( mgiter_begin, mgiter_end );
+				assert( ret == SUCCESS );
+
+				ret = ret ? ret : coarsener_runner.prolong_solution( coarser_system, finer_system );
+				assert( ret == SUCCESS );
+				dbg_logger << ">>> prolonged z: " << finer_system.z << std::endl;
+
+				// post-smoother
+				ret = ret ? ret : smoother_runner.post_smooth( finer_system );
+				assert( ret == SUCCESS );
+				dbg_logger << ">>> post-smoothed z: " << finer_system.z << std::endl;
+				dbg_logger << "} mg END" << std::endl;
+
+				return ret;
+			}
+		};
+
+	} // namespace algorithms
+} // namespace grb
+
+#endif // _H_GRB_ALGORITHMS_MULTIGRID_V_CYCLE
diff --git a/include/graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp b/include/graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp
new file mode 100644
index 000000000..3b558e9f1
--- /dev/null
+++ b/include/graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp
@@ -0,0 +1,244 @@
+
+/*
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file red_black_gauss_seidel.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * Contains the routines to perform a forward-backward pass of a Red-Black Gauss-Seidel smoother.
+ */
+
+#ifndef _H_GRB_ALGORITHMS_RED_BLACK_GAUSS_SEIDEL
+#define _H_GRB_ALGORITHMS_RED_BLACK_GAUSS_SEIDEL
+
+#include <cassert>
+#include <memory>
+#include <type_traits>
+#include <vector>
+
+#include <graphblas.hpp>
+
+#include "multigrid_data.hpp"
+
+namespace grb {
+	namespace algorithms {
+
+		/**
+		 * Per-level data needed by the RBGS smoother: one object of this type describes
+		 * a single level of the multi-grid.
+		 */
+		template< typename IOType > struct SmootherData {
+
+			/// diagonal of the system matrix of this level; only sized at construction
+			grb::Vector< IOType > A_diagonal;
+			/// storage for the smoother's intermediate results
+			grb::Vector< IOType > smoother_temp;
+			/// color masks; empty after construction
+			std::vector< grb::Vector< bool > > color_masks;
+
+			/**
+			 * Builds the smoother data for a level of size \p level_size: both vectors
+			 * are created with this size, while no color mask is added.
+			 */
+			SmootherData( size_t level_size ) :
+				A_diagonal( level_size ),
+				smoother_temp( level_size ) {}
+
+			// copy semantics are disabled for safety
+			SmootherData( const SmootherData & ) = delete;
+
+			SmootherData & operator=( const SmootherData & ) = delete;
+
+			/**
+			 * Sets all the entries of #smoother_temp to \p zero.
+			 *
+			 * @return the code returned by grb::set()
+			 */
+			grb::RC init_vectors( IOType zero ) {
+				const grb::RC rc = grb::set( smoother_temp, zero );
+				return rc;
+			}
+		};
+
+		/**
+		 * Runner object for the RBGS smoother, with multiple methods for each type of smoothing step:
+		 * pre-, post- and non-recursive, as invoked during a full run of a multi-grid V-cycle.
+		 *
+		 * It stores the information to smooth each level of the grid, to be initialized separately.
+		 *
+		 * @tparam SmootherTypes container of algebraic types for the smoother (IOType, NonzeroType,
+		 *  Ring, Minus, Divide)
+		 * @tparam TelControllerType telemetry controller to (de)activate time tracing within passed MultiGridData objects
+		 * @tparam descr descriptors with statically-known data for computation and containers
+		 */
+		template<
+			class SmootherTypes,
+			typename TelControllerType,
+			Descriptor descr = descriptors::no_operation
+		> struct RedBlackGSSmootherRunner {
+
+			// algebraic types
+			using IOType = typename SmootherTypes::IOType;
+			using NonzeroType = typename SmootherTypes::NonzeroType;
+			using Ring = typename SmootherTypes::Ring;
+			using Minus = typename SmootherTypes::Minus;
+			using Divide = typename SmootherTypes::Divide;
+			using SmootherInputType = MultiGridData< IOType, NonzeroType, TelControllerType >; ///< external
+			///< input structure
+			using SmootherDataType = SmootherData< IOType >; ///< smoothing information
+			///< and temporary variables (per MG level)
+
+			size_t presmoother_steps = 1UL; ///< number of pre-smoother steps
+			size_t postsmoother_steps = 1UL; ///< number of post-smoother steps
+			size_t non_recursive_smooth_steps = 1UL; ///< number of smoother steps for the last grid level
+			std::vector< std::unique_ptr< SmootherDataType > > levels; ///< for each grid level,
+			///< the smoothing data (finest first)
+			Ring ring; ///< the algebraic ring
+			Minus minus; ///< the minus operator
+			Divide divide; ///< the division operator
+
+			static_assert( std::is_default_constructible< Ring >::value,
+				"cannot construct the Ring operator with default values" );
+			static_assert( std::is_default_constructible< Minus >::value,
+				"cannot construct the Minus operator with default values" );
+			static_assert( std::is_default_constructible< Divide >::value,
+				"cannot construct the Divide operator with default values" );
+
+			/** Runs #presmoother_steps smoothing iterations on \p data. */
+			inline grb::RC pre_smooth( SmootherInputType & data ) {
+				return run_smoother( data, presmoother_steps );
+			}
+
+			/** Runs #postsmoother_steps smoothing iterations on \p data. */
+			inline grb::RC post_smooth( SmootherInputType & data ) {
+				return run_smoother( data, postsmoother_steps );
+			}
+
+			/** Runs #non_recursive_smooth_steps smoothing iterations on \p data. */
+			inline grb::RC nonrecursive_smooth( SmootherInputType & data ) {
+				return run_smoother( data, non_recursive_smooth_steps );
+			}
+
+		protected:
+			/**
+			 * Runs \p smoother_steps iterations of the Red-Black Gauss-Seidel smoother,
+			 * with inputs and outputs stored inside \p data.
+			 *
+			 * This is an internal method called by all user-facing methods, because this specific
+			 * smoother performs all smoothing steps the same way.
+			 *
+			 * The smoothing information is taken from #levels at position \p data.level, via
+			 * a checked access; the run is timed via \p data.sm_stopwatch and stops at the first
+			 * failing iteration.
+			 *
+			 * @return grb::RC::SUCCESS if all iterations succeeded, the error code of the first
+			 *  unsuccessful one otherwise
+			 */
+			grb::RC run_smoother(
+				SmootherInputType & data,
+				const size_t smoother_steps
+			) {
+				RC ret = SUCCESS;
+
+				SmootherDataType & smoothing_info = *( levels.at( data.level ).get() );
+
+				data.sm_stopwatch.start();
+				for( size_t i = 0; i < smoother_steps && ret == SUCCESS; i++ ) {
+					ret = red_black_gauss_seidel( data, smoothing_info );
+					assert( ret == SUCCESS );
+				}
+				data.sm_stopwatch.stop();
+				return ret;
+			}
+
+			/**
+			 * Runs a single step of Red-Black Gauss-Seidel for a specific color.
+			 *
+			 * @param[in,out] data structure with external containers, corresponding to an MG level:
+			 * 	vector to smooth, system matrix, residual
+			 * @param[in,out] smoothing_info smoothing-specific information: temporary vectors, color masks
+			 * @param[in] color index of the color mask to use inside \p smoothing_info.color_masks
+			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
+			 *  unsuccessful operation otherwise; after a failure, the following operations are skipped
+			 */
+			grb::RC red_black_gauss_seidel_single_step(
+				SmootherInputType & data,
+				SmootherDataType & smoothing_info,
+				size_t color
+			) {
+				const grb::Matrix< NonzeroType > & A = data.A;
+				const grb::Vector< IOType > & A_diagonal = smoothing_info.A_diagonal;
+				const grb::Vector< IOType > & r = data.r;
+				grb::Vector< IOType > & z = data.z;
+				grb::Vector< IOType > & smoother_temp = smoothing_info.smoother_temp;
+				const grb::Vector< bool > & color_mask = smoothing_info.color_masks[ color ];
+
+				// smoother_temp[color_mask] = A[color_mask] * z[color_mask]
+				// use the structural descriptors, assuming ONLY the values of the current color are set
+				// note that if this assumption does not hold, also the following eWiseLambda() is wrong
+				RC ret = grb::mxv< grb::descriptors::safe_overlap | grb::descriptors::structural >(
+					smoother_temp, color_mask, A, z, ring );
+				assert( ret == SUCCESS );
+
+				// TODO internal issue #201
+				// Replace below with masked calls:
+				// z[mask] = r[mask] - smoother_temp[mask] + z[mask] .* diagonal[mask]
+				// z[mask] = z[mask] ./ diagonal[mask]
+
+// by default use foldl()'s, although eWiseLambda() might be more performing
+// TODO: leave this choice for future experimentation
+#if defined(RBGS_EWL)
+				Ring & ri = ring;
+				Minus & mi = minus;
+				Divide & di = divide;
+
+				ret = ret ? ret :
+					grb::eWiseLambda(
+						[ &z, &r, &smoother_temp, &color_mask, &A_diagonal ,
+							&ri, &mi, &di ]( const size_t i ) {
+							IOType d = A_diagonal[ i ];
+							IOType v;
+							ri.getMultiplicativeOperator().apply( z[ i ], d, v  );
+							ri.getAdditiveOperator().apply( v, r[ i ], v  );
+							mi.apply( v, smoother_temp[ i ], v );
+							di.apply( v, d, z[ i ] );
+						},
+						color_mask, z, r, smoother_temp, A_diagonal );
+#else
+				// each step runs only if all the previous ones succeeded, and its
+				// return code is recorded instead of being discarded
+				ret = ret ? ret : grb::foldl( z, color_mask, A_diagonal, ring.getMultiplicativeOperator() );
+				ret = ret ? ret : grb::foldl( z, color_mask, smoother_temp, minus );
+				ret = ret ? ret : grb::foldl( z, color_mask, r, ring.getAdditiveOperator() );
+				ret = ret ? ret : grb::foldl( z, color_mask, A_diagonal, divide );
+#endif
+				assert( ret == SUCCESS );
+				return ret;
+			}
+
+			/**
+			 * Runs a single forward and backward pass of Red-Black Gauss-Seidel smoothing
+			 * on the system stored in \p data.
+			 *
+			 * This routine performs a forward and a backward step of Red-Black Gauss-Seidel for each color
+			 * stored in \p smoothing_info.color_masks. Colors stored inside this container
+			 * <b>are assumed to be mutually exclusive and to cover all rows of the solution vector</b>,
+			 * and no check is performed to ensure these assumptions hold. Hence, it is up to user logic
+			 * to pass correct coloring information. Otherwise, \b no guarantees hold on the result.
+			 *
+			 * The pass stops at the first failing operation: no further color is processed.
+			 *
+			 * @param[in,out] data structure with external containers, corresponding to an MG level:
+			 * 	vector to smooth, system matrix, residual
+			 * @param[in,out] smoothing_info smoothing-specific information: temporary vectors, color masks
+			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
+			 *                          unsuccessful operation otherwise
+			 */
+			grb::RC red_black_gauss_seidel(
+				SmootherInputType & data,
+				SmootherDataType & smoothing_info
+			) {
+				const size_t num_colors = smoothing_info.color_masks.size();
+
+				// zero the temp output just once, assuming proper masking avoids
+				// interference among different colors
+				RC ret = grb::set< descr >( smoothing_info.smoother_temp,
+					ring.template getZero< IOType >() );
+
+				// forward step: do not overwrite a previous error and stop at the first failure
+				for( size_t color = 0; ret == SUCCESS && color < num_colors; ++color ) {
+					ret = red_black_gauss_seidel_single_step( data, smoothing_info, color );
+				}
+				ret = ret ? ret : grb::set< descr >( smoothing_info.smoother_temp,
+					ring.template getZero< IOType >() );
+
+				// backward step, with the colors in reverse order
+				for( size_t color = num_colors; ret == SUCCESS && color > 0; --color ) {
+					ret = red_black_gauss_seidel_single_step( data, smoothing_info, color - 1 );
+				}
+				return ret;
+			}
+		};
+
+	} // namespace algorithms
+} // namespace grb
+
+#endif // _H_GRB_ALGORITHMS_RED_BLACK_GAUSS_SEIDEL
diff --git a/include/graphblas/algorithms/multigrid/single_matrix_coarsener.hpp b/include/graphblas/algorithms/multigrid/single_matrix_coarsener.hpp
new file mode 100644
index 000000000..40f8163f5
--- /dev/null
+++ b/include/graphblas/algorithms/multigrid/single_matrix_coarsener.hpp
@@ -0,0 +1,197 @@
+
+/*
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file single_matrix_coarsener.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * Implementation of a coarsener using the same matrix for both coarsening and prolongation.
+ */
+
+#ifndef _H_GRB_ALGORITHMS_HPCG_SINGLE_MATRIX_COARSENER
+#define _H_GRB_ALGORITHMS_HPCG_SINGLE_MATRIX_COARSENER
+
+#include <memory>
+#include <vector>
+
+#include <graphblas.hpp>
+
+#include "multigrid_data.hpp"
+
+namespace grb {
+	namespace algorithms {
+
+		/**
+		 * Data needed by the coarsener to move between a finer and a coarser level.
+		 */
+		template<
+			typename IOType,
+			typename NonzeroType
+		> struct CoarseningData {
+
+			/// matrix with as many rows as the coarser size and as many columns as the
+			/// finer size, to coarsen a vector of the finer size into one of the coarser size
+			grb::Matrix< NonzeroType > coarsening_matrix;
+			/// vector of the finer size, for intermediate computations
+			grb::Vector< IOType > Ax_finer;
+
+			/**
+			 * Builds the coarsening data by constructing the internal containers
+			 * with the given sizes.
+			 *
+			 * @param[in] _finer_size  size of the finer system, i.e. size of external objects \b before coarsening
+			 * @param[in] coarser_size size of the current system, i.e. size \b after coarsening
+			 */
+			CoarseningData(
+				size_t _finer_size,
+				size_t coarser_size
+			) :
+				coarsening_matrix( coarser_size, _finer_size ),
+				Ax_finer( _finer_size ) {}
+
+			/**
+			 * Sets all the entries of #Ax_finer to \p zero.
+			 *
+			 * @return the code returned by grb::set()
+			 */
+			grb::RC init_vectors( IOType zero ) {
+				const grb::RC rc = grb::set( Ax_finer, zero );
+				return rc;
+			}
+		};
+
+		/**
+		 * Runner structure, holding the data to coarsen the levels of a multi-grid simulation.
+		 *
+		 * This coarsener just uses the same matrix to perform the coarsening (via an mxv())
+		 * and the prolongation, using it transposed.
+		 *
+		 * @tparam CoarsenerTypes container of algebraic types (IOType, NonzeroType, Ring, Minus)
+		 * @tparam TelControllerType telemetry controller type, which selects the MultiGridData
+		 *  type accepted in input
+		 * @tparam descr descriptors with statically-known data for computation and containers
+		 */
+		template<
+			class CoarsenerTypes,
+			typename TelControllerType,
+			Descriptor descr = descriptors::no_operation
+		> struct SingleMatrixCoarsener {
+
+			// algebraic types
+			using IOType = typename CoarsenerTypes::IOType;
+			using NonzeroType = typename CoarsenerTypes::NonzeroType;
+			using Ring = typename CoarsenerTypes::Ring;
+			using Minus = typename CoarsenerTypes::Minus;
+
+			using MultiGridInputType = MultiGridData< IOType, NonzeroType, TelControllerType >; ///< input data from MG
+			using CoarseningDataType = CoarseningData< IOType, NonzeroType >; ///< internal data
+			///< with coarsening information
+
+			static_assert( std::is_default_constructible< Ring >::value,
+				"cannot construct the Ring with default values" );
+			static_assert( std::is_default_constructible< Minus >::value,
+				"cannot construct the Minus operator with default values" );
+
+			/**
+			 * Data to coarsen each level, from finer to coarser: the entry at position \a l
+			 * is used to move between the level \a l (the finer one) and the following one.
+			 */
+			std::vector< std::unique_ptr< grb::algorithms::CoarseningData< IOType, NonzeroType > > > coarsener_levels;
+			Ring ring; ///< algebraic ring
+			Minus minus; ///< minus operator
+
+			/**
+			 * Method required by MultiGridRunner before the recursive call, to coarsen
+			 * the residual vector of \p finer (the finer system) into the residual of
+			 * \p coarser (the coarser system).
+			 *
+			 * The coarsening data is taken from #coarsener_levels at position \p finer.level.
+			 */
+			inline grb::RC coarsen_residual(
+				const MultiGridInputType & finer,
+				MultiGridInputType & coarser
+			) {
+				assert( finer.level < coarsener_levels.size() );
+				CoarseningData< IOType, NonzeroType > & coarsener = *coarsener_levels[ finer.level ];
+				// first compute Ax_finer = A * z on the finer system
+				grb::RC ret = grb::set< descr >( coarsener.Ax_finer, ring.template getZero< IOType >() );
+				ret = ret ? ret : grb::mxv< descr >( coarsener.Ax_finer, finer.A, finer.z, ring );
+
+				// then coarsen r - A * z into the coarser residual
+				return ret ? ret : compute_coarsening( finer.r, coarser.r, coarsener );
+			}
+
+			/**
+			 * Method required by MultiGridRunner after the recursive call, to "prolong" the coarser solution
+			 * into the finer solution.
+			 *
+			 * The coarsening data is taken from #coarsener_levels at position \p finer.level.
+			 */
+			inline grb::RC prolong_solution(
+				const MultiGridInputType & coarser,
+				MultiGridInputType & finer
+			) {
+				assert( finer.level < coarsener_levels.size() );
+				return compute_prolongation( coarser.z, finer.z, *coarsener_levels[ finer.level ] );
+			}
+
+		protected:
+			/**
+			 * computes the coarser residual vector \p r_coarse by coarsening
+			 *        \p r_fine - \p coarsening_data.Ax_finer via \p coarsening_data.coarsening_matrix.
+			 *
+			 * On output, \p coarsening_data.Ax_finer is overwritten with the difference
+			 * \p r_fine - \p coarsening_data.Ax_finer.
+			 *
+			 * @param[in] r_fine fine residual vector
+			 * @param[out] r_coarse coarse residual vector, the output
+			 * @param[in,out] coarsening_data \ref CoarseningData data structure storing the information for coarsening
+			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
+			 *                          unsuccessful operation otherwise
+			 */
+			grb::RC compute_coarsening(
+				const grb::Vector< IOType > & r_fine,
+				grb::Vector< IOType > & r_coarse,
+				CoarseningData< IOType, NonzeroType > & coarsening_data
+			) {
+				RC ret = SUCCESS;
+				ret = ret ? ret : grb::eWiseApply< descr >( coarsening_data.Ax_finer, r_fine,
+					coarsening_data.Ax_finer, minus ); // Ax_finer = r_fine - Ax_finer
+				assert( ret == SUCCESS );
+
+				// actual coarsening, from the finer size (the columns of the coarsening matrix)
+				// to the coarser size (its rows)
+				ret = ret ? ret : grb::set< descr >( r_coarse, ring.template getZero< IOType >() );
+				ret = ret ? ret : grb::mxv< descr >( r_coarse, coarsening_data.coarsening_matrix,
+					coarsening_data.Ax_finer, ring ); // r_coarse = coarsening_matrix * Ax_finer
+				return ret;
+			}
+
+			/**
+			 * computes the prolongation of the coarser solution \p z_coarse and adds it to
+			 * \p z_fine.
+			 *
+			 * For prolongation, this function uses the matrix \p coarsening_data.coarsening_matrix by transposing it.
+			 * The vector \p coarsening_data.Ax_finer is overwritten with the prolonged solution.
+			 *
+			 * @param[in] z_coarse input coarse solution vector, to be prolonged
+			 * @param[in,out] z_fine the fine solution vector the prolonged solution is added to
+			 * @param[in,out] coarsening_data information for coarsening
+			 * @return grb::RC::SUCCESS if the algorithm could correctly terminate, the error code of the first
+			 * unsuccessful operation otherwise
+			 */
+			grb::RC compute_prolongation(
+				const grb::Vector< IOType > & z_coarse,
+				grb::Vector< IOType > & z_fine, // fine solution
+				grb::algorithms::CoarseningData< IOType, NonzeroType > & coarsening_data
+			) {
+				RC ret = SUCCESS;
+				// actual refining, from the coarser size (the rows of the coarsening matrix)
+				// to the finer size (its columns), i.e. the size of z_fine
+				ret = ret ? ret : grb::set< descr >( coarsening_data.Ax_finer,
+					ring.template getZero< IOType >() );
+
+				// Ax_finer = coarsening_matrix^T * z_coarse
+				ret = ret ? ret : grb::mxv< descr | grb::descriptors::transpose_matrix >(
+					coarsening_data.Ax_finer, coarsening_data.coarsening_matrix, z_coarse, ring );
+				assert( ret == SUCCESS );
+
+				ret = ret ? ret : grb::foldl< descr >( z_fine, coarsening_data.Ax_finer,
+					ring.getAdditiveMonoid() ); // z_fine += Ax_finer;
+				assert( ret == SUCCESS );
+				return ret;
+			}
+		};
+
+	} // namespace algorithms
+} // namespace grb
+
+#endif // _H_GRB_ALGORITHMS_HPCG_SINGLE_MATRIX_COARSENER
diff --git a/include/graphblas/nonblocking/matrix.hpp b/include/graphblas/nonblocking/matrix.hpp
index 251e2037d..5554d78ae 100644
--- a/include/graphblas/nonblocking/matrix.hpp
+++ b/include/graphblas/nonblocking/matrix.hpp
@@ -50,7 +50,6 @@
 #include <graphblas/utils/DMapper.hpp>
 #include <graphblas/type_traits.hpp>
 
-#include <graphblas/algorithms/hpcg/ndim_matrix_builders.hpp>
 #include <graphblas/utils/iterators/utils.hpp>
 
 #include <graphblas/reference/NonzeroWrapper.hpp>
diff --git a/include/graphblas/reference/vector.hpp b/include/graphblas/reference/vector.hpp
index f0db908b2..e6167a868 100644
--- a/include/graphblas/reference/vector.hpp
+++ b/include/graphblas/reference/vector.hpp
@@ -478,18 +478,19 @@ namespace grb {
 
 			// perform straight copy
 			fwd_iterator it = start;
-			for( size_t i = 0; start != end && i < _coordinates.size(); ++i ) {
+			for( size_t i = 0; it != end && i < _coordinates.size(); ++i ) {
 				// flag coordinate as assigned
 				if( _coordinates.assign( i ) ) {
 					if( descr & descriptors::no_duplicates ) {
 						return ILLEGAL;
 					}
 					// nonzero already existed, so fold into existing one
-					foldl( _raw[ i ], *it++, dup );
+					foldl( _raw[ i ], *it, dup );
 				} else {
 					// new nonzero, so overwrite
-					_raw[ i ] = static_cast< D >( *it++ );
+					_raw[ i ] = static_cast< D >( *it );
 				}
+				++it;
 			}
 
 			// write back final position
@@ -538,7 +539,9 @@ namespace grb {
 			nnz_iterator nnz = nnz_start;
 			ind_iterator ind = ind_start;
 			while( nnz != nnz_end || ind != ind_end ) {
-				const size_t i = static_cast< size_t >( *ind++ );
+				const size_t i = static_cast< size_t >( *ind );
+				++ind;
+
 				// sanity check
 				if( i >= _coordinates.size() ) {
 					return MISMATCH;
@@ -547,10 +550,11 @@ namespace grb {
 					if( descr & descriptors::no_duplicates ) {
 						return ILLEGAL;
 					}
-					foldl( _raw[ i ], *nnz++, dup );
+					foldl( _raw[ i ], *nnz, dup );
 				} else {
-					_raw[ i ] = static_cast< D >( *nnz++ );
+					_raw[ i ] = static_cast< D >( *nnz );
 				}
+				++nnz;
 			}
 
 			// done
diff --git a/include/graphblas/utils/iterators/IteratorValueAdaptor.hpp b/include/graphblas/utils/iterators/IteratorValueAdaptor.hpp
new file mode 100644
index 000000000..ebac6ca02
--- /dev/null
+++ b/include/graphblas/utils/iterators/IteratorValueAdaptor.hpp
@@ -0,0 +1,161 @@
+
+/*
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @dir include/graphblas/utils/iterators
+ * Various utilities to work with STL-like iterators and ALP/GraphBLAS iterators:
+ * adaptors, partitioning facilities, traits and functions to check compile-time
+ * and runtime properties.
+ */
+
+/**
+ * @file IteratorValueAdaptor.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * Definition of an adaptor to extract a given value out of an iterator.
+ */
+
+#ifndef H_GRB_UTILS_ITERATOR_VALUE_ADAPTOR
+#define H_GRB_UTILS_ITERATOR_VALUE_ADAPTOR
+
+#include <type_traits>
+#include <iterator>
+#include <utility>
+
+namespace grb {
+	namespace utils {
+
+		/**
+		 * Adaptor for an iterator, to extract the value pointed to by the * operator.
+		 * It wraps an iterator under the same interface, using an object of type \a AdaptorType
+		 * to adapt the returned value.
+		 *
+		 * @tparam InnerIterType type of the underlying iterator
+		 * @tparam AdaptorType type of the adaptor, to be instantiated by default
+		 */
+		template<
+			typename InnerIterType,
+			typename AdaptorType
+		> struct IteratorValueAdaptor {
+
+			static_assert( std::is_copy_constructible< AdaptorType >::value,
+				"AdaptorType must be copy-constructible" );
+			static_assert( std::is_copy_assignable< AdaptorType >::value,
+				"AdaptorType must be copy-assignable" );
+
+			typedef typename std::decay<
+				decltype( *std::declval< AdaptorType >()( *std::declval< InnerIterType >() ) )>::type value_type;
+			typedef value_type & reference;
+			typedef value_type * pointer;
+			typedef const value_type * const_pointer;
+			typedef typename std::iterator_traits< InnerIterType >::iterator_category iterator_category;
+			typedef typename std::iterator_traits< InnerIterType >::difference_type difference_type;
+
+			static constexpr bool is_random_access = std::is_base_of<
+				std::random_access_iterator_tag, iterator_category >::value;
+
+			InnerIterType iter;
+			AdaptorType adaptor;
+
+			using SelfType = IteratorValueAdaptor< InnerIterType, AdaptorType >;
+
+			/**
+			 * Construct a new IteratorValueAdaptor object from a copy of \p _iter.
+			 * The adaptor is built via its default constructor: using this constructor
+			 * with a non-default-constructible \a AdaptorType is a compile-time error.
+			 */
+			IteratorValueAdaptor( const InnerIterType &_iter ) :
+				iter( _iter ),
+				adaptor() {}
+
+			/**
+			 * Construct a new IteratorValueAdaptor object from an iterator and an existing adaptor object.
+			 */
+			IteratorValueAdaptor(
+				const InnerIterType &_iter,
+				const AdaptorType &_adaptor
+			) :
+				iter( _iter ),
+				adaptor( _adaptor ) {}
+
+			/**
+			 * Construct a new IteratorValueAdaptor object by moving from \p _iter.
+			 * The adaptor is built via its default constructor: using this constructor
+			 * with a non-default-constructible \a AdaptorType is a compile-time error,
+			 * raised only if the constructor is actually used.
+			 *
+			 * @param _iter the underlying iterator, to be moved
+			 */
+			IteratorValueAdaptor( InnerIterType &&_iter ) :
+				iter( std::move( _iter ) ),
+				adaptor() {}
+
+			/**
+			 * Construct a new IteratorValueAdaptor object from an actual iterator
+			 * and an existing adaptor object by moving their state.
+			 */
+			IteratorValueAdaptor(
+				InnerIterType &&_iter,
+				AdaptorType &&_adaptor
+			) :
+				iter( std::move( _iter ) ),
+				adaptor( std::move( _adaptor ) ) {}
+
+			IteratorValueAdaptor() = delete;
+
+			// since it is an iterator, we MUST have copy and move semantics
+			IteratorValueAdaptor( const SelfType & ) = default;
+
+			IteratorValueAdaptor( SelfType && ) = default;
+
+			SelfType& operator=( const SelfType & ) = default;
+
+			SelfType& operator=( SelfType && ) = default;
+
+			bool operator!=( const SelfType & o ) const { return o.iter != iter; }
+
+			bool operator==( const SelfType & o ) const { return ! operator!=( o ); }
+
+			reference operator*() { return *adaptor( *iter ); }
+
+			const value_type & operator*() const { return *adaptor( *iter ); }
+
+			pointer operator->() { return adaptor( *iter ); }
+
+			const_pointer operator->() const { return adaptor( *iter ); }
+
+			SelfType& operator++() { ++iter; return *this; }
+
+			// Random-access only. The check lives in the body (instantiated on use) so
+			// that the class itself can still be instantiated with weaker iterators.
+			SelfType & operator+=( const size_t offset ) {
+				static_assert( is_random_access, "operator+= requires a random-access iterator" );
+				iter += offset;
+				return *this;
+			}
+
+			// Random-access only: distance between the two wrapped iterators.
+			// As for operator+=, the requirement is checked only when this is used.
+			difference_type operator-( const SelfType &other ) const {
+				static_assert( is_random_access, "operator- requires a random-access iterator" );
+				return iter - other.iter;
+			}
+		};
+
+	} // end namespace utils
+} // end namespace grb
+
+#endif // H_GRB_UTILS_ITERATOR_VALUE_ADAPTOR
diff --git a/include/graphblas/utils/iterators/partition_range.hpp b/include/graphblas/utils/iterators/partition_range.hpp
new file mode 100644
index 000000000..60d228b3a
--- /dev/null
+++ b/include/graphblas/utils/iterators/partition_range.hpp
@@ -0,0 +1,106 @@
+
+/*
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file partition_range.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * Definition of utilities to partition iterators across processes.
+ */
+
+#include <cstddef>
+#include <algorithm>
+#include <cassert>
+
+#ifndef H_GRB_UTILS_PARTITION_RANGE
+#define H_GRB_UTILS_PARTITION_RANGE
+
+namespace grb {
+	namespace utils {
+
+		/**
+		 * Partitions a collection across processes in chunks of ceil( \p num_elements /
+		 * \p num_procs ) elements and computes the half-open range owned by \p this_proc.
+		 * @tparam T size type
+		 * @param[in] num_procs total number of processes, must be > 0
+		 * @param[in] this_proc ID of current process
+		 * @param[in] num_elements total number of elements in the collection
+		 * @param[out] first_offset offset to the first element of the local partition
+		 * @param[out] local_size offset past the last local element (an offset, \b not a length)
+		 */
+		template< typename T > void partition_collection_size(
+				size_t num_procs,
+				size_t this_proc,
+				T num_elements,
+				T& first_offset,
+				T& local_size
+		) {
+			assert( num_procs > 0 ); // guards the divisions below
+			const T per_process = num_elements / num_procs + ( num_elements % num_procs != 0 ? 1 : 0 ); // round up, no wrap-around
+			first_offset = std::min( per_process * static_cast< T >( this_proc ), num_elements );
+			local_size = std::min( first_offset + per_process, num_elements );
+		}
+
+		/**
+		 * Narrows the iteration range [ \p begin, \p end ) to the partition assigned to
+		 * process \p this_proc out of \p num_procs processes.
+		 *
+		 * The \p num_elements elements are split into contiguous chunks via
+		 * partition_collection_size(); on return, \p begin and \p end delimit the local
+		 * chunk, which may be empty (then \p begin equals \p end). With a single process
+		 * the range is left untouched.
+		 *
+		 * Note: the number of processes and the ID of the current process are passed in input
+		 * in order not to introduce dependencies on separate code paths.
+		 *
+		 * @tparam IterT iterator type, must be random-access
+		 * @param[in] num_procs number of processes
+		 * @param[in] this_proc ID of current process, must be < \p num_procs
+		 * @param[in] num_elements number of elements of the collection, which must equal
+		 *  \code std::distance( begin, end ) \endcode
+		 * @param[in,out] begin in: start of the whole collection; out: start of the local partition
+		 * @param[in,out] end in: end of the whole collection; out: end of the local partition
+		 */
+		template< typename IterT > void partition_iteration_range_on_procs(
+			size_t num_procs,
+			size_t this_proc,
+			size_t num_elements,
+			IterT &begin,
+			IterT &end
+		) {
+			using category_t = typename std::iterator_traits< IterT >::iterator_category;
+			static_assert( std::is_base_of< std::random_access_iterator_tag, category_t >::value,
+				"the given iterator is not a random access one" );
+			assert( this_proc < num_procs );
+			assert( num_elements == static_cast< size_t >( end - begin ) );
+			if( num_procs != 1 ) {
+				size_t range_start, range_stop;
+				partition_collection_size( num_procs, this_proc, num_elements, range_start, range_stop );
+				// shrink the tail first: the new end is computed from the original begin
+				if( range_stop < num_elements ) {
+					end = begin;
+					end += range_stop;
+				}
+				if( range_start > 0 ) {
+					begin += range_start;
+				}
+			}
+		}
+
+	} // namespace utils
+} // namespace grb
+
+#endif // H_GRB_UTILS_PARTITION_RANGE
diff --git a/include/graphblas/utils/iterators/utils.hpp b/include/graphblas/utils/iterators/utils.hpp
index b56899c83..0b635578d 100644
--- a/include/graphblas/utils/iterators/utils.hpp
+++ b/include/graphblas/utils/iterators/utils.hpp
@@ -25,6 +25,8 @@
 #define _H_GRB_ITERATOR_UTILS
 
 #include <cstddef>
+#include <algorithm>
+#include <type_traits>
 
 #include <graphblas/rc.hpp>
 #include <graphblas/type_traits.hpp>
@@ -78,6 +80,28 @@ namespace grb {
 			return SUCCESS;
 		}
 
+		/**
+		 * Computes the difference \p a - \p b and returns it as the given signed
+		 * type \p DiffType.
+		 *
+		 * @throws std::range_error if \p DiffType cannot store |\p a - \p b|.
+		 */
+		template<
+			typename DiffType,
+			typename SizeType
+		> DiffType compute_signed_distance(
+			const SizeType a,
+			const SizeType b
+		) {
+			static_assert( std::is_signed< DiffType >::value, "DiffType should be signed" );
+			const SizeType diff = std::max( a, b ) - std::min( a, b );
+			// compare in a wide unsigned type: casting the limit to a narrower SizeType would truncate it
+			if( static_cast< unsigned long long >( diff ) > static_cast< unsigned long long >( std::numeric_limits< DiffType >::max() ) ) {
+				throw std::range_error( "cannot represent difference" );
+			}
+			return a >= b ? static_cast< DiffType >( diff ) : -static_cast< DiffType >( diff );
+		}
+
 	} // end namespace utils
 
 } // end namespace grb
diff --git a/include/graphblas/utils/multigrid/array_vector_storage.hpp b/include/graphblas/utils/multigrid/array_vector_storage.hpp
new file mode 100644
index 000000000..cfca1dda2
--- /dev/null
+++ b/include/graphblas/utils/multigrid/array_vector_storage.hpp
@@ -0,0 +1,111 @@
+
+/*
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file array_vector_storage.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * Extension of std::array<> exposing a larger interface and the underlying
+ * storage structure.
+ */
+
+#ifndef _H_GRB_ALGORITHMS_MULTIGRID_ARRAY_VECTOR_STORAGE
+#define _H_GRB_ALGORITHMS_MULTIGRID_ARRAY_VECTOR_STORAGE
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <stdexcept>
+
+namespace grb {
+	namespace utils {
+		namespace multigrid {
+
+			/**
+			 * Array with fixed size based on std::array with an interface compliant to what other classes
+			 * in the geometry namespace expect, like #storage() and #dimensions() methods.
+			 *
+			 * It describes a vector of dimensions #dimensions().
+			 *
+			 * @tparam DIMS the dimensions of the vector
+			 * @tparam DataType the data type of the vector elements
+			 */
+			template<
+				size_t DIMS,
+				typename DataType
+			> class ArrayVectorStorage : public std::array< DataType, DIMS > {
+			public:
+				using VectorStorageType = std::array< DataType, DIMS > &;
+				using ConstVectorStorageType = const std::array< DataType, DIMS > &;
+				using SelfType = ArrayVectorStorage< DIMS, DataType >;
+
+				/**
+				 * Construct a new Array Vector Storage object of given dimensions;
+				 * internal values are \b not initialized.
+				 *
+				 * \p _dimensions must be equal to \p DIMS, or an exception is thrown.
+				 */
+				ArrayVectorStorage( size_t _dimensions ) {
+					static_assert( DIMS > 0, "cannot allocate 0-sized array" );
+					if( _dimensions != DIMS ) {
+						throw std::invalid_argument( "given dimensions must match the type dimensions" );
+					}
+				}
+
+				ArrayVectorStorage() = delete;
+
+				// Copy-only type: there is no external storage to steal, so no move
+				// operation is declared (declaring them as deleted would make every
+				// rvalue use, e.g. returning a local by value, ill-formed): rvalues are copied.
+				ArrayVectorStorage( const SelfType & o ) noexcept {
+					std::copy_n( o.cbegin(), DIMS, this->begin() );
+				}
+
+				SelfType & operator=( const SelfType & original ) noexcept {
+					if( this != &original ) {
+						std::copy_n( original.begin(), DIMS, this->begin() );
+					}
+					return *this;
+				}
+
+				/**
+				 * Returns the geometrical dimensions of this vector, i.e. of the
+				 * geometrical space it refers to.
+				 */
+				constexpr size_t dimensions() const {
+					return DIMS;
+				}
+
+				/**
+				 * Returns a reference to the underlying storage object.
+				 */
+				inline VectorStorageType storage() {
+					return *this;
+				}
+
+				/**
+				 * Returns a const reference to the underlying storage object.
+				 */
+				inline ConstVectorStorageType storage() const {
+					return *this;
+				}
+			};
+
+		} // namespace multigrid
+	}     // namespace utils
+} // namespace grb
+
+#endif // _H_GRB_ALGORITHMS_MULTIGRID_ARRAY_VECTOR_STORAGE
diff --git a/include/graphblas/utils/multigrid/dynamic_vector_storage.hpp b/include/graphblas/utils/multigrid/dynamic_vector_storage.hpp
new file mode 100644
index 000000000..0d6250aae
--- /dev/null
+++ b/include/graphblas/utils/multigrid/dynamic_vector_storage.hpp
@@ -0,0 +1,150 @@
+
+/*
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file dynamic_vector_storage.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * Extension of a heap-allocated array exposing the underlying storage and iterators.
+ */
+
+#ifndef _H_GRB_ALGORITHMS_MULTIGRID_DYNAMIC_VECTOR_STORAGE
+#define _H_GRB_ALGORITHMS_MULTIGRID_DYNAMIC_VECTOR_STORAGE
+
+#include <algorithm>
+#include <cstddef>
+
+namespace grb {
+	namespace utils {
+		namespace multigrid {
+
+			/**
+			 * Array with fixed size (i.e. decided at object creation) allocated on the heap
+			 * with an interface compliant to what other classes in the geometry namespace expect,
+			 * like storage() and dimensions() methods.
+			 *
+			 * It describes a vector of dimensions #dimensions().
+			 *
+			 * @tparam DataType the data type of the vector elements
+			 */
+			template< typename DataType > class DynamicVectorStorage {
+
+				size_t _dimensions; // number of elements in _storage
+				DataType * _storage; // owning pointer, allocated with new[]
+
+				void clean() {
+					if( this->_storage != nullptr ) {
+						delete[] this->_storage;
+					}
+				}
+
+			public:
+				// iterator fields
+				using reference = DataType &;
+				using const_reference = const DataType &;
+				using iterator = DataType *;
+				using const_iterator = const DataType *;
+				using pointer = DataType *;
+				using const_pointer = const DataType *;
+
+				using VectorStorageType = DataType *;
+				using ConstVectorStorageType = DataType *; // NOTE(review): not pointer-to-const, storage() const exposes mutable data - confirm intended
+				using SelfType = DynamicVectorStorage< DataType >;
+
+				DynamicVectorStorage( size_t __dimensions ) : _dimensions( __dimensions ) {
+					if( __dimensions == 0 ) {
+						throw std::invalid_argument( "dimensions cannot be 0" );
+					}
+					this->_storage = new DataType[ __dimensions ];
+				}
+
+				DynamicVectorStorage() = delete;
+
+				DynamicVectorStorage( const SelfType & o ) :
+					_dimensions( o._dimensions ),
+					_storage( new DataType[ o._dimensions ] )
+				{
+					std::copy_n( o._storage, o._dimensions, this->_storage );
+				}
+
+				// move operations are intentionally not declared (rather than deleted): rvalues are copied
+				SelfType & operator=( const SelfType & original ) {
+					if( this == &original ) { return *this; }
+					if( original._dimensions != this->_dimensions ) {
+						// allocate first: if new throws, *this still owns its old, valid storage
+						DataType * const fresh = new DataType[ original._dimensions ];
+						this->clean();
+						this->_storage = fresh;
+						this->_dimensions = original._dimensions;
+					}
+					std::copy_n( original._storage, original._dimensions, this->_storage );
+					return *this;
+				}
+
+				~DynamicVectorStorage() {
+					this->clean();
+				}
+
+				size_t dimensions() const {
+					return this->_dimensions;
+				}
+
+				inline iterator begin() {
+					return this->_storage;
+				}
+
+				inline iterator end() {
+					return this->_storage + this->_dimensions;
+				}
+
+				inline const_iterator begin() const {
+					return this->_storage;
+				}
+
+				inline const_iterator end() const {
+					return this->_storage + this->_dimensions;
+				}
+
+				inline const_iterator cbegin() const {
+					return this->_storage;
+				}
+
+				inline const_iterator cend() const {
+					return this->_storage + this->_dimensions;
+				}
+
+				inline VectorStorageType storage() {
+					return this->_storage;
+				}
+
+				inline ConstVectorStorageType storage() const {
+					return this->_storage;
+				}
+
+				inline reference operator[]( size_t pos ) {
+					return *( this->_storage + pos );
+				}
+
+				inline const_reference operator[]( size_t pos ) const {
+					return *( this->_storage + pos );
+				}
+			};
+
+		} // namespace multigrid
+	}     // namespace utils
+} // namespace grb
+
+#endif // _H_GRB_ALGORITHMS_MULTIGRID_DYNAMIC_VECTOR_STORAGE
diff --git a/include/graphblas/utils/multigrid/halo_matrix_generator_iterator.hpp b/include/graphblas/utils/multigrid/halo_matrix_generator_iterator.hpp
new file mode 100644
index 000000000..ebda27890
--- /dev/null
+++ b/include/graphblas/utils/multigrid/halo_matrix_generator_iterator.hpp
@@ -0,0 +1,246 @@
+
+/*
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @dir include/graphblas/utils/multigrid
+ * This folder contains various utilities to describe an N-dimensional mesh (possibly with halo)
+ * and iterate through its elements and through the neighbors of each element, possibly generating
+ * a matrix out of this information.
+ *
+ * These facilities are used to generate system matrices and various inputs for multi-grid simulations.
+ */
+
+/**
+ * @file halo_matrix_generator_iterator.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * Definition of HaloMatrixGeneratorIterator.
+ */
+
+#ifndef _H_GRB_ALGORITHMS_MULTIGRID_HALO_MATRIX_GENRATOR_ITERATOR
+#define _H_GRB_ALGORITHMS_MULTIGRID_HALO_MATRIX_GENRATOR_ITERATOR
+
+#include <cstddef>
+
+#include "array_vector_storage.hpp"
+#include "linearized_halo_ndim_system.hpp"
+#include "linearized_ndim_iterator.hpp"
+#include "linearized_ndim_system.hpp"
+
+namespace grb {
+	namespace utils {
+		namespace multigrid {
+
+			/**
+			 * Iterator type to generate a matrix on top of the couples <element>-<neighbor> of an
+			 * \p DIMS -dimensional mesh.
+			 *
+			 * This iterator is random-access and meets the interface of an ALP/GraphBLAS
+			 * input iterator, i.e. an object of this type \a it has methods \a i(), \a j() and
+			 * \a v() to describe a nonzero triplet (row index, column index and value, respectively).
+			 *
+			 * This data structure is based on the LinearizedHaloNDimIterator class, essentially wrapping the
+			 * underlying element index as \a i() and the neighbor index as \a j(); the value \a v()
+			 * is user-customizable via a functor of type \p ValueCallable, which emits the nonzero
+			 * of type \p ValueType based on the passed values of \a i() and \a j().
+			 *
+			 * @tparam DIMS number of dimensions
+			 * @tparam CoordType type storing the coordinate and the system sizes along each dimension
+			 * @tparam ValueType type of nonzeroes
+			 * @tparam ValueCallable callable object producing the nonzero value based on \a i() and \a j()
+			 */
+			template<
+				size_t DIMS,
+				typename CoordType,
+				typename ValueType,
+				typename ValueCallable
+			> struct HaloMatrixGeneratorIterator {
+
+				static_assert( std::is_copy_constructible< ValueCallable >::value,
+					"ValueCallable must be copy-constructible" );
+
+				using RowIndexType = CoordType; ///< numeric type of rows
+				using ColumnIndexType = CoordType; ///< numeric type of columns
+				using LinearSystemType = LinearizedHaloNDimSystem< DIMS, RowIndexType >;
+				using SelfType = HaloMatrixGeneratorIterator< DIMS, CoordType, ValueType, ValueCallable >;
+				using Iterator = typename LinearSystemType::Iterator;
+
+				struct HaloPoint {
+
+					friend SelfType;
+
+					HaloPoint(
+						const ValueCallable & value_producer,
+						RowIndexType i,
+						ColumnIndexType j
+					) noexcept :
+						_value_producer( value_producer ),
+						_i( i ),
+						_j( j ) {}
+
+					HaloPoint( const HaloPoint & ) = default;
+
+					HaloPoint & operator=( const HaloPoint & ) = default;
+
+					inline RowIndexType i() const {
+						return _i;
+					}
+					inline ColumnIndexType j() const {
+						return _j;
+					}
+					inline ValueType v() const {
+						return _value_producer( _i, _j );
+					}
+
+				private:
+					ValueCallable _value_producer;
+					RowIndexType _i;
+					ColumnIndexType _j;
+				};
+
+				// interface for std::random_access_iterator
+				using iterator_category = std::random_access_iterator_tag;
+				using value_type = HaloPoint;
+				using pointer = const value_type *; // matches what operator->() returns
+				using reference = value_type;
+				using difference_type = typename Iterator::difference_type;
+
+				/**
+				 * Construct a new \c HaloMatrixGeneratorIterator object positioned at the
+				 * beginning of \p system, i.e. at \code system.begin() \endcode.
+				 *
+				 * @param system the halo system to iterate on; its address is stored,
+				 *  so it should outlive this iterator
+				 * @param value_producer callable invoked as \code value_producer( i, j ) \endcode
+				 *  to produce the value of each nonzero; it is stored by copy
+				 */
+				HaloMatrixGeneratorIterator(
+					const LinearSystemType & system,
+					const ValueCallable & value_producer
+				) noexcept :
+					_val( value_producer, 0, 0 ),
+					_lin_system( &system ),
+					_sys_iter( system.begin() )
+				{
+					update_coords();
+				}
+
+				HaloMatrixGeneratorIterator( const SelfType & ) = default;
+
+				SelfType & operator=( const SelfType & ) = default;
+
+				/**
+				 * Increments the iterator by moving coordinates to the next (row, column) to iterate on.
+				 *
+				 * The underlying system iterator is advanced by one position and the cached
+				 * row and column indices are refreshed from it. The order in which couples
+				 * are visited (presumably all neighbors of an element, then the next
+				 * element) is decided by the underlying LinearizedHaloNDimSystem iterator.
+				 *
+				 * @return SelfType& \c this object, with the updated state
+				 */
+				SelfType & operator++() noexcept {
+					(void)++_sys_iter;
+					update_coords();
+					return *this;
+				}
+
+				SelfType & operator+=( size_t offset ) {
+					_sys_iter += offset;
+					update_coords();
+					return *this;
+				}
+
+				difference_type operator-( const SelfType & other ) const {
+					return this->_sys_iter - other._sys_iter;
+				}
+
+				/**
+				 * Operator to compare \c this against \p o  and return whether they differ.
+				 *
+				 * @param o object to compare \c this against
+				 * @return true if the underlying system iterators of \p o and \c this differ
+				 * @return false if the underlying system iterators of \p o and \c this are equal
+				 */
+				bool operator!=( const SelfType & o ) const {
+					return this->_sys_iter != o._sys_iter;
+				}
+
+				/**
+				 * Operator to compare \c this against \p o  and return whether they are equal.
+				 *
+				 * @param o object to compare \c this against
+				 * @return true if the underlying system iterators of \p o and \c this are equal
+				 * @return false if the underlying system iterators of \p o and \c this differ
+				 */
+				bool operator==( const SelfType & o ) const {
+					return ! operator!=( o );
+				}
+
+				/**
+				 * Operator returning the triple to directly access row, column and element values.
+				 *
+				 * Useful when building the matrix by copying the triple of coordinates and value,
+				 * like for the BSP1D backend.
+				 */
+				reference operator*() const {
+					return _val;
+				}
+
+				pointer operator->() const {
+					return &_val;
+				}
+
+				/**
+				 * Returns the current row.
+				 */
+				inline RowIndexType i() const {
+					return _val.i();
+				}
+
+				/**
+				 * Returns the current column.
+				 */
+				inline ColumnIndexType j() const {
+					return _val.j();
+				}
+
+				/**
+				 * Returns the current matrix value.
+				 *
+				 * @return ValueType the value produced by the user-given \p ValueCallable when
+				 *  invoked on the current row and column indices, i.e. on #i() and #j()
+				 */
+				inline ValueType v() const {
+					return _val.v();
+				}
+
+			private:
+				value_type _val;
+				const LinearSystemType * _lin_system;
+				Iterator _sys_iter;
+
+				void update_coords() {
+					_val._i = _sys_iter->get_element_linear();
+					_val._j = _sys_iter->get_neighbor_linear();
+				}
+			};
+
+		} // namespace multigrid
+	}     // namespace utils
+} // namespace grb
+
+#endif // _H_GRB_ALGORITHMS_MULTIGRID_HALO_MATRIX_GENRATOR_ITERATOR
diff --git a/include/graphblas/utils/multigrid/linearized_halo_ndim_iterator.hpp b/include/graphblas/utils/multigrid/linearized_halo_ndim_iterator.hpp
new file mode 100644
index 000000000..6c020c39d
--- /dev/null
+++ b/include/graphblas/utils/multigrid/linearized_halo_ndim_iterator.hpp
@@ -0,0 +1,391 @@
+
+/*
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file linearized_halo_ndim_iterator.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * Definition of LinearizedHaloNDimIterator.
+ */
+
+#ifndef _H_GRB_ALGORITHMS_MULTIGRID_LINEARIZED_HALO_NDIM_ITERATOR
+#define _H_GRB_ALGORITHMS_MULTIGRID_LINEARIZED_HALO_NDIM_ITERATOR
+
+#include <algorithm>
+#include <cstddef>
+#include <iterator>
+#include <limits>
+#include <stdexcept>
+#include <vector>
+
+#include <graphblas/utils/iterators/utils.hpp>
+
+#include "array_vector_storage.hpp"
+#include "linearized_ndim_iterator.hpp"
+#include "linearized_ndim_system.hpp"
+
+namespace grb {
+	namespace utils {
+		namespace multigrid {
+
+			// forward declaration
+			template<
+				size_t DIMS,
+				typename SizeType
+			> class LinearizedHaloNDimSystem;
+
+			/**
+			 * Class to iterate over the \b neighbors of a system with halo: by advancing the iterator,
+			 * the user can traverse all neighbors of all elements one-by-one, in order, for example, to
+			 * emit all possible copies element-neighbor.
+			 *
+			 * Example: for a 2-dimensional 3 x 3 system with halo 1, with elements numbered as in
+			 *
+			 * 0 1 2
+			 * 3 4 5
+			 * 6 7 8
+			 *
+			 * the emitted couples <element-neighbor> are:
+			 *
+			 * 0-0, 0-1, 0-3, 0-4; 1-0, 1-1, 1-2, 1-3, 1-4, 1-5; 2-1, 2-2, 2-4, 2-5;
+			 * 3-0, 3-1, 3-3, 3-4, 3-6, 3-7; 4-0, 4-1, 4-2, 4-3, 4-4, 4-5, 4-6, 4-7, 4-8; and so on.
+			 *
+			 * It implements two interfaces for iteration. The first is a standard STL-like
+			 * interface meeting the random-access requirements, with operators \a ++, \a *, \a ->,
+			 * \a +=, \a -, \a ==; these facilities iterate over \b all neighbors of the underlying system,
+			 * automatically updating the corresponding element the neighbor is associated to.
+			 * The second interface is a custom (Java-like) one that allows to iterate separately over elements
+			 * and their neighbors: the user can query whether more elements exist, move to the next element,
+			 * iterate over the neighbors of the current element, query whether more neighbors exist for the
+			 * current element.
+			 *
+			 * The state of this structure essentially contains:
+			 *
+			 * 1. a const-pointer to a LinearizedHaloNDimSystem<DIMS,SizeType> object, storing the geometry
+			 * information of the N-dimensional system.
+			 * 2. the iterator to the current element (which in turn provides the element's vector
+			 *  and linear coordinates)
+			 * 3. the vector coordinate of the current neighbor
+			 * 4. the linear coordinate of the current neighbor
+			 * 5. information about the current element's neighbors space:
+			 *   1. the N-dimensional sub-space of neighbors w.r.t. the current element: this
+			 *    LinearizedNDimSystem object stores the sizes of the neighbors' sub-space
+			 *    centered around the current element (at most <em>2 * halo + 1</em> per dimension, if the current
+			 *    element is an inner one); hence, it computes coordinates and provides iterators that are
+			 *    \b relative to the current element
+			 *   2. vector coordinates of the first neighbor of the current element, in the main system
+			 *    (i.e. \b not relative); this allows computing any neighbor as the sum of this vector
+			 *    plus its relative coordinates in the neighbors' sub-space
+			 *   3. iterator to the current neighbor, built out of the relative sub-space, to actually iterate
+			 *    over the current element's neighbors
+			 *   4. iterator to the last neighbor of the current element, to stop the iteration over neighbors
+			 *    and advance to the next element.
+			 *
+			 * The above-mentioned methods to advance the iterator \c this (over neighbors or elements)
+			 * take care of updating these structures properly, keeping the state \b always coherent.
+			 *
+			 * @tparam DIMS system number of dimensions
+			 * @tparam SizeType type of coordinates and of sizes (must be large enough to describe the size
+			 * of the system along each direction)
+			 */
+			template< size_t DIMS, typename SizeType >
+			class LinearizedHaloNDimIterator {
+
+				using SystemType = LinearizedHaloNDimSystem< DIMS, SizeType >;
+				using VectorType = ArrayVectorStorage< DIMS, SizeType >;
+				using VectorIteratorType = LinearizedNDimIterator< SizeType, VectorType >;
+
+			public:
+				using ConstVectorReference = typename VectorIteratorType::ConstVectorReference;
+				using SelfType = LinearizedHaloNDimIterator< DIMS, SizeType >;
+
+				/**
+				 * Descriptor of a single neighbor in a system: it exposes the coordinates of
+				 * the neighbor and of the element it is neighbor of, each in both linear and
+				 * vector form, plus the number of the neighbor within the system.
+				 */
+				struct HaloNDimElement {
+				private:
+					// system the coordinates are linearized against
+					const SystemType * _system;
+
+					// iterator over the coordinates of the current element
+					VectorIteratorType _element_iter;
+
+					// vector coordinates of the current neighbor
+					VectorType _neighbor;
+					// number of the current neighbor, see get_position()
+					SizeType _position;
+
+				public:
+					friend SelfType;
+
+					HaloNDimElement() = delete;
+
+					HaloNDimElement( const HaloNDimElement & ) = default;
+
+					HaloNDimElement( HaloNDimElement && ) = delete;
+
+					/**
+					 * Builds the descriptor for \p system, with the element iterator built
+					 * from \p system, the neighbor at the all-zeroes coordinates and position 0.
+					 */
+					HaloNDimElement( const SystemType & system ) noexcept :
+						_system( &system ),
+						_element_iter( system ),
+						_neighbor( DIMS ),
+						_position( 0 )
+					{
+						std::fill_n( _neighbor.begin(), DIMS, 0 );
+					}
+
+					HaloNDimElement & operator=( const HaloNDimElement & ) = default;
+
+					/**
+					 * Vector coordinates of the element.
+					 */
+					ConstVectorReference get_element() const {
+						return _element_iter->get_position();
+					}
+
+					/**
+					 * Linear coordinate of the element, computed via the underlying system.
+					 */
+					size_t get_element_linear() const {
+						return _system->ndim_to_linear( _element_iter->get_position() );
+					}
+
+					/**
+					 * Vector coordinates of the neighbor.
+					 */
+					ConstVectorReference get_neighbor() const {
+						return _neighbor;
+					}
+
+					/**
+					 * Linear coordinate of the neighbor, computed via the underlying system.
+					 */
+					size_t get_neighbor_linear() const {
+						return _system->ndim_to_linear( _neighbor );
+					}
+
+					/**
+					 * The (unique) number of the neighbor in the system.
+					 */
+					SizeType get_position() const {
+						return _position;
+					}
+				};
+
+				// interface for std::random_access_iterator
+				using iterator_category = std::random_access_iterator_tag;
+				using value_type = HaloNDimElement;
+				using pointer = const HaloNDimElement *;
+				using reference = const HaloNDimElement &;
+				using difference_type = signed long;
+
+				LinearizedHaloNDimIterator() = delete;
+
+				/**
+				 * Builds an iterator over \p system, whose geometry information drives the
+				 * iteration. The new iterator points to the first neighbor of the first
+				 * element, i.e. the one with vector coordinates \a [0,0,...,0].
+				 *
+				 * \c this keeps referring to \p system: if \p system is not valid anymore,
+				 * then also \c this is not.
+				 */
+				LinearizedHaloNDimIterator( const SystemType & system ) noexcept :
+					_point( system ),
+					_neighbors_subspace( DIMS, system.halo() + 1 ),
+					_neighbors_start( DIMS ),
+					_neighbor_iter( _neighbors_subspace ),
+					_neighbor_end( VectorIteratorType::make_system_end_iterator( _neighbors_subspace ) )
+				{
+					// the neighbors of the first element start from the origin
+					std::fill_n( _neighbors_start.begin(), DIMS, 0 );
+				}
+
+				LinearizedHaloNDimIterator( const SelfType & ) = default;
+
+				SelfType & operator=( const SelfType & ) = default;
+
+				/**
+				 * Tells whether \c this and \p other point to different neighbors, by
+				 * comparing their positions in the linear space of neighbors.
+				 *
+				 * The systems the two iterators refer to are not compared.
+				 */
+				bool operator!=( const SelfType & other ) const {
+					return this->_point._position != other._point._position; // use linear coordinate
+				}
+
+				/**
+				 * Tells whether \c this and \p other point to the same neighbor, by
+				 * comparing their positions in the linear space of neighbors; it is the
+				 * exact negation of operator!=.
+				 *
+				 * Provided because the iterator category declared for this class requires
+				 * iterators to be equality-comparable.
+				 */
+				bool operator==( const SelfType & other ) const {
+					return this->_point._position == other._point._position;
+				}
+
+				/**
+				 * Gives read-only access to the current element-neighbor couple.
+				 */
+				reference operator*() const {
+					return _point;
+				}
+
+				/**
+				 * Returns the address of the current element-neighbor couple.
+				 */
+				pointer operator->() const {
+					return &_point;
+				}
+
+				/**
+				 * Tells whether the current element still has neighbors the user has not
+				 * iterated on yet, i.e. whether the neighbor iterator has not reached the
+				 * end of the current neighbors' sub-space.
+				 */
+				bool has_more_neighbours() const {
+					const bool more_available = ( _neighbor_iter != _neighbor_end );
+					return more_available;
+				}
+
+				/**
+				 * Advances \c this to the next neighbor of the current element.
+				 *
+				 * The element is \b not advanced: this must be done manually via
+				 * #next_element().
+				 *
+				 * @throws std::out_of_range if the current element has no more neighbors
+				 */
+				void next_neighbour() {
+					const bool can_advance = has_more_neighbours();
+					if( ! can_advance ) {
+						throw std::out_of_range( "the current element has no more neighbors" );
+					}
+					++_neighbor_iter;
+					on_neighbor_iter_update();
+					++( _point._position );
+				}
+
+				/**
+				 * Tells whether the system has more elements, i.e. whether the linear
+				 * coordinate of the current element differs from the size of the base system.
+				 */
+				bool has_more_elements() const {
+					const size_t current_element = _point.get_element_linear();
+					const size_t elements_count = _point._system->base_system_size();
+					return current_element != elements_count;
+				}
+
+				/**
+				 * Advances \c this to the next element, pointing to its first neighbor.
+				 *
+				 * The neighbor position is moved past all the neighbors of the element
+				 * being left, whether or not they were iterated on.
+				 *
+				 * @throws std::out_of_range if the system has no more elements
+				 */
+				void next_element() {
+					if( ! has_more_elements() ) {
+						throw std::out_of_range( "the system has no more elements" );
+					}
+					// both values describe the element being left: read them before advancing
+					const size_t consumed_neighbours =
+						_neighbors_subspace.ndim_to_linear( _neighbor_iter->get_position() );
+					const size_t total_neighbours = _neighbors_subspace.system_size();
+					++( _point._element_iter );
+					on_element_advance();
+					// go back to the first neighbor of the old element, then skip all of them
+					_point._position -= consumed_neighbours;
+					_point._position += total_neighbours;
+				}
+
+				/**
+				 * Advances \c this to the next neighbor; when the neighbors of the current
+				 * element are exhausted, it moves to the first neighbor of the next element.
+				 */
+				SelfType & operator++() noexcept {
+					++_neighbor_iter;
+					if( has_more_neighbours() ) {
+						on_neighbor_iter_update();
+					} else {
+						// no neighbors left for this element: switch to the next one
+						++( _point._element_iter );
+						on_element_advance();
+					}
+					++( _point._position );
+					return *this;
+				}
+
+				/**
+				 * Moves \c this ahead of \p offset neighbors, also advancing the element if
+				 * necessary.
+				 *
+				 * An \p offset of 1 is forwarded to operator++; any other value makes the
+				 * destination element and neighbor be recomputed from the destination
+				 * position via the underlying system.
+				 *
+				 * @throws std::range_error if the destination position is beyond the size
+				 *  of the system with halo
+				 */
+				SelfType & operator+=( size_t offset ) {
+					if( offset == 1UL ) {
+						return operator++();
+					}
+					const SystemType & system = *( _point._system );
+					const size_t destination = _point._position + offset;
+					if( destination > system.halo_system_size() ) {
+						throw std::range_error( "neighbor linear value beyond system" );
+					}
+					// coordinates buffer: it first receives the destination element, then
+					// the destination neighbor relative to that element
+					VectorType coords( DIMS );
+					const size_t relative_neighbor = system.neighbour_linear_to_element( destination, coords );
+
+					_point._element_iter = VectorIteratorType( system, coords.cbegin() );
+					_point._position = destination;
+
+					// the sub-space must be re-targeted before translating the neighbor index
+					on_element_update();
+					_neighbors_subspace.linear_to_ndim( relative_neighbor, coords );
+
+					_neighbor_iter = VectorIteratorType( _neighbors_subspace, coords.cbegin() );
+					_neighbor_end = VectorIteratorType::make_system_end_iterator( _neighbors_subspace );
+					on_neighbor_iter_update();
+
+					return *this;
+				}
+
+				/**
+				 * Distance between \c this and \p other in the linear space of neighbors,
+				 * i.e. how many times \p other must be advanced to point to the same
+				 * neighbor as \c this.
+				 *
+				 * NOTE(review): compute_signed_distance is expected to throw if the result
+				 * cannot be stored as a difference_type variable; confirm against its
+				 * definition.
+				 */
+				difference_type operator-( const SelfType & other ) const {
+					const SizeType this_position = _point.get_position();
+					const SizeType other_position = other._point.get_position();
+					return grb::utils::compute_signed_distance< difference_type, SizeType >(
+						this_position, other_position );
+				}
+
+				/**
+				 * Builds the iterator marking the end of the system \p system.
+				 *
+				 * The returned state must match the one operator++ produces when advancing
+				 * past the last neighbor, so the implementation depends on that logic.
+				 */
+				static SelfType make_system_end_iterator( const SystemType & system ) {
+					SelfType past_the_end( system );
+					// place the element on the very first point outside of the space...
+					past_the_end._point._element_iter = VectorIteratorType::make_system_end_iterator( system );
+					past_the_end.on_element_advance();
+					// ...and the position right after the last neighbor
+					past_the_end._point._position = system.halo_system_size();
+					return past_the_end;
+				}
+
+			private:
+				// current element-neighbor couple, exposed via operator* and operator->
+				HaloNDimElement _point;
+				// sub-space of the neighbors of the current element, re-targeted by
+				// on_element_update() whenever the element changes
+				LinearizedNDimSystem< SizeType, VectorType > _neighbors_subspace;
+				// coordinates, in the main system, of the first neighbor of the current element
+				VectorType _neighbors_start;
+				VectorIteratorType _neighbor_iter; // iterator in the sub-space of neighbors (0-based)
+				// end of the sub-space of neighbors, see has_more_neighbours()
+				VectorIteratorType _neighbor_end;
+
+				/**
+				 * To be called whenever the neighbor iterator changes: it recomputes the
+				 * actual coordinates of the neighbor as the coordinates of the first
+				 * neighbor plus the relative position inside the neighbors' sub-space.
+				 */
+				inline void on_neighbor_iter_update() {
+					size_t dim = 0;
+					while( dim < DIMS ) {
+						_point._neighbor[ dim ] = _neighbors_start[ dim ]
+							+ _neighbor_iter->get_position()[ dim ];
+						dim++;
+					}
+				}
+
+				/**
+				 * To be called after the element iterator changes: it recomputes the first
+				 * neighbor of the new element and re-targets the neighbors' sub-space to the
+				 * new range. The neighbor iterators are \b not touched.
+				 */
+				void on_element_update() {
+					const SystemType & system = *( _point._system );
+					// range of neighbors around the new element
+					VectorType range( DIMS );
+					system.compute_neighbors_range( _point._element_iter->get_position(),
+						_neighbors_start, range );
+					_neighbors_subspace.retarget( range );
+				}
+
+				/**
+				 * To be called after the element iterator moves to a new element: on top of
+				 * #on_element_update(), it restarts the iteration over the neighbors
+				 * (begin and end iterators) and refreshes the neighbor's coordinates.
+				 */
+				void on_element_advance() {
+					on_element_update();
+
+					// restart from the first neighbor of the re-targeted sub-space
+					_neighbor_iter = VectorIteratorType( _neighbors_subspace );
+					_neighbor_end = VectorIteratorType::make_system_end_iterator( _neighbors_subspace );
+
+					on_neighbor_iter_update();
+				}
+			};
+
+		} // namespace multigrid
+	}     // namespace utils
+} // namespace grb
+
+#endif // _H_GRB_ALGORITHMS_MULTIGRID_LINEARIZED_HALO_NDIM_ITERATOR
diff --git a/include/graphblas/utils/multigrid/linearized_halo_ndim_system.hpp b/include/graphblas/utils/multigrid/linearized_halo_ndim_system.hpp
new file mode 100644
index 000000000..34e16069d
--- /dev/null
+++ b/include/graphblas/utils/multigrid/linearized_halo_ndim_system.hpp
@@ -0,0 +1,559 @@
+
+/*
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file linearized_halo_ndim_system.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * Definition of LinearizedHaloNDimSystem.
+ */
+
+#ifndef _H_GRB_ALGORITHMS_MULTIGRID_LINEARIZED_HALO_NDIM_SYSTEM
+#define _H_GRB_ALGORITHMS_MULTIGRID_LINEARIZED_HALO_NDIM_SYSTEM
+
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <vector>
+#ifdef _DEBUG
+#include <iostream>
+#endif
+
+#include "array_vector_storage.hpp"
+#include "dynamic_vector_storage.hpp"
+#include "linearized_halo_ndim_iterator.hpp"
+#include "linearized_ndim_system.hpp"
+#include "ndim_vector.hpp"
+
+namespace grb {
+	namespace utils {
+		namespace multigrid {
+
+			/**
+			 * Structure to represent an N-dimensional space (or \a system) of given sizes and to
+			 * iterate on both the \a elements of the N-dimensional system and the N-dimensional
+			 * \a neighbors of each element within a given \p halo. This facility takes into account
+			 * the various cases where the element is at the corner, edge or face of the N-dimensional
+			 * system, to which different neighbors correspond. Both elements and their neighbors are
+			 * vectors in the N-dimensional system and as such described via both N-dimensional coordinates
+			 * and a linear coordinate.
+			 *
+			 * This structure returns the number of elements of the underlying N-dimensional system
+			 * (the \a base system) via #base_system_size() and the total sum of neighbors of all
+			 * system elements via #halo_system_size().
+			 *
+			 * The peculiar feature of this structure is the method #neighbour_linear_to_element(), to translate
+			 * a neighbor index (i.e. a value from \a 0 to #halo_system_size(), uniquely identifying an element
+			 * as neighbor of an element) to the N-dimensional coordinates of the corresponding elements in a time
+			 * that is constant with respect to the input value (it depends on \p DIMS and the halo size).
+			 * This facility allows the iterators of a LinearizedHaloNDimSystem to be random-access: when advancing
+			 * an iterator by an \a offset via the \a += method, the logic:
+			 *
+			 * - increments the index of the current neighbor (stored inside the iterator) by \a offset, thus
+			 *  computing the index of the destination neighbor (constant time)
+			 * - translates the index of the destination neighbor to its base element's coordinates via
+			 *  #neighbour_linear_to_element() (constant time)
+			 *
+			 * The same method also returns the index of the destination neighbor within the sub-space of the base
+			 * element's neighbors: hence, the logic can compute in constant time the destination base element
+			 * and its destination neighbor. The constant time of this translation is achieved by pre-computing
+			 * the number of neighbors for each element along each dimension: for example, inner elements in
+			 * a 3D mesh with halo 1 have 27 neighbors. Thus, it suffices in principle to divide the neighbor
+			 * index by 27 to compute the base element of a neighbor. Care must be taken for elements at the
+			 * sides of each dimension: for example, a corner element on a face has 8 neighbors, while a corner
+			 * element in an internal slab (a 2D "plane" in a 3D mesh) has 12 neighbors. The pre-computed
+			 * information and the logic also account for this.
+			 *
+			 * @tparam DIMS number of dimensions of the system
+			 * @tparam SizeType type storing the system sizes and offsets
+			 */
+			template<
+				size_t DIMS,
+				typename SizeType
+			> class LinearizedHaloNDimSystem :
+				public LinearizedNDimSystem< SizeType, ArrayVectorStorage< DIMS, SizeType > > {
+			public:
+				using VectorType = ArrayVectorStorage< DIMS, SizeType >;
+				using ConstVectorStorageType = typename VectorType::ConstVectorStorageType;
+				using SelfType = LinearizedHaloNDimSystem< DIMS, SizeType >;
+				using BaseType = LinearizedNDimSystem< SizeType, VectorType >;
+				using Iterator = LinearizedHaloNDimIterator< DIMS, SizeType >;
+
+				/**
+				 * Construct a new LinearizedHaloNDimSystem object with given sizes and halo.
+				 *
+				 * The size of \p sizes must be exactly \p DIMS. Each size \a s must be
+				 * strictly larger than \p halo, i.e. <em>s >= halo + 1</em>; otherwise, an
+				 * std::invalid_argument exception is thrown.
+				 *
+				 * NOTE(review): a stricter bound <em>s >= 2 * halo + 1</em> (at least one
+				 * element with full halo neighbors) may be the intended one; confirm.
+				 *
+				 * @param[in] sizes sizes of the system along each dimension
+				 * @param[in] halo halo size
+				 */
+				LinearizedHaloNDimSystem(
+					ConstVectorStorageType sizes,
+					SizeType halo
+				) :
+					BaseType( sizes.cbegin(), sizes.cend() ),
+					_halo( halo )
+				{
+					for( const SizeType size : sizes ) {
+						// same as "size < halo + 1", without the risk of overflowing halo + 1
+						if( size <= halo ) {
+							throw std::invalid_argument(
+								std::string( "the halo (" ) + std::to_string( halo )
+								+ std::string( ") goes beyond a system size (" )
+								+ std::to_string( size ) + std::string( ")" ) );
+						}
+					}
+
+					// pre-compute the neighbors of each configuration and their total number
+					this->_system_size = init_neigh_to_base_search(
+						this->get_sizes(), _halo, this->_dimension_limits );
+					assert( this->_dimension_limits.size() == DIMS );
+				}
+
+				LinearizedHaloNDimSystem() = delete;
+
+				LinearizedHaloNDimSystem( const SelfType & ) = default;
+
+				LinearizedHaloNDimSystem( SelfType && ) = delete;
+
+				~LinearizedHaloNDimSystem() noexcept {}
+
+				SelfType & operator=( const SelfType & ) = default;
+
+				SelfType & operator=( SelfType && ) = delete;
+
+				/**
+				 * Returns an iterator to the beginning of the system, i.e. to the first
+				 * neighbor of the element with vector coordinates \a [0,0,...,0].
+				 * The iterator moves across all neighbors, and also allows iterating
+				 * separately on each element and on its neighbors.
+				 */
+				Iterator begin() const {
+					Iterator first( *this );
+					return first;
+				}
+
+				/**
+				 * Returns the iterator marking the end of the system; it is meant for
+				 * comparisons only and should not be accessed.
+				 */
+				Iterator end() const {
+					const SelfType & system = *this;
+					return Iterator::make_system_end_iterator( system );
+				}
+
+				/**
+				 * Size of the entire system with halo, i.e. the number of neighbors summed
+				 * over all elements, as computed at construction.
+				 */
+				size_t halo_system_size() const {
+					return _system_size;
+				}
+
+				/**
+				 * Size of the base system, i.e. the number of elements (neighbors are not
+				 * considered), as reported by the base class.
+				 */
+				size_t base_system_size() const {
+					return BaseType::system_size();
+				}
+
+				/**
+				 * The halo size this system was built with.
+				 */
+				size_t halo() const {
+					return _halo;
+				}
+
+				/**
+				 * For the system \c this, computes the first neighbor and the size of the
+				 * N-dimensional range of neighbors around the given element's coordinates,
+				 * by forwarding to compute_first_neigh_and_range() with the sizes and the
+				 * halo of \c this.
+				 *
+				 * @param[in] element_coordinates coordinates of the element to iterate around
+				 * @param[out] neighbors_start first neighbor around \p element_coordinates to
+				 *  iterate from
+				 * @param[out] neighbors_range range of neighbors around \p element_coordinates
+				 *  along each dimension; NOTE(review): presumably at most
+				 *  <em>2 * halo + 1</em> per dimension, smaller for elements on a corner,
+				 *  edge or face; confirm against compute_first_neigh_and_range()
+				 */
+				void compute_neighbors_range(
+					const VectorType & element_coordinates,
+					VectorType & neighbors_start,
+					VectorType & neighbors_range
+				) const noexcept {
+					const VectorType & system_sizes = this->get_sizes();
+					compute_first_neigh_and_range( system_sizes, _halo,
+						element_coordinates, neighbors_start, neighbors_range );
+				}
+
+				/**
+				 * Translates the linear index \p neighbor_linear of a neighbor into the
+				 * coordinates \p base_element_vector of the element it is neighbor of, and
+				 * returns the number of the neighbor within the sub-space of that element's
+				 * neighbors. The translation is delegated to map_neigh_to_base_and_index(),
+				 * fed with the information pre-computed at construction.
+				 *
+				 * @param[in] neighbor_linear linear coordinate of the input neighbor
+				 * @param[out] base_element_vector coordinates of the element
+				 *  \p neighbor_linear is neighbor of
+				 * @return size_t the neighbor number w.r.t. the corresponding element: if
+				 *  \a e is the element \p neighbor_linear is neighbor of and \a e has \a n
+				 *  neighbors, the return value \a 0<=i<n is the index of \p neighbor_linear
+				 *  among \a e's neighbors, w.r.t. the iteration order
+				 */
+				size_t neighbour_linear_to_element(
+					SizeType neighbor_linear,
+					VectorType & base_element_vector
+				) const noexcept {
+					const size_t neighbor_index = map_neigh_to_base_and_index(
+						this->get_sizes(), _system_size, _dimension_limits,
+						_halo, neighbor_linear, base_element_vector );
+					return neighbor_index;
+				}
+
+			private:
+				// halo size, as passed to the constructor
+				const SizeType _halo;
+				// number of neighbors for each dimension and each configuration, populated
+				// at construction by init_neigh_to_base_search() (one entry per dimension)
+				std::vector< NDimVector< SizeType, SizeType,
+					DynamicVectorStorage< SizeType > > > _dimension_limits;
+				// total number of neighbors of all elements, see halo_system_size()
+				size_t _system_size;
+
+				/**
+				 * Computes the total number of neighbors along a certain dimension and for a
+				 * certain configuration, by accumulating the neighbors of the configurations
+				 * of the previous (smaller) dimension.
+				 *
+				 * The caller provides a buffer with the coordinates of the configuration,
+				 * leaving the first position free. Example: to compute in 3D the neighbors of
+				 * an inner row of a face (configuration <em>[0,1,0]</em>, dimension 1 - y),
+				 * the neighbors of an edge element and of an element internal to a face are
+				 * needed, i.e. configurations <em>[0,1,0]</em> and <em>[1,1,0]</em>. Hence,
+				 * the caller initializes the buffer with <em>[X,1,0]</em> (\a X meaning
+				 * don't care) and passes the pointer to the \a X as \p coords_buffer.
+				 *
+				 * This function writes into the first position the values from \a 0 on: each
+				 * value smaller than \p halo selects a side configuration, which is counted
+				 * twice (the two sides) and consumes two elements of \p local_size, as long
+				 * as more than one element is left; the value reached when the loop stops
+				 * (at most \p halo) selects the configuration of the remaining innermost
+				 * elements. On return, \p coords_buffer holds this last value.
+				 *
+				 * @param[in] prev_neighs neighbors in the configurations of the previous dimension
+				 * @param[in,out] coords_buffer pointer to the first position of the
+				 *  configuration buffer for this dimension
+				 * @param[in] halo halo size
+				 * @param[in] local_size size (i.e., number of elements) along the current
+				 *  dimension, including the edges
+				 * @return size_t the total number of neighbors for this configuration and
+				 *  this dimension
+				 */
+				static size_t accumulate_dimension_neighbours(
+					const NDimVector< SizeType, SizeType, DynamicVectorStorage< SizeType > > & prev_neighs,
+					SizeType * coords_buffer,
+					size_t halo,
+					size_t local_size
+				) {
+					size_t total = 0;
+					size_t side = 0;
+					while( side < halo && local_size > 1 ) {
+						*coords_buffer = side;
+
+						const size_t side_neighs = prev_neighs.at( coords_buffer );
+						total += 2 * side_neighs; // one contribution per side
+						local_size -= 2;
+						side++;
+					}
+					// whatever is left lies in the middle
+					*coords_buffer = side;
+					total += local_size * prev_neighs.at( coords_buffer );
+					return total;
+				}
+
+				/**
+				 * Computes the number of neighbors of a single element for each
+				 * configuration along dimension 0 (corner, edge, face, inner element): for
+				 * a configuration of coordinates \a h_i, it is the product over all
+				 * dimensions of <em>h_i + 1 + \p halo</em>.
+				 *
+				 * Example: in a 3D system with <em>\p halo = 1</em>, the configurations along
+				 * dimension 0 are 8:
+				 * 1. z axis - face:
+				 *   1. y axis - top row: corner element (8 neighbors), edge element (12 neighbors)
+				 *   2. y axis - inner row: edge element (12 neighbors), face inner element (18 neighbors)
+				 * 2. z axis - inner slab:
+				 *   1. y axis - top row: edge element (12 neighbors), face inner element (18 neighbors)
+				 *   2. y axis - inner row: face inner element (18 neighbors), inner element (27 neighbors)
+				 *
+				 * @param[in] halo halo size
+				 * @param[out] config_neighbors the storage object for each configuration
+				 */
+				static void compute_dim0_neighbors(
+					size_t halo,
+					NDimVector< SizeType, SizeType, DynamicVectorStorage< SizeType > > & config_neighbors
+				) {
+					using ConfigIterator = typename NDimVector< SizeType, SizeType,
+						DynamicVectorStorage< SizeType > >::DomainIterator;
+					ConfigIterator last = config_neighbors.domain_end();
+					ConfigIterator config = config_neighbors.domain_begin();
+					while( config != last ) {
+						size_t count = 1;
+						for( size_t coord : config->get_position() ) {
+							count *= ( coord + 1 + halo );
+						}
+						config_neighbors.at( config->get_position() ) = count;
+						++config;
+					}
+				}
+
+				/**
+				 * Initializes the search space of neighbors for the <neighbor linear> -> <base vector> translation.
+				 *
+				 * This function populates an std::vector<> with the number of neighbors for each dimension
+				 * and each configuration (corner, edge, face, inner).
+				 * For each dimension \a d, it stores an
+				 * NDimVector<SizeType,SizeType,DynamicVectorStorage< SizeType>> with <em>DIMS - d</em>
+				 * dimensions, each of size <em>halo + 1</em>, holding all possible numbers of neighbors
+				 * along that dimension, depending on the position of the element
+				 * (corner, edge, face, inner volume); for example, for 3 dimensions:
+				 *  - dimension 2 (z axis) moves along "slabs" of a 3D systems, where the total number of neighbors
+				 *   depends on whether the slab is a face of the mesh or an internal slab (2 possible
+				 *   configurations: face slabs or inner slabs)
+				 *  - dimension 1 (y axis) moves along "rows" within each slab, whose total number of neighbors
+				 *   depends on whether the row is at the extreme sides (top or bottom of the face) or inside;
+				 *   in turn, each type of slab has different geometry (face slabs comprise mesh corners, edges and
+				 *   faces, while inner slabs comprise edges, faces and inner elements), thus resulting in
+				 *   2*2 different configurations of dimension-1 total neighbors
+				 *  - dimension 0 (x axis) moves along "column" elements within each row, where the first (or last)
+				 *   column has a different number of neighbors than the inner ones; here again are two
+				 *   configurations for each dimension-1 configuration, leading to a total of 8 dimension-0
+				 *   configurations
+				 * Within each dimension \a d, each configuration (as per the above explanation) can be identified
+				 * via a vector of <em>N - d</em> coordinates; to limit the data storage, every dimension stores the
+				 * total number of neighbors only at the first side and inside, since the second side is identical
+				 * to the first one: for example, along the z axis the first and last slab (those on the two extremes)
+				 * have the same size, and one only is stored. Therefore, with <em>halo = 1</em> a vector identifying
+				 * a configuration is composed only of 0s and 1s. For example, the vector <em>[0,1,0]</em> identifies:
+				 * - rightmost 0 (z axis): first (or last) slab, i.e. face slab
+				 * - (middle) 1 (y axis): inner row
+				 * - leftmost 0 (x axis): first (or last) element, i.e. on the edge of the mesh
+				 * In a 3D space with <em>halo = 1</em>, this element has 12 neighbors (it is on the edge of a face).
+				 *
+				 * New entries are appended to \p dimension_limits, which is not cleared beforehand.
+				 *
+				 * @param[in] sizes sizes of the N-dimensional system
+				 * @param[in] halo halo size
+				 * @param[out] dimension_limits the std::vector<> with the neighbors information for each dimension
+				 *  and each configuration
+				 * @return size_t the number of neighbors of the entire system
+				 */
+				static size_t init_neigh_to_base_search(
+					typename LinearizedNDimSystem< SizeType, ArrayVectorStorage< DIMS, SizeType >
+						>::ConstVectorReference sizes,
+					size_t halo,
+					std::vector< NDimVector< SizeType, SizeType, DynamicVectorStorage< SizeType > > > & dimension_limits
+				) {
+					using nd_vec = NDimVector< SizeType, SizeType, DynamicVectorStorage< SizeType > >;
+					using nd_vec_iterator = typename nd_vec::DomainIterator;
+
+					// dimension 0: one entry per configuration, identified by DIMS coordinates
+					std::vector< size_t > halo_sizes( DIMS, halo + 1 );
+					dimension_limits.emplace_back( halo_sizes );
+					// initialize values
+					compute_dim0_neighbors( halo, dimension_limits[ 0 ] );
+					// dimension i > 0: allocate the storage, identified by DIMS - i coordinates
+					for( size_t i = 1; i < DIMS; i++ ) {
+						std::vector< size_t > halos( DIMS - i, halo + 1 );
+						dimension_limits.emplace_back( halos );
+					}
+
+					std::array< SizeType, DIMS > prev_coords_buffer; // store at most DIMS values
+					SizeType * const prev_coords = prev_coords_buffer.data();
+					SizeType * const second = prev_coords + 1; // store previous coordinates from second position
+					// fill each dimension by accumulating, for each of its configurations,
+					// the values of the previous dimension along the size sizes[ dimension - 1 ]
+					for( size_t dimension = 1; dimension < DIMS; dimension++ ) {
+						const nd_vec & prev_neighs { dimension_limits[ dimension - 1 ] };
+						nd_vec & current_neighs { dimension_limits[ dimension ] };
+
+						nd_vec_iterator end = current_neighs.domain_end();
+						for( nd_vec_iterator it = current_neighs.domain_begin(); it != end; ++it ) {
+							typename nd_vec::ConstDomainVectorReference current_halo_coords = it->get_position();
+
+							// the first position of the buffer is left to accumulate_dimension_neighbours()
+							std::copy( it->get_position().cbegin(), it->get_position().cend(), second );
+							size_t local_size = sizes[ dimension - 1 ];
+							const size_t neighs = accumulate_dimension_neighbours( prev_neighs,
+								prev_coords, halo, local_size );
+							current_neighs.at( current_halo_coords ) = neighs;
+						}
+					}
+					// the last dimension has a single coordinate: accumulate it along the last size
+					return accumulate_dimension_neighbours( dimension_limits[ DIMS - 1 ],
+						prev_coords, halo, sizes.back() );
+				}
+
+				/**
+				 * For the given system (with sizes \p _system_sizes), the given halo size \p halo,
+				 * the given element's coordinates \p element_coordinates, computes the coordinates
+				 * of the first neighbor of \p element_coordinates into \p neighbors_start (within the main system)
+				 * and the range of neighbors of \p element_coordinates, i.e. the sub-space of neighbors of
+				 * \p element_coordinates; hence, \p neighbors_range stores at most <em>2 * </em> \p halo
+				 * <em> + 1</em> per coordinate.
+				 *
+				 * @param[in] _system_sizes sizes of the N-dimensional system
+				 * @param[in] halo halo size
+				 * @param[in] element_coordinates coordinates of the considered element
+				 * @param[out] neighbors_start stores the (absolute) coordinates of the first neighbor
+				 *  of \p element_coordinates
+				 * @param[out] neighbors_range stores the range of neighbors around \p element_coordinates
+				 */
+				static void compute_first_neigh_and_range(
+					const ArrayVectorStorage< DIMS, SizeType > & _system_sizes,
+					const SizeType halo,
+					const ArrayVectorStorage< DIMS, SizeType > & element_coordinates,
+					ArrayVectorStorage< DIMS, SizeType > & neighbors_start,
+					ArrayVectorStorage< DIMS, SizeType > & neighbors_range
+				) {
+					// every dimension is handled independently of the others
+					for( SizeType dim = 0; dim < DIMS; dim++ ) {
+						const SizeType coordinate = element_coordinates[ dim ];
+						// first neighbor: halo positions before the element, clamped to the
+						// beginning of the system to avoid unsigned underflow
+						SizeType first = 0;
+						if( coordinate > halo ) {
+							first = coordinate - halo;
+						}
+						// last neighbor: halo positions after the element, clamped to the
+						// last valid coordinate of the system
+						const SizeType last = std::min( coordinate + halo, _system_sizes[ dim ] - 1 );
+						neighbors_start[ dim ] = first;
+						// both ends are included in the range
+						neighbors_range[ dim ] = last - first + 1;
+					}
+				}
+
+#ifdef _DEBUG
+				/**
+				 * Debug facility: prints to std::cout all the values in the range
+				 * [ \p begin, \p end ), each one followed by a blank.
+				 *
+				 * @return std::cout, so that the caller can chain more output
+				 */
+				template< typename IterType >
+				static std::ostream & print_sequence( IterType begin, IterType end ) {
+					std::ostream & out = std::cout;
+					while( begin != end ) {
+						out << *begin << ' ';
+						++begin;
+					}
+					return out;
+				}
+#endif
+
+				/**
+				 * Maps a neighbor's linear coordinate \p neighbor_linear to the element \p element_vector it is
+				 * neighbor of and also returns the neighbor index of \p neighbor_linear within the sub-space
+				 * of \p element_vector's neighbors.
+				 *
+				 * @param[in] sizes main system sizes along all dimensions
+				 * @param[in] system_size total size of the neighbors system, i.e. the total number of neighbors
+				 * @param[in] neighbors_per_dimension along each dimension \a d, it stores an \a n -dimensional vector
+				 *  NDimVector<SizeType,SizeType,DynamicVectorStorage< SizeType>> (<em>n = 2 ^ d</em>) with all
+				 *  possible numbers of neighbors along that dimension, depending on the position of the element
+				 *  (corner, edge, face, inner volume)
+				 * @param[in] halo halo size
+				 * @param[in] neighbor_linear linear coordinate of the neighbor
+				 * @param[out] element_vector coordinates vector representing the element \p neighbor_linear is
+				 *  neighbor of
+				 * @return size_t the index of the neighbor within the element's neighbors
+				 */
+				static size_t map_neigh_to_base_and_index(
+					const std::array< SizeType, DIMS > & sizes,
+					size_t system_size,
+					const std::vector< NDimVector< SizeType, SizeType, DynamicVectorStorage< SizeType > >
+						> & neighbors_per_dimension,
+					SizeType halo,
+					SizeType neighbor_linear,
+					ArrayVectorStorage< DIMS, SizeType > & element_vector
+				) {
+					// NOTE(review): the check rejects only neighbor_linear > system_size, while the
+					// message states ">="; neighbor_linear == system_size is therefore accepted.
+					// This may be intended (e.g., to represent an end position) -- confirm against
+					// the callers before aligning the condition and the message.
+					if( neighbor_linear > system_size ) {
+						throw std::invalid_argument( "neighbor number ( " + std::to_string( neighbor_linear )
+							+ " ) >= system size ( " + std::to_string( system_size ) + " )" );
+					}
+					// coordinates selecting, for each dimension, the configuration to look up in
+					// neighbors_per_dimension; all of them start from 0
+					ArrayVectorStorage< DIMS, SizeType > configuration( DIMS );
+#ifdef _DEBUG
+					size_t * const halo_coords_end = configuration.data() + DIMS;
+#endif
+					std::fill_n( configuration.begin(), DIMS, 0 );
+
+					// at each iteration, neighbor_linear is reduced to the offset of the neighbor
+					// within the sub-space identified so far; after the last iteration it is
+					// the index of the neighbor among the neighbors of the base element
+					for( size_t _dim = DIMS; _dim > 0; _dim-- ) {
+						// each iteration looks for the base element along a dimension via the number of neighbors
+						// each element has: once previous_neighs reaches neighbor_linear, the corresponding
+						// base element is found; if the control reaches the end, this means it must explore
+						// the following dimension to find the base element: this is why dimensions are explored
+						// starting from the highest, because moving along a higher dimension means "skipping"
+						// more neighbors; then the search "zooms in" to a smaller dimension to find the base element
+
+						// start from highest dimension
+						const size_t dimension = _dim - 1;
+						// how many elements along this dimension
+						const size_t dimension_size = sizes[ dimension ];
+						// configurations of neighbors along this dimension
+						// (e.g., corner, edge; or edge, inner element)
+						const NDimVector< SizeType, SizeType, DynamicVectorStorage< SizeType > > & neighbors =
+							neighbors_per_dimension[ dimension ];
+
+						// coordinate to modify to identify each configuration
+						SizeType * const halo_coords_begin = configuration.data() + dimension;
+#ifdef _DEBUG
+						std::cout << "DIMENSION " << dimension << std::endl << "- setup - neighbour "
+							<< neighbor_linear << std::endl << "\thalo : ";
+						print_sequence( halo_coords_begin, halo_coords_end ) << std::endl;
+#endif
+						size_t h = 0; // configuration type along this dimension
+						size_t previous_neighs = 0; // neighbors skipped so far along this dimension
+						*halo_coords_begin = h;
+						// account for neighbors in the first elements along the dimension, within halo distance:
+						// these elements have a number of neighbors that depends on the distance h
+						// and on the configuration
+						size_t halo_max_neighs = neighbors.at( halo_coords_begin );
+						while( h < halo && neighbor_linear >= previous_neighs + halo_max_neighs ) {
+							h++;
+							*halo_coords_begin = h;
+							previous_neighs += halo_max_neighs;
+							halo_max_neighs = neighbors.at( halo_coords_begin );
+						}
+#ifdef _DEBUG
+						std::cout << "- initial halo - neighbour " << neighbor_linear
+							<< std::endl << "\th " << h << std::endl << "\thalo : ";
+						print_sequence( halo_coords_begin, halo_coords_end ) << std::endl;
+						std::cout << "\thalo_max_neighs " << halo_max_neighs << std::endl;
+#endif
+						if( h < halo ) {
+							// we have already counted enough neighbors: neighbor_linear is thus a neighbor
+							// of one of the first (< halo) elements along this dimension: go to next dimension
+							element_vector[ dimension ] = h;
+							neighbor_linear -= previous_neighs;
+#ifdef _DEBUG
+							std::cout << "end neighbour " << neighbor_linear << std::endl;
+#endif
+							continue;
+						}
+						// saturation occurred: the base element is beyond the halo: go on with the search
+						// (here h == halo, hence halo_max_neighs is the number of neighbors of inner elements)
+
+						// inner elements have the same number of neighbors halo_max_neighs: compute
+						// the base element via division
+						const size_t distance_from_halo = ( neighbor_linear - previous_neighs ) / halo_max_neighs;
+#ifdef _DEBUG
+						std::cout << "- before middle elements - neighbour " << neighbor_linear << std::endl
+								  << "\tprevious_neighs " << previous_neighs << std::endl
+								  << "\thalo_max_neighs " << halo_max_neighs << std::endl
+								  << "\tdistance_from_halo " << distance_from_halo << std::endl
+								  << "\tdimension_size " << dimension_size << std::endl;
+#endif
+						// NOTE(review): the subtraction is unsigned, hence this assumes
+						// dimension_size >= 2 * halo -- confirm that callers guarantee it
+						if( distance_from_halo < dimension_size - 2 * halo ) {
+							// the base element is one of the internal elements along this dimension:
+							// hence return its distance from the halo + the halo itself (= distance from
+							// beginning of the space)
+							element_vector[ dimension ] = distance_from_halo + halo;
+							neighbor_linear -= ( previous_neighs + distance_from_halo * halo_max_neighs );
+#ifdef _DEBUG
+							std::cout << "end neighbour " << neighbor_linear << std::endl;
+#endif
+							continue;
+						}
+						// base element is even beyond inner elements, it might be among the elements at the end,
+						// which also have different numbers of neighbors (specular to initial elements)
+						previous_neighs += ( dimension_size - 2 * halo ) * halo_max_neighs;
+#ifdef _DEBUG
+						std::cout << "- after middle elements -neighbour " << neighbor_linear << std::endl;
+						std::cout << "\tprevious_neighs " << previous_neighs << std::endl;
+						std::cout << "\thalo_max_neighs " << halo_max_neighs << std::endl;
+#endif
+						// look for base the element at the end of the dimension: specular search to beginning,
+						// just with h decreasing
+						h = halo - 1;
+						*halo_coords_begin = h;
+						halo_max_neighs = neighbors.at( halo_coords_begin );
+						while( h > 0 && neighbor_linear >= previous_neighs + halo_max_neighs ) {
+							h--;
+							*halo_coords_begin = h;
+							previous_neighs += halo_max_neighs;
+							halo_max_neighs = neighbors.at( halo_coords_begin );
+						}
+						neighbor_linear -= previous_neighs;
+#ifdef _DEBUG
+						std::cout << "- final halo - neighbour " << neighbor_linear << std::endl;
+						std::cout << "\tadding h " << h << " previous_neighs " << previous_neighs << std::endl;
+#endif
+						// ( dimension_size - 1 ) because coordinates are 0-based and neighbor
+						// is "inside" range [ previous_neighs, previous_neighs + halo_max_neighs ]
+						element_vector[ dimension ] = dimension_size - 1 - h;
+#ifdef _DEBUG
+						std::cout << "end neighbour " << neighbor_linear << std::endl;
+#endif
+					}
+					// what is left is the offset of the neighbor within the base element's neighbors
+					return neighbor_linear;
+				}
+			};
+
+		} // namespace multigrid
+	}     // namespace utils
+} // namespace grb
+
+#endif // _H_GRB_ALGORITHMS_MULTIGRID_LINEARIZED_HALO_NDIM_SYSTEM
diff --git a/include/graphblas/utils/multigrid/linearized_ndim_iterator.hpp b/include/graphblas/utils/multigrid/linearized_ndim_iterator.hpp
new file mode 100644
index 000000000..a4ae8af5e
--- /dev/null
+++ b/include/graphblas/utils/multigrid/linearized_ndim_iterator.hpp
@@ -0,0 +1,241 @@
+
+/*
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file linearized_ndim_iterator.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * Definition of LinearizedNDimIterator.
+ */
+
+#ifndef _H_GRB_ALGORITHMS_MULTIGRID_NDIM_ITERATOR
+#define _H_GRB_ALGORITHMS_MULTIGRID_NDIM_ITERATOR
+
+#include <algorithm>
+#include <cstddef>
+#include <limits>
+#include <stdexcept>
+#include <type_traits>
+
+#include <graphblas/utils/iterators/utils.hpp>
+
+#include "array_vector_storage.hpp"
+
+namespace grb {
+	namespace utils {
+		namespace multigrid {
+
+			// forward declaration for default
+			template<
+				typename SizeType,
+				typename InternalVectorType
+			> class LinearizedNDimSystem;
+
+			/**
+			 * Iterator object coupled to a LinearizedNDimSystem: each object points to a vector
+			 * in the creating LinearizedNDimSystem#dimensions()-dimensions space, to which also a
+			 * linear position is associated; both the vector and the linear position can be retrieved
+			 * via the \a -> method.
+			 *
+			 * It meets the requirements of a random access iterator.
+			 *
+			 * @tparam SizeType integral type to store the size of each dimension
+			 * @tparam InternalVectorType internal vector type to store the sizes
+			 */
+			template<
+				typename SizeType,
+				typename InternalVectorType
+			> class LinearizedNDimIterator {
+			public:
+				using VectorType = InternalVectorType;
+				using LinNDimSysType = LinearizedNDimSystem< SizeType, VectorType >;
+				using ConstVectorReference = const VectorType &;
+				using SelfType = LinearizedNDimIterator< SizeType, InternalVectorType >;
+
+				/**
+				 * Point of the system the iterator references, seen both as a vector of
+				 * coordinates (see #get_position()) and as a linear coordinate
+				 * (see #get_linear_position()).
+				 */
+				struct NDimPoint {
+				private:
+					// a pointer rather than a reference, to allow copy assignment
+					const LinNDimSysType * system;
+					VectorType coords;
+
+				public:
+					friend SelfType;
+
+					NDimPoint() = delete;
+
+					NDimPoint( const NDimPoint & ) = default;
+
+					NDimPoint( NDimPoint && ) = delete;
+
+					/**
+					 * Builds the point of \p _system having all coordinates equal to 0.
+					 */
+					NDimPoint( const LinNDimSysType & _system ) noexcept :
+						system( &_system ),
+						coords( _system.dimensions() )
+					{
+						std::fill_n( coords.begin(), _system.dimensions(), 0 );
+					}
+
+					NDimPoint & operator=( const NDimPoint & ) = default;
+
+					/** Coordinates of the point, one value per dimension. */
+					inline ConstVectorReference get_position() const {
+						return this->coords;
+					}
+
+					/** Linear coordinate of the point, as computed by the originating system. */
+					size_t get_linear_position() const {
+						return this->system->ndim_to_linear( this->coords );
+					}
+				};
+
+				// types exposed for std::iterator_traits
+				using iterator_category = std::random_access_iterator_tag;
+				using value_type = NDimPoint;
+				using pointer = const value_type *;
+				using reference = const value_type &;
+				using difference_type = signed long;
+
+				/**
+				 * Builds an iterator over \p _system referencing its first vector, i.e. the
+				 * one with all coordinates equal to \a 0.
+				 *
+				 * The iterator stores the address of \p _system: once \p _system is not valid
+				 * anymore, no iterator created from it is.
+				 */
+				LinearizedNDimIterator( const LinNDimSysType & _system ) noexcept : _p( _system ) {}
+
+				/**
+				 * Builds an iterator over \p _system referencing the vector whose coordinates
+				 * are read from \p begin onwards; \p _system.dimensions() values are read, which
+				 * must thus all be accessible.
+				 *
+				 * The iterator stores the address of \p _system: once \p _system is not valid
+				 * anymore, no iterator created from it is.
+				 */
+				template< typename IterT > LinearizedNDimIterator(
+					const LinNDimSysType & _system,
+					IterT begin
+				) noexcept :
+					_p( _system )
+				{
+					std::copy_n( begin, _system.dimensions(), _p.coords.begin() );
+				}
+
+				LinearizedNDimIterator() = delete;
+
+				LinearizedNDimIterator( const SelfType & original ) : _p( original._p ) {}
+
+				SelfType & operator=( const SelfType & original ) = default;
+
+				~LinearizedNDimIterator() {}
+
+				/**
+				 * Advances to the following vector of the system, which corresponds to
+				 * incrementing the linear coordinate by 1.
+				 */
+				SelfType & operator++() noexcept {
+					const size_t last = _p.system->dimensions() - 1;
+					// the first N-1 coordinates wrap around their size: look for the first one
+					// that can be incremented without wrapping and reset those before it
+					for( size_t d = 0; d < last; d++ ) {
+						SizeType & current = _p.coords[ d ];
+						const SizeType next = current + 1;
+						if( next < _p.system->get_sizes()[ d ] ) {
+							current = next;
+							return *this;
+						}
+						current = 0;
+					}
+					// all previous coordinates wrapped around: move along the last one,
+					// which is not bounded
+					_p.coords[ last ]++;
+					return *this;
+				}
+
+				/**
+				 * Advances by \p offset vectors in the system, which corresponds to
+				 * incrementing the linear coordinate by \p offset.
+				 *
+				 * An std::invalid_argument exception is thrown if the destination linear
+				 * coordinate is larger than LinearizedNDimSystem#system_size(); the value
+				 * LinearizedNDimSystem#system_size() itself is accepted.
+				 */
+				SelfType & operator+=( size_t offset ) {
+					const size_t destination = _p.get_linear_position() + offset;
+					if( destination > _p.system->system_size() ) {
+						throw std::invalid_argument( "increment is too large" );
+					}
+					if( offset != 1 ) {
+						_p.system->linear_to_ndim( destination, _p.coords );
+						return *this;
+					}
+					// single step: no need to go through the linear coordinate
+					return ++( *this );
+				}
+
+				/**
+				 * Computes the signed distance between \c this and \p other in the linear
+				 * space, via grb::utils::compute_signed_distance.
+				 */
+				difference_type operator-( const SelfType & other ) const {
+					const size_t mine = _p.get_linear_position();
+					const size_t theirs = other._p.get_linear_position();
+					return grb::utils::compute_signed_distance< difference_type, SizeType >( mine, theirs );
+				}
+
+				reference operator*() const {
+					return _p;
+				}
+
+				pointer operator->() const {
+					return &_p;
+				}
+
+				/**
+				 * Two iterators differ if any of their coordinates differs; the referenced
+				 * systems must have the same number of dimensions, otherwise an
+				 * std::invalid_argument exception is thrown.
+				 */
+				bool operator!=( const SelfType & o ) const {
+					const size_t dims = _p.system->dimensions();
+					if( o._p.system->dimensions() != dims ) {
+						throw std::invalid_argument( "system sizes do not match" );
+					}
+					for( size_t d = 0; d < dims; d++ ) {
+						if( !( _p.coords[ d ] == o._p.coords[ d ] ) ) {
+							return true;
+						}
+					}
+					return false;
+				}
+
+				/**
+				 * Builds the end iterator of \p _system.
+				 *
+				 * The end position is the one operator++ moves to from the last vector of
+				 * the system: all coordinates are 0, except the last one, which equals the
+				 * size of the last dimension.
+				 */
+				static SelfType make_system_end_iterator( const LinNDimSysType & _system ) {
+					// all coordinates start from 0
+					SelfType end_iterator( _system );
+					const size_t last = _system.dimensions() - 1;
+					end_iterator._p.coords[ last ] = _system.get_sizes()[ last ];
+					return end_iterator;
+				}
+
+			private:
+				NDimPoint _p;
+			};
+
+		} // namespace multigrid
+	}     // namespace utils
+} // namespace grb
+
+#endif // _H_GRB_ALGORITHMS_MULTIGRID_NDIM_ITERATOR
diff --git a/include/graphblas/utils/multigrid/linearized_ndim_system.hpp b/include/graphblas/utils/multigrid/linearized_ndim_system.hpp
new file mode 100644
index 000000000..c4b62707a
--- /dev/null
+++ b/include/graphblas/utils/multigrid/linearized_ndim_system.hpp
@@ -0,0 +1,287 @@
+
+/*
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file linearized_ndim_system.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * Definition of \p LinearizedNDimSystem.
+ */
+
+#ifndef _H_GRB_ALGORITHMS_MULTIGRID_NDIM_SYSTEM_LINEARIZER
+#define _H_GRB_ALGORITHMS_MULTIGRID_NDIM_SYSTEM_LINEARIZER
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "linearized_ndim_iterator.hpp"
+#include "ndim_system.hpp"
+
+namespace grb {
+	namespace utils {
+		namespace multigrid {
+
+			/**
+			 * Extends an NDimSystem by linearizing it, i.e. it provides facilities to map a vector in
+			 * NDimSystem#dimensions() dimensions to a linear value ranging from \a 0 to #system_size() (excluded)
+			 * and vice versa. Such a linearized representation allows user logic to iterate over the system:
+			 * iterators are indeed available via #begin()/#end(). Consecutive system elements along dimension 0
+			 * are mapped to consecutive linear values, while elements consecutive along dimension 1
+			 * are mapped at offset #get_offsets()[1] = #get_sizes()[0], elements along dimension 2
+			 * are mapped at offset #get_offsets()[2] = #get_sizes()[0] * #get_sizes()[1], and so on.
+			 *
+			 * Further facilities are methods to map users' vectors from linear to NDimSystem#dimensions()-dimensional
+			 * or vice versa and also to "retarget" the system, i.e. to represent a system of same dimensionality
+			 * but different sizes; this last feature is a mere performance optimization aimed at
+			 * reusing existing objects instead of deleting them and allocating new memory.
+			 *
+			 * @tparam SizeType integral type to store the size of each dimension
+			 * @tparam InternalVectorType internal vector type to store the sizes
+			 */
+			template<
+				typename SizeType,
+				typename InternalVectorType
+			> class LinearizedNDimSystem : public NDimSystem< SizeType, InternalVectorType > {
+			public:
+				static_assert( std::is_integral< SizeType >::value, "SizeType must be an integral type" );
+
+				using BaseType = NDimSystem< SizeType, InternalVectorType >;
+				using SelfType = LinearizedNDimSystem< SizeType, InternalVectorType >;
+				using VectorType = typename BaseType::VectorType;
+				using VectorReference = typename BaseType::VectorReference;
+				using ConstVectorReference = typename BaseType::ConstVectorReference;
+				using VectorStorageType = typename VectorType::VectorStorageType;
+				using ConstVectorStorageType = typename VectorType::ConstVectorStorageType;
+				using Iterator = LinearizedNDimIterator< SizeType, InternalVectorType >;
+
+				/**
+				 * Construct a new LinearizedNDimSystem object from an iterable range,
+				 * where each iterator's position stores the size along each dimension; example:
+				 * *begin is the size along dimension 0, *(++begin) is the size along dimension 1 ...
+				 */
+				template< typename IterT >
+				LinearizedNDimSystem(
+					IterT begin,
+					IterT end
+				) noexcept :
+					BaseType( begin, end ),
+					_offsets( std::distance( begin, end ) )
+				{
+					this->_system_size = compute_range_product( begin, end, this->_offsets.begin() );
+				}
+
+				/**
+				 * Construct a new LinearizedNDimSystem object with dimensions \p _sizes.size()
+				 * and sizes stored in \p _sizes.
+				 */
+				LinearizedNDimSystem( const std::vector< size_t > & _sizes ) noexcept :
+					LinearizedNDimSystem( _sizes.cbegin(), _sizes.cend() ) {}
+
+				/**
+				 * Construct a new LinearizedNDimSystem object with \p _dimensions dimensions
+				 * and sizes all equal to \p _size.
+				 */
+				LinearizedNDimSystem(
+					size_t _dimensions,
+					size_t _size
+				) noexcept :
+					BaseType( _dimensions, _size ),
+					_offsets( _dimensions ),
+					_system_size( 0 ) // placeholder: the actual value is computed here below
+				{
+					// offsets are the incremental products of the sizes, starting from 1;
+					// the product of all sizes is the size of the system
+					SizeType v = 1;
+					for( size_t i = 0; i < _dimensions; i++ ) {
+						this->_offsets[ i ] = v;
+						v *= _size;
+					}
+					this->_system_size = v;
+				}
+
+				LinearizedNDimSystem() = delete;
+
+				LinearizedNDimSystem( const SelfType & original ) = default;
+
+				/**
+				 * Move constructor: the offsets are moved out of \p original and its system
+				 * size is reset to 0.
+				 *
+				 * The base sub-object (i.e., the sizes) is copied rather than moved: the move
+				 * constructor of NDimSystem is deleted, hence forwarding an rvalue to it would
+				 * make this constructor ill-formed as soon as it is instantiated.
+				 */
+				LinearizedNDimSystem( SelfType && original ) noexcept :
+					BaseType( static_cast< const BaseType & >( original ) ),
+					_offsets( std::move( original._offsets ) ),
+					_system_size( original._system_size )
+				{
+					original._system_size = 0;
+				}
+
+				~LinearizedNDimSystem() {}
+
+				SelfType & operator=( const SelfType & ) = default;
+
+				SelfType & operator=( SelfType && original ) = delete;
+
+				/**
+				 * Returns the size of the system, i.e. its number of elements;
+				 * this corresponds to the product of the sizes along all dimensions.
+				 */
+				inline size_t system_size() const {
+					return this->_system_size;
+				}
+
+				/**
+				 * Get the offsets of the system, i.e. by how many linear elements moving along
+				 * a dimension corresponds to.
+				 */
+				inline ConstVectorReference get_offsets() const {
+					return this->_offsets;
+				}
+
+				/**
+				 * Computes the #dimensions()-dimensions vector the linear value in input corresponds to.
+				 *
+				 * The value #system_size() is accepted in input (only larger values are rejected).
+				 *
+				 * @param[in] linear linear index
+				 * @param[out] output output vector \p linear corresponds to
+				 * @throws std::range_error if \p linear is larger than #system_size()
+				 */
+				void linear_to_ndim(
+					size_t linear,
+					VectorReference output
+				) const {
+					if( linear > this->_system_size ) {
+						throw std::range_error( "linear value beyond system" );
+					}
+					// start from the highest dimension, which has the largest offset, and
+					// progressively remove from linear the contribution of each coordinate
+					for( size_t _i = this->_offsets.dimensions(); _i > 0; _i-- ) {
+						const size_t dim = _i - 1;
+						const size_t coord = linear / this->_offsets[ dim ];
+						output[ dim ] = coord;
+						linear -= ( coord * this->_offsets[ dim ] );
+					}
+					assert( linear == 0 );
+				}
+
+				/**
+				 * Computes the linear value the input vector corresponds to; this method takes in input
+				 * a const reference to \p InternalVectorType and checks whether each value in the input
+				 * vector \p ndim_vector is within the system sizes (otherwise it throws).
+				 */
+				size_t ndim_to_linear_check( ConstVectorReference ndim_vector ) const {
+					return this->ndim_to_linear_check( ndim_vector.storage() );
+				}
+
+				/**
+				 * Computes the linear value the input vector corresponds to; this method takes in input
+				 * a const reference to the underlying storage of \p InternalVectorType and checks
+				 * whether each value in the input vector \p ndim_vector is within the system sizes
+				 * (otherwise it throws std::invalid_argument).
+				 */
+				size_t ndim_to_linear_check( ConstVectorStorageType ndim_vector ) const {
+					for( size_t i = 0; i < this->dimensions(); i++ ) {
+						if( ndim_vector[ i ] >= this->get_sizes()[ i ] ) {
+							throw std::invalid_argument( "input vector beyond system sizes" );
+						}
+					}
+					return ndim_to_linear( ndim_vector );
+				}
+
+				/**
+				 * Computes the linear value the input vector corresponds to; this method takes in input
+				 * a const reference to \p InternalVectorType but does not check whether each value in the input
+				 * vector \p ndim_vector is within the system sizes.
+				 */
+				size_t ndim_to_linear( ConstVectorReference ndim_vector ) const {
+					return this->ndim_to_linear( ndim_vector.storage() );
+				}
+
+				/**
+				 * Computes the linear value the input vector corresponds to; this method takes in input
+				 * a const reference to the underlying storage of \p InternalVectorType but does not check
+				 * whether each value in the input vector \p ndim_vector is within the system sizes.
+				 */
+				size_t ndim_to_linear( ConstVectorStorageType ndim_vector ) const {
+					size_t linear = 0;
+					for( size_t i = 0; i < this->dimensions(); i++ ) {
+						linear += this->_offsets[ i ] * ndim_vector[ i ];
+					}
+					return linear;
+				}
+
+				/**
+				 * Retargets the current object to describe a system with the same number of dimensions
+				 * and sizes \p _new_sizes. If the number of dimensions of \p _new_sizes does not match
+				 * #dimensions(), an std::invalid_argument exception is thrown.
+				 */
+				void retarget( ConstVectorReference _new_sizes ) {
+					if( _new_sizes.dimensions() != this->_sizes.dimensions() ) {
+						throw std::invalid_argument( "new system must have same dimensions as previous: new "
+						+ std::to_string( _new_sizes.dimensions() ) + ", old "
+						+ std::to_string( this->_sizes.dimensions() ) );
+					}
+					this->_sizes = _new_sizes; // copy
+					// offsets and system size depend on the sizes: recompute them in place
+					this->_system_size = compute_range_product( _new_sizes.begin(), _new_sizes.end(),
+						this->_offsets.begin() );
+				}
+
+				/**
+				 * Returns a beginning iterator to the #dimensions()-dimensional system \c this describes.
+				 * The provided iterator references a system point, described both via its #dimensions()-dimensional
+				 * coordinates and via a linear value from \a 0 to #system_size() (excluded).
+				 */
+				Iterator begin() const {
+					return Iterator( *this );
+				}
+
+				/**
+				 * Return an iterator to the end of the system; this iterator should not be
+				 * dereferenced nor incremented.
+				 */
+				Iterator end() const {
+					return Iterator::make_system_end_iterator( *this );
+				}
+
+			private:
+				VectorType _offsets;
+				size_t _system_size;
+
+				/**
+				 * Incrementally computes the product of the input iterator's range, storing each value
+				 * into the position pointed to the output iterator; the accumulation starts from 1
+				 * (also the first output values), and the last accumulated value is returned directly
+				 * (and not stored). This assumes that the output container can store at least as many values
+				 * as in the input range.
+				 */
+				template<
+					typename IterIn,
+					typename IterOut
+				> static size_t compute_range_product(
+					IterIn in_begin,
+					IterIn in_end,
+					IterOut out_begin
+				) {
+					size_t prod = 1;
+					for( ; in_begin != in_end; ++in_begin, ++out_begin ) {
+						*out_begin = prod;
+						prod *= *in_begin;
+					}
+					return prod;
+				}
+			};
+
+		} // namespace multigrid
+	}     // namespace utils
+} // namespace grb
+
+#endif // _H_GRB_ALGORITHMS_MULTIGRID_NDIM_SYSTEM_LINEARIZER
diff --git a/include/graphblas/utils/multigrid/ndim_system.hpp b/include/graphblas/utils/multigrid/ndim_system.hpp
new file mode 100644
index 000000000..5df62ace2
--- /dev/null
+++ b/include/graphblas/utils/multigrid/ndim_system.hpp
@@ -0,0 +1,118 @@
+
+/*
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file ndim_system.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * Definition of NDimSystem.
+ */
+
+#ifndef _H_GRB_ALGORITHMS_MULTIGRID_NDIM_SYSTEM
+#define _H_GRB_ALGORITHMS_MULTIGRID_NDIM_SYSTEM
+
+#include <algorithm>
+#include <cstddef>
+#include <type_traits>
+#include <vector>
+
+namespace grb {
+	namespace utils {
+		namespace multigrid {
+
+			/**
+			 * Describes a #dimensions()-dimensional system by storing its size along each dimension.
+			 *
+			 * It is meant to represent a grid of #dimensions() dimensions and size #get_sizes()[d]
+			 * for each dimension \a d in the interval <em>[0, #dimensions())</em>.
+			 *
+			 * @tparam SizeType integral type to store the size of each dimension
+			 * @tparam InternalVectorType internal vector type to store the sizes
+			 */
+			template<
+				typename SizeType,
+				typename InternalVectorType
+			> class NDimSystem {
+			public:
+				static_assert( std::is_integral< SizeType >::value, "SizeType must be an integral type" );
+
+				using SelfType = NDimSystem< SizeType, InternalVectorType >;
+				using VectorType = InternalVectorType;
+				using VectorReference = VectorType &;
+				using ConstVectorReference = const VectorType &;
+
+				/**
+				 * Builds a system whose sizes are the values in the range [ \p begin, \p end ):
+				 * the i-th value of the range is the size along dimension i.
+				 *
+				 * The number of dimensions is computed via \a std::distance(begin,end), hence
+				 * a random-access iterator is advisable for performance.
+				 *
+				 * @tparam IterType iterator type
+				 * @param begin beginning of the range of sizes
+				 * @param end end of the range of sizes
+				 */
+				template< typename IterType >
+				NDimSystem(
+					IterType begin,
+					IterType end
+				) noexcept :
+					_sizes( std::distance( begin, end ) )
+				{
+					std::copy( begin, end, _sizes.begin() );
+				}
+
+				/**
+				 * Builds a system from an std::vector<>: the length of \p _sizes is the number
+				 * of dimensions, its values are the sizes of the system.
+				 */
+				NDimSystem( const std::vector< size_t > & _sizes ) noexcept :
+					NDimSystem( _sizes.cbegin(), _sizes.cend() ) {}
+
+				/**
+				 * Builds a system with \p _dimensions dimensions, whose sizes are all equal
+				 * to \p max_size.
+				 */
+				NDimSystem(
+					size_t _dimensions,
+					size_t max_size
+				) noexcept :
+					_sizes( _dimensions )
+				{
+					std::fill_n( _sizes.begin(), _dimensions, max_size );
+				}
+
+				NDimSystem() = delete;
+
+				NDimSystem( const SelfType & ) = default;
+
+				NDimSystem( SelfType && ) = delete;
+
+				SelfType & operator=( const SelfType & original ) = default;
+
+				SelfType & operator=( SelfType && original ) = delete;
+
+				/**
+				 * Number of dimensions of the system, as reported by the internal vector
+				 * of sizes.
+				 */
+				inline size_t dimensions() const noexcept {
+					return this->_sizes.dimensions();
+				}
+
+				/**
+				 * Sizes of the system, as a const reference to the internal
+				 * \p InternalVectorType object storing them.
+				 */
+				inline ConstVectorReference get_sizes() const noexcept {
+					return _sizes;
+				}
+
+			protected:
+				InternalVectorType _sizes;
+			};
+
+		} // namespace multigrid
+	}     // namespace utils
+} // namespace grb
+
+#endif // _H_GRB_ALGORITHMS_MULTIGRID_NDIM_SYSTEM
diff --git a/include/graphblas/utils/multigrid/ndim_vector.hpp b/include/graphblas/utils/multigrid/ndim_vector.hpp
new file mode 100644
index 000000000..5a3ef4144
--- /dev/null
+++ b/include/graphblas/utils/multigrid/ndim_vector.hpp
@@ -0,0 +1,201 @@
+
+/*
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file ndim_vector.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ * Definition of NDimVector.
+ */
+
+#ifndef _H_GRB_ALGORITHMS_MULTIGRID_NDIM_VECTOR
+#define _H_GRB_ALGORITHMS_MULTIGRID_NDIM_VECTOR
+
+#include <algorithm>
+#include <cstddef>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "linearized_ndim_system.hpp"
+
+namespace grb {
+	namespace utils {
+		namespace multigrid {
+
+			/**
+			 * Maps an N-dimensional vector to an array of data.
+			 *
+			 * The user constructs an object by passing the sizes (as an N-dimensional vector)
+			 * of the iteration space and accesses the stored data via an N-dimensional vector of coordinates.
+			 *
+			 * Example: if the user constructs an NDimVector with 3D sizes \a [2,3,4], she can access data
+			 * via a 3D coordinates vector of ranges \a [0-1]x[0-2]x[0-3] (here \a x denoting the cartesian product)
+			 * by using the #at() method.
+			 *
+			 * This facility allows associating a value of type \p DataType to, for example,
+			 * each element of an N-dimensional grid.
+			 *
+			 * The data array is owned by the object: it is allocated at construction and released
+			 * at destruction. Objects can be copy- and move-constructed, but not assigned.
+			 *
+			 * @tparam DataType type of data stored in the array
+			 * @tparam SizeType type for the components of the N-dimensional vector:
+			 * 	the maximum number of stored data is thus \f$ std::numeric_limits<SizeType>::max()^N \f$
+			 * @tparam InternalVectorType storage type of the internal N-dimensional vector
+			 */
+			template<
+				typename DataType,
+				typename SizeType,
+				typename InternalVectorType
+			> class NDimVector {
+			public:
+				static_assert( std::is_default_constructible< DataType >::value,
+					"the stored type is not default constructible" );
+				static_assert( std::is_integral< SizeType >::value, "SizeType must be integral" );
+
+				using ConstDomainVectorReference = typename LinearizedNDimSystem< SizeType,
+					InternalVectorType >::ConstVectorReference;
+				using ConstDomainVectorStorageType = typename InternalVectorType::ConstVectorStorageType;
+				using DomainIterator = typename LinearizedNDimSystem< SizeType, InternalVectorType >::Iterator;
+				using Selftype = NDimVector< DataType, SizeType, InternalVectorType >;
+
+				NDimVector() = delete;
+
+				/**
+				 * Construct a new NDimVector object with sizes read from the iteration range
+				 * and number of dimensions equal to the range distance; the data values are
+				 * \b not initialized.
+				 *
+				 * @tparam IterT type of the iterators over the sizes
+				 * @param begin beginning of the range of sizes
+				 * @param end end of the range of sizes
+				 */
+				template< typename IterT > NDimVector(
+					IterT begin,
+					IterT end
+				) :
+					_linearizer( begin, end ),
+					// _linearizer is declared before data, hence it is already initialized here
+					data( new DataType[ _linearizer.system_size() ] )
+				{}
+
+				/**
+				 * Construct a new NDimVector object with sizes read from the \p _sizes
+				 * and number of dimensions equal to \p _sizes.size(); the data values are
+				 * \b not initialized.
+				 */
+				NDimVector( const std::vector< size_t > & _sizes ) :
+					NDimVector( _sizes.cbegin(), _sizes.cend() ) {}
+
+				/**
+				 * Copy constructor: allocates a new data array of the same size as that of
+				 * \p original and copies all of its elements.
+				 */
+				NDimVector( const Selftype & original ) :
+					_linearizer( original._linearizer ),
+					data( new DataType[ original.data_size() ] )
+				{
+					std::copy_n( original.data, original.data_size(), this->data );
+				}
+
+				/**
+				 * Move constructor: takes over the data array of \p original, which is left
+				 * without data (its pointer is set to \a nullptr).
+				 */
+				NDimVector( Selftype && original ) noexcept :
+					// NOTE(review): _linearizer is a const member, so std::move() yields a const
+					// rvalue here; which constructor this selects depends on LinearizedNDimSystem
+					_linearizer( std::move( original._linearizer ) ),
+					data( original.data )
+				{
+					original.data = nullptr;
+				}
+
+				Selftype & operator=( const Selftype & original ) = delete;
+
+				Selftype & operator=( Selftype && original ) = delete;
+
+				/**
+				 * Releases the owned data array, if any.
+				 */
+				~NDimVector() {
+					this->clean_mem();
+				}
+
+				/**
+				 * Number of dimensions of the underlying geometrical space.
+				 */
+				size_t dimensions() const {
+					return this->_linearizer.dimensions();
+				}
+
+				/**
+				 * Size of the underlying geometrical space, i.e. number of stored data elements.
+				 */
+				size_t data_size() const {
+					return this->_linearizer.system_size();
+				}
+
+				/**
+				 * Access the data element at N-dimension coordinate given by the iterable
+				 * \p coordinates.
+				 */
+				inline DataType & at( ConstDomainVectorReference coordinates ) {
+					return this->data[ this->get_coordinate( coordinates.storage() ) ];
+				}
+
+				/**
+				 * Const-access the data element at N-dimension coordinate given by the iterable
+				 * \p coordinates.
+				 */
+				inline const DataType & at( ConstDomainVectorReference coordinates ) const {
+					return this->data[ this->get_coordinate( coordinates.storage() ) ];
+				}
+
+				/**
+				 * Access the data element at N-dimension coordinate given by the vector
+				 * storage object \p coordinates.
+				 */
+				inline DataType & at( ConstDomainVectorStorageType coordinates ) {
+					return this->data[ this->get_coordinate( coordinates ) ];
+				}
+
+				/**
+				 * Const-access the data element at N-dimension coordinate given by the vector
+				 * storage object \p coordinates.
+				 */
+				inline const DataType & at( ConstDomainVectorStorageType coordinates ) const {
+					return this->data[ this->get_coordinate( coordinates ) ];
+				}
+
+				/**
+				 * Returns an iterator to the beginning of the N-dimensional underlying space,
+				 * i.e. a vector \a [0,0,0,...,0].
+				 */
+				DomainIterator domain_begin() const {
+					return this->_linearizer.begin();
+				}
+
+				/**
+				 * Returns an iterator to the end of the N-dimensional underlying space.
+				 * This iterator should not be dereferenced nor incremented.
+				 */
+				DomainIterator domain_end() const {
+					return this->_linearizer.end();
+				}
+
+			private:
+				/// translates N-dimensional coordinates into positions within #data
+				const LinearizedNDimSystem< SizeType, InternalVectorType > _linearizer;
+				/// owned array of data_size() elements, or \a nullptr after being moved from
+				DataType * data;
+
+				/**
+				 * Position inside #data of the element at the N-dimensional \p coordinates,
+				 * as computed by the linearizer.
+				 */
+				inline size_t get_coordinate( ConstDomainVectorStorageType coordinates ) const {
+					return this->_linearizer.ndim_to_linear( coordinates );
+				}
+
+				/**
+				 * Position inside #data of the element the iterator \p coordinates refers to,
+				 * as computed by the linearizer.
+				 */
+				inline size_t get_coordinate( DomainIterator coordinates ) const {
+					return this->_linearizer.ndim_to_linear( coordinates );
+				}
+
+				/**
+				 * Releases the owned data array.
+				 */
+				void clean_mem() {
+					// delete[] on a null pointer is a no-op, which covers moved-from objects
+					delete[] this->data;
+					this->data = nullptr;
+				}
+			};
+
+		} // namespace multigrid
+	}     // namespace utils
+} // namespace grb
+
+#endif // _H_GRB_ALGORITHMS_MULTIGRID_NDIM_VECTOR
diff --git a/include/graphblas/utils/telemetry/CSVWriter.hpp b/include/graphblas/utils/telemetry/CSVWriter.hpp
new file mode 100644
index 000000000..d92d5efd1
--- /dev/null
+++ b/include/graphblas/utils/telemetry/CSVWriter.hpp
@@ -0,0 +1,354 @@
+
+/*
+ *   Copyright 2023 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CSVWriter.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ *
+ * Definition for the CSVWriter class.
+ */
+
+#ifndef _H_GRB_UTILS_TELEMETRY_CSV_WRITER
+#define _H_GRB_UTILS_TELEMETRY_CSV_WRITER
+
+#include <fstream>
+#include <initializer_list>
+#include <ostream>
+#include <stdexcept>
+#include <string>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "TelemetryBase.hpp"
+
+namespace grb {
+	namespace utils {
+		namespace telemetry {
+
+			/// Standard CSV field separator (a comma), used when no separator is explicitly given.
+			static constexpr char STD_CSV_SEP = ',';
+
+			/**
+			 * Compile-time check that types can be stored as CSV fields, i.e., that they
+			 * are arithmetic types according to \a std::is_arithmetic.
+			 *
+			 * Primary template: only the first type \p U is checked.
+			 */
+			template< class U, class... Us >
+			struct __is_csv_printable {
+				static constexpr bool value = std::is_arithmetic< U >::value;
+			};
+
+			/**
+			 * Specialization for two or more types: the first type must be arithmetic
+			 * and the check recurses on the remaining ones.
+			 */
+			template< class U1, class U2, class... Us >
+			struct __is_csv_printable< U1, U2, Us... > {
+				static constexpr bool value =
+					std::is_arithmetic< U1 >::value && __is_csv_printable< U2, Us... >::value;
+			};
+
+			/**
+			 * Class to store numerical information in form of lines and emit it as a CSV, with
+			 * heading, field separator and newlines.
+			 *
+			 * The user should add a line at once via #add_line( UTypes && ) and can
+			 * then output it to an \a std::ostream or a file, together with the
+			 * heading specified at construction. The output is a fully compliant CSV file
+			 * that can be read by common tools like spreadsheets and parsers (e.g. Pandas,
+			 * https://pandas.pydata.org/). This class allows easily emitting telemetry
+			 * information and importing them into advanced tools for thorough analysis.
+			 *
+			 * This implementation assumes telemetry is enabled, since a specialization for
+			 * disabled telemetry follows.
+			 * It internally allocates memory dynamically to store the lines.
+			 * Only numerical information can be stored.
+			 *
+			 * If the telemetry controller is inactive at runtime, lines are not stored and
+			 * nothing is emitted.
+			 *
+			 * @tparam TelControllerType type for the telemetry controller
+			 * @tparam enabled whether telemetry is enabled
+			 * @tparam T1 numerical type of the first field to store (at least one is required)
+			 * @tparam Ts numerical types of the following fields to store
+			 */
+			template<
+				typename TelControllerType,
+				bool enabled,
+				class T1,
+				class... Ts
+			> class CSVWriter :
+				public TelemetryBase< TelControllerType, enabled > {
+			public:
+				static_assert( __is_csv_printable< T1, Ts... >::value, "not all types are printable" );
+
+				using self_t = CSVWriter< TelControllerType, enabled, T1, Ts... >;
+
+				using base_t = TelemetryBase< TelControllerType, enabled >;
+
+				/**
+				 * Lightweight handle to the last line of a CSVWriter, which can be streamed
+				 * into an \a std::ostream via the \a << operator.
+				 *
+				 * It stores a reference to the CSVWriter, which must thus outlive it.
+				 */
+				class CSVLastTuple {
+				public:
+					CSVLastTuple( const self_t & _csv ) : csv( _csv ) {}
+
+					CSVLastTuple( const CSVLastTuple & clt ) : csv( clt.csv ) {}
+
+					/// Streams the last line of the referenced CSVWriter into \p stream.
+					inline friend std::ostream & operator<<( std::ostream & stream, const CSVLastTuple & t ) {
+						return t.csv.write_last_line_to_stream( stream );
+					}
+
+				private:
+					const self_t & csv;
+				};
+
+				CSVWriter() = delete;
+
+				/**
+				 * Full constructor for a CSVWriter.
+				 *
+				 * @param tt telemetry controller
+				 * @param _headers CSV headers, whose number must match the number of T types to print
+				 * @param _separator field separator for printing
+				 * @param size hint size for initial memory allocation (dynamic allocation may occur anyway)
+				 *
+				 * @throws std::runtime_error if the number of headers differs from the number of fields
+				 */
+				CSVWriter(
+					const TelControllerType & tt,
+					std::initializer_list< const char * > _headers,
+					char _separator,
+					size_t size
+				) :
+					base_t( tt ),
+					separator( _separator )
+				{
+					if( _headers.size() != NUM_FIELDS ) {
+						throw std::runtime_error( "wrong number of headers, it must match the number of line elements" );
+					}
+					// emplace anyway, so that the object is always in a consistent state and can be
+					// activated/deactivated at runtime
+					for( const auto & h : _headers ) {
+						headers.emplace_back( h );
+					}
+					// pre-allocate the storage for the lines only if telemetry is currently active
+					if( ! tt.is_active() ) {
+						return;
+					}
+					lines.reserve( size );
+				}
+
+				/**
+				 * Construct a new CSVWriter object assuming a comma separator (#STD_CSV_SEP)
+				 * and an initial capacity of 10 lines.
+				 */
+				CSVWriter(
+					const TelControllerType & tt,
+					std::initializer_list< const char * > _headers
+				) : CSVWriter( tt, _headers, STD_CSV_SEP, 10 ) {}
+
+				CSVWriter( const self_t & ) = delete;
+
+				CSVWriter( self_t && ) = delete;
+
+				self_t & operator=( const self_t & ) = delete;
+
+				self_t & operator=( self_t && ) = delete;
+
+				/**
+				 * Add a line to the CSV, i.e., store the numerical information internally.
+				 * If telemetry is not active, nothing is stored.
+				 *
+				 * @tparam UTypes information types whose number must match the number of fields in the CSV;
+				 * 	these types must also be implicitly convertible to the corresponding T1, Ts... types
+				 */
+				template< class... UTypes >
+				void add_line( UTypes &&... vs ) {
+					if( this->is_active() ) {
+						lines.emplace_back( std::forward< UTypes >( vs )... );
+					}
+				}
+
+				/**
+				 * Remove all lines from the CSV.
+				 */
+				void clear() {
+					lines.clear();
+				}
+
+				/**
+				 * Emit the last line of the CSV into \p stream as actual text, i.e. with the fields separated.
+				 * Does not print the newline.
+				 *
+				 * If there is no line stored or telemetry is not active, nothing is written.
+				 *
+				 * @param stream stream to write into
+				 * @return std::ostream& \p stream itself
+				 */
+				std::ostream & write_last_line_to_stream( std::ostream & stream ) const {
+					if( ! lines.empty() && this->is_active() ) {
+						write_line( stream, lines.back() );
+					}
+					return stream;
+				}
+
+				/**
+				 * Returns an object that can be streamed into an std::cout stream via the \a << operator
+				 * in order to print the last line stored.
+				 *
+				 * If telemetry is not active, no line can be stored: the returned object is
+				 * then valid and prints nothing when streamed, so that user code needs no
+				 * special handling for deactivated telemetry.
+				 *
+				 * @throws std::runtime_error if telemetry is active and no line is stored
+				 */
+				CSVLastTuple last_line() const {
+					if( this->is_active() && lines.empty() ) {
+						throw std::runtime_error( "no measures" );
+					}
+					return CSVLastTuple( *this );
+				}
+
+				/**
+				 * Write the entire CSV into \p stream, with heading (heading, separated fields with newline).
+				 * If telemetry is not active, nothing is written.
+				 *
+				 * @param stream stream to write into
+				 * @return std::ostream& \p stream itself
+				 */
+				std::ostream & write_to_stream( std::ostream & stream ) const {
+					if( ! this->is_active() ) {
+						return stream;
+					}
+					write_header( stream );
+					stream << NEW_LINE;
+					for( const tuple_t & line : lines ) {
+						write_line( stream, line );
+						stream << NEW_LINE;
+					}
+					return stream;
+				}
+
+				/**
+				 * Creates a new file named \p name (or overwrites an existing one) and stores the entire CSV
+				 * into it. If telemetry is not active, no file is created.
+				 *
+				 * @param name path of the file to write
+				 * @throws std::runtime_error if the file cannot be opened
+				 */
+				void write_to_file( const char * name ) const {
+					if( ! this->is_active() ) {
+						return;
+					}
+					std::ofstream file( name );
+					if( ! file.is_open() ) {
+						throw std::runtime_error( "cannot open file" );
+					}
+					write_to_stream( file );
+					file.close();
+				}
+
+			private:
+				static constexpr char NEW_LINE = '\n';
+
+				/// number of fields of each line
+				static constexpr size_t NUM_FIELDS = sizeof...( Ts ) + 1;
+
+				/// type of a stored line
+				using tuple_t = std::tuple< T1, Ts... >;
+
+				std::vector< std::string > headers; ///< column names, exactly NUM_FIELDS of them
+				const char separator;               ///< field separator
+				std::vector< tuple_t > lines;       ///< stored lines, in insertion order
+
+				/// Writes the separated column names into \p stream, without newline.
+				std::ostream & write_header( std::ostream & stream ) const {
+					// the constructor guarantees that there are NUM_FIELDS >= 1 headers
+					stream << headers[ 0 ];
+					for( size_t i = 1; i < headers.size(); i++ ) {
+						stream << separator << headers[ i ];
+					}
+					return stream;
+				}
+
+				/// Writes the separated fields of \p line into \p stream, without newline.
+				void write_line( std::ostream & stream, const tuple_t & line ) const {
+					write_val< 0 >( stream, line );
+				}
+
+				// recursive case: field OFFS is not the last one, hence it is followed by the separator
+				template< size_t OFFS >
+				inline void write_val(
+					std::ostream & stream,
+					typename std::enable_if< ( OFFS < NUM_FIELDS - 1 ), const tuple_t & >::type _tup
+				) const {
+					stream << std::get< OFFS >( _tup ) << separator;
+					write_val< OFFS + 1 >( stream, _tup ); // tail recursion
+				}
+
+				// base case: the last field is not followed by the separator
+				template< size_t OFFS >
+				inline void write_val(
+					std::ostream & stream,
+					typename std::enable_if< OFFS == NUM_FIELDS - 1, const tuple_t & >::type _tup
+				) const {
+					stream << std::get< OFFS >( _tup );
+				}
+			};
+
+			/**
+			 * Template specialization that assumes disabled telemetry: no state is kept,
+			 * operations produce no result when invoked (no output into streams, no file creation).
+			 *
+			 * The methods have the same names and parameters as those of the implementation
+			 * for enabled telemetry, so that user code compiles against both.
+			 *
+			 * @tparam TelControllerType type for the telemetry controller
+			 * @tparam T1 numerical type of the first field
+			 * @tparam Ts numerical types of the following fields
+			 */
+			template<
+				typename TelControllerType,
+				class T1,
+				class... Ts
+			> class CSVWriter< TelControllerType, false, T1, Ts... > :
+				public TelemetryBase< TelControllerType, false > {
+			public:
+				static_assert( __is_csv_printable< T1, Ts... >::value, "not all types are printable" );
+
+				using self_t = CSVWriter< TelControllerType, false, T1, Ts... >;
+
+				using base_t = TelemetryBase< TelControllerType, false >;
+
+				CSVWriter() = delete;
+
+				/**
+				 * Full constructor: headers, separator and size hint are ignored.
+				 */
+				CSVWriter(
+					const TelControllerType & tt,
+					std::initializer_list< const char * >,
+					char,
+					size_t
+				) : base_t( tt ) {}
+
+				/**
+				 * Constructor with default separator and size hint, which are ignored anyway.
+				 */
+				CSVWriter(
+					const TelControllerType & tt,
+					std::initializer_list< const char * > _headers
+				) : CSVWriter( tt, _headers, STD_CSV_SEP, 10 ) {}
+
+				CSVWriter( const self_t & ) = delete;
+
+				CSVWriter( self_t && ) = delete;
+
+				self_t & operator=( const self_t & ) = delete;
+
+				self_t & operator=( self_t && ) = delete;
+
+				/**
+				 * Stores nothing; it only checks at compile time that the number of
+				 * arguments matches the number of fields of the CSV.
+				 */
+				template< class... UTypes > void add_line( UTypes &&... ) {
+					// the message makes the assertion valid also before C++17
+					static_assert( sizeof...( UTypes ) == sizeof...( Ts ) + 1,
+						"the number of values must match the number of fields" );
+				}
+
+				/// Does nothing, since no line is stored.
+				void clear() {}
+
+				/// Writes nothing and returns \p stream itself.
+				std::ostream & write_last_line_to_stream( std::ostream & stream ) const {
+					return stream;
+				}
+
+				/// Returns a null character as a placeholder for the last line.
+				char last_line() const {
+					return '\0';
+				}
+
+				/// Writes nothing and returns \p stream itself.
+				std::ostream & write_to_stream( std::ostream & stream ) const {
+					return stream;
+				}
+
+				/// Does nothing: no file is created.
+				void write_to_file( const char * name ) const {
+					(void)name;
+				}
+			};
+
+			/**
+			 * Convenience alias for a CSVWriter with telemetry enabled at compile time and
+			 * controlled by a TelemetryControllerAlwaysOn controller.
+			 *
+			 * @tparam T1 numerical type of the first field to store
+			 * @tparam Ts numerical types of the following fields to store
+			 */
+			template< class T1, class... Ts >
+			using StaticCSVWriter = CSVWriter< TelemetryControllerAlwaysOn, true, T1, Ts... >;
+
+		} // namespace telemetry
+	}     // namespace utils
+} // namespace grb
+
+#endif // _H_GRB_UTILS_TELEMETRY_CSV_WRITER
diff --git a/include/graphblas/utils/telemetry/OutputStream.hpp b/include/graphblas/utils/telemetry/OutputStream.hpp
new file mode 100644
index 000000000..3d7c9fb1b
--- /dev/null
+++ b/include/graphblas/utils/telemetry/OutputStream.hpp
@@ -0,0 +1,255 @@
+
+/*
+ *   Copyright 2023 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file OutputStream.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ *
+ * Definition for the OutputStream class.
+ */
+
+#ifndef _H_GRB_UTILS_TELEMETRY_OUTPUT_STREAM
+#define _H_GRB_UTILS_TELEMETRY_OUTPUT_STREAM
+
+#include <cstddef>
+#include <functional>
+#include <ostream>
+#include <type_traits>
+#include <utility>
+
+#include "TelemetryBase.hpp"
+
+namespace grb {
+	namespace utils {
+		namespace telemetry {
+
+			/**
+			 * SFINAE-based class to check whether the type \p T can be input to an std::ostream
+			 * via the \a << operator.
+			 *
+			 * The static member \a value is \a true if the expression <tt>stream << obj</tt>, with
+			 * \a stream of type \a std::ostream& and \a obj of type \p T, is valid and has type
+			 * \a std::ostream&; it is \a false otherwise.
+			 *
+			 * @tparam T type to check
+			 */
+			template< typename T > struct is_ostream_input {
+			private:
+
+				// candidate preferred by overload resolution when called with nullptr: it
+				// exists only if the << expression is valid and returns std::ostream&,
+				// otherwise SFINAE discards it
+				template< typename U > static constexpr bool is_input(
+					typename std::enable_if< std::is_same<
+						// this means that the expression std::cout << obj is valid, where obj is of type T
+						decltype( std::declval< std::ostream& >() << std::declval< U >() ),
+						// std::nullptr_t is qualified: the unqualified name is not guaranteed to exist
+						std::ostream& >::value, std::nullptr_t >::type
+				) {
+					return true;
+				}
+
+				// fallback candidate, selected only if the previous one is discarded
+				template< typename U > static constexpr bool is_input( ... ) {
+					return false;
+				}
+
+			public:
+				static constexpr bool value = is_input< T >( nullptr );
+			};
+
+			/**
+			 * Telemetry-controllable output stream with basic interface, based on the \a << operator.
+			 *
+			 * It accepts in input any type \a std::ostream accepts. In addition, it also accepts
+			 * the internal #OutputStreamLazy<RetType> type, which marks callable objects and allows
+			 * lazy evaluation of their result if the telemetry is active; if not, the object is
+			 * not called, avoiding runtime costs. This functionality allows paying time and memory
+			 * costs of computation only if really needed.
+			 *
+			 * The object stores a reference to the underlying \a std::ostream, which must
+			 * thus outlive it.
+			 *
+			 * @tparam TelControllerType type of the telemetry controller
+			 * @tparam enabled whether telemetry is enabled for this type
+			 */
+			template<
+				typename TelControllerType,
+				bool enabled = TelControllerType::enabled
+			> class OutputStream : public TelemetryBase< TelControllerType, enabled > {
+			public:
+				using self_t = OutputStream< TelControllerType, enabled >;
+
+				using base_t = TelemetryBase< TelControllerType, enabled >;
+
+				/**
+				 * Marker object to indicate that the stored callable object is to be called
+				 * in a lazy way, i.e., only if output is active.
+				 *
+				 * @tparam RetType return type of the callable object, to be printed
+				 */
+				template< typename RetType > class OutputStreamLazy {
+
+					const std::function< RetType() > f; ///< the wrapped callable object
+
+				public:
+					// the message makes the assertion valid also before C++17
+					static_assert( is_ostream_input< RetType >::value,
+						"the return type of the callable object cannot be input to an std::ostream" );
+
+					template< class F > OutputStreamLazy( F&& _f ) : f( std::forward< F >( _f ) ) {}
+
+					/// Calls the wrapped callable object and returns its result.
+					RetType operator()() const { return f(); }
+				};
+
+				/**
+				 * Convenience function to create an #OutputStreamLazy<RetType> object from
+				 * a callable one, inferring all template parameters automatically.
+				 *
+				 * @tparam CallableType type of the given callable object
+				 * @tparam RetType return type of the callable object, to be printed
+				 * @param f callable object
+				 * @return OutputStreamLazy< RetType > object marking lazy evaluation for printing
+				 */
+				template<
+					typename CallableType,
+					typename RetType = decltype( std::declval< CallableType >()() )
+				> static OutputStreamLazy< RetType > makeLazy( CallableType&& f ) {
+					static_assert( is_ostream_input< RetType >::value,
+						"the return type of the callable object cannot be input to an std::ostream" );
+					return OutputStreamLazy< RetType >( std::forward< CallableType >( f ) );
+				}
+
+				/**
+				 * Construct a new Output Stream object from a telemetry controller \p _tt
+				 * and an output stream \p _out (usually \a std::cout)
+				 */
+				OutputStream(
+					const TelControllerType & _tt,
+					std::ostream & _out
+				) :
+					base_t( _tt ),
+					out( _out )
+				{}
+
+				/**
+				 * Copy constructor.
+				 */
+				OutputStream( const self_t & _outs ) = default;
+
+				OutputStream & operator=( const self_t & _out ) = delete;
+
+				/**
+				 * Stream input operator, enabled for all types std::ostream supports.
+				 * The value \p v is forwarded to the underlying stream only if the stream is active.
+				 */
+				template< typename T > inline typename std::enable_if< is_ostream_input< T >::value,
+					self_t & >::type operator<<( T&& v ) {
+					if ( this->is_active() ) {
+						out << std::forward< T >( v );
+					}
+					return *this;
+				}
+
+				/**
+				 * Specialization of the \a << operator for stream manipulators, to support
+				 * \a std::endl and similar manipulators.
+				 */
+				inline self_t & operator<<( std::ostream& (*func)( std::ostream& ) ) {
+					if ( this->is_active() ) {
+						out << func;
+					}
+					return *this;
+				}
+
+				/**
+				 * Specialization of the \a << operator for lazy evaluation of callable objects.
+				 *
+				 * A callable object can be wrapped into an #OutputStreamLazy<F> object in order
+				 * to be called only if necessary, i.e., only if the stream \a this is active.
+				 * In this case, the internal callable object is called, its result is materialized
+				 * and sent into the stream.
+				 *
+				 * To conveniently instantiate an #OutputStreamLazy<F> to pass to this operator,
+				 * see #makeLazy(CallableType&&).
+				 *
+				 * @tparam F return type of the wrapped callable object
+				 * @param fun wrapper of the callable object
+				 * @return self_t & the stream itself
+				 */
+				template< class F > inline typename std::enable_if<
+					is_ostream_input< decltype( std::declval< OutputStreamLazy< F > >()() ) >::value,
+				self_t & >::type operator<<( const OutputStreamLazy< F >& fun ) {
+					if ( this->is_active() ) {
+						out << fun();
+					}
+					return *this;
+				}
+
+			private:
+				std::ostream & out; ///< underlying stream, written only if the stream is active
+			};
+
+			/**
+			 * Template specialization of OutputStream<TelControllerType,enabled>
+			 * for deactivated telemetry: no information is stored, no output produced.
+			 */
+			template<
+				typename TelControllerType
+			> class OutputStream< TelControllerType, false > :
+				public TelemetryBase< TelControllerType, false > {
+			public:
+				using self_t = OutputStream< TelControllerType, false >;
+
+				/**
+				 * Marker for lazily evaluated callable objects: with deactivated telemetry
+				 * the callable object is neither stored nor ever called.
+				 *
+				 * @tparam RetType return type of the callable object
+				 */
+				template< typename RetType > struct OutputStreamLazy {
+
+					// the message makes the assertion valid also before C++17
+					static_assert( is_ostream_input< RetType >::value,
+						"the return type of the callable object cannot be input to an std::ostream" );
+
+					template< class F > OutputStreamLazy( F&& ) {}
+
+					/// Returns a null character as a placeholder, without calling anything.
+					constexpr char operator()() const { return '\0'; }
+				};
+
+				/**
+				 * Creates an #OutputStreamLazy<RetType> marker out of the callable object \p f,
+				 * which is not stored.
+				 *
+				 * @tparam CallableType type of the given callable object
+				 * @tparam RetType return type of the callable object
+				 */
+				template<
+					typename CallableType,
+					typename RetType = decltype( std::declval< CallableType >()() )
+				> static OutputStreamLazy< RetType > makeLazy( CallableType&& f ) {
+					static_assert( is_ostream_input< RetType >::value,
+						"the return type of the callable object cannot be input to an std::ostream" );
+					return OutputStreamLazy< RetType >( std::forward< CallableType >( f ) );
+				}
+
+				OutputStream() = default;
+
+				/**
+				 * Constructor with the same parameters as the one for activated telemetry;
+				 * the output stream is ignored.
+				 */
+				OutputStream( const TelControllerType & _tt, std::ostream & ) :
+					TelemetryBase< TelControllerType, false >( _tt ) {}
+
+				OutputStream( const self_t & _out ) = default;
+
+				OutputStream & operator=( const self_t & _out ) = delete;
+
+				/**
+				 * Accepts stream manipulators like \a std::endl, doing nothing.
+				 */
+				inline self_t & operator<<( std::ostream& (*)( std::ostream& ) ) {
+					return *this;
+				}
+
+				/**
+				 * All-capturing implementation for the input stream operator, printing nothing.
+				 *
+				 * This operator is convenient especially for debugging cases.
+				 * In case of "normal" stream types used with custom data types, the user
+				 * must extend them manually to print the custom data type. If the user uses a
+				 * deactivated stream (for example as a default template parameter to disable
+				 * logging by default), she needs not extend it for custom types in order
+				 * to make it compile, which is especially nonsensical when the output is deactivated.
+				*/
+				template< typename T > self_t & operator<<( T&& ) {
+					return *this;
+				}
+			};
+
+			/// Always active output stream (telemetry enabled, TelemetryControllerAlwaysOn controller),
+			/// mainly for debugging purposes.
+			using OutputStreamOn = OutputStream< TelemetryControllerAlwaysOn, true >;
+
+			/// Always inactive output stream (telemetry disabled, TelemetryControllerAlwaysOff controller).
+			using OutputStreamOff = OutputStream< TelemetryControllerAlwaysOff, false >;
+
+		}
+	}
+}
+
+#endif // _H_GRB_UTILS_TELEMETRY_OUTPUT_STREAM
diff --git a/include/graphblas/utils/telemetry/Stopwatch.hpp b/include/graphblas/utils/telemetry/Stopwatch.hpp
new file mode 100644
index 000000000..f599ede03
--- /dev/null
+++ b/include/graphblas/utils/telemetry/Stopwatch.hpp
@@ -0,0 +1,238 @@
+
+/*
+ *   Copyright 2023 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file Stopwatch.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ *
+ * Definition for the Stopwatch class.
+ */
+
+#ifndef _H_GRB_UTILS_TELEMETRY_STOPWATCH
+#define _H_GRB_UTILS_TELEMETRY_STOPWATCH
+
+#include <chrono>
+
+#include "TelemetryBase.hpp"
+
+namespace grb {
+	namespace utils {
+		namespace telemetry {
+
+			/**
+			 * Type to store time duration in nanoseconds, which is the default time granularity.
+			 * It is an unsigned integral type (\a size_t).
+			 */
+			using duration_nano_t = size_t;
+
+			/**
+			 * Duration as floating point type, for time granularities coarser than nanoseconds.
+			 * It is the return type of the conversion functions of StopwatchBase.
+			 */
+			using duration_float_t = double;
+
+			/**
+			 * Common logic for all Stopwatch implementations: conversions of a duration in
+			 * nanoseconds to coarser time units, expressed as floating point values.
+			 */
+			class StopwatchBase {
+			public:
+
+				/**
+				 * Microseconds corresponding to \p nano nanoseconds, as a duration_float_t.
+				 */
+				static inline duration_float_t nano2Micro( duration_nano_t nano ) {
+					const duration_float_t nanoseconds = static_cast< duration_float_t >( nano );
+					return nanoseconds / 1000UL;
+				}
+
+				/**
+				 * Milliseconds corresponding to \p nano nanoseconds, as a duration_float_t.
+				 */
+				static inline duration_float_t nano2Milli( duration_nano_t nano ) {
+					const duration_float_t nanoseconds = static_cast< duration_float_t >( nano );
+					return nanoseconds / 1000000UL;
+				}
+
+				/**
+				 * Seconds corresponding to \p nano nanoseconds, as a duration_float_t.
+				 */
+				static inline duration_float_t nano2Sec( duration_nano_t nano ) {
+					const duration_float_t nanoseconds = static_cast< duration_float_t >( nano );
+					return nanoseconds / 1000000000UL;
+				}
+			};
+
+			/**
+			 * Class with functionalities to measure elapsed time for telemetry purposes: start, stop, reset.
+			 *
+			 * The time granularity is nanoseconds. Time is measured via \a std::chrono::steady_clock,
+			 * which is monotonic: measured intervals are thus never negative, even if the
+			 * system time is adjusted during the measurement.
+			 *
+			 * Copy semantics is not available.
+			 *
+			 * This implementation assumes telemetry is enabled and the active state is controlled via
+			 * a telemetry controller of type \p TelControllerType.
+			 *
+			 * @tparam TelControllerType underlying telemetry controller type
+			 * @tparam enabled whether it is compile-time enabled
+			 */
+			template<
+				typename TelControllerType,
+				bool enabled = TelControllerType::enabled
+			> class Stopwatch :
+				public StopwatchBase, public TelemetryBase< TelControllerType, enabled > {
+
+				// monotonic clock: std::chrono::high_resolution_clock may be an alias of the
+				// system clock, which can be adjusted while measuring
+				using clock_t = std::chrono::steady_clock;
+
+				using duration_t = std::chrono::nanoseconds;
+
+				using time_point_t = clock_t::time_point;
+
+				duration_t elapsedTime; ///< measured elapsed time so far, i.e.,
+				///< accumulated time periods between successive calls to #start() and #stop()
+
+				time_point_t beginning; ///< time instant of last call to #start()
+
+			public:
+				/**
+				 * Construct a new Stopwatch object from a telemetry controller.
+				 *
+				 * @param tt underlying telemetry controller, to be (de)activated at runtime
+				 */
+				Stopwatch( const TelControllerType & tt ) :
+					StopwatchBase(),
+					TelemetryBase< TelControllerType, enabled >( tt ),
+					elapsedTime( duration_t::zero() ) {}
+
+				Stopwatch( const Stopwatch< TelControllerType, enabled > &  ) = delete;
+
+				/**
+				 * Start measuring time.
+				 *
+				 * Subsequent calls to this method "reset" the measure of elapsed time: if the user calls #start()
+				 * twice and then #stop(), the elapsed time accumulated internally after the call to #stop() is
+				 * the time elapsed from the \b second call of #start() to the call to #stop().
+				 *
+				 * If the stopwatch is not active, the call has no effect.
+				 */
+				inline void start() {
+					if( this->is_active() ) {
+						beginning = clock_t::now();
+					}
+				}
+
+				/**
+				 * Stops time measurement, returning the elapsed time since the last #start() invocation.
+				 * Elapsed time is internally accounted only if this method is invoked.
+				 *
+				 * @return the nanoseconds elapsed since the last #start(), or 0 if the
+				 * 	stopwatch is not active
+				 */
+				inline duration_nano_t stop() {
+					duration_nano_t count = 0;
+					if( this->is_active() ) {
+						const time_point_t end = clock_t::now();
+						// explicit cast: the duration type of the clock is implementation-defined
+						const duration_t d = std::chrono::duration_cast< duration_t >( end - beginning );
+						count = static_cast< duration_nano_t >( d.count() );
+						elapsedTime += d;
+					}
+					return count;
+				}
+
+				/**
+				 * Returns the elapsed time, which is accounted \b only if #stop() is called.
+				 *
+				 * The value of the elapsed time is not erased, so that successive calls return
+				 * the same value.
+				 */
+				inline duration_nano_t getElapsedNano() const {
+					return static_cast< duration_nano_t >( elapsedTime.count() );
+				}
+
+				/**
+				 * To be called on a stopped watch, it returns the elapsed time accumulated
+				 * so far and, if the stopwatch is active, sets it to 0.
+				 */
+				inline duration_nano_t reset() {
+					const duration_nano_t r = getElapsedNano();
+					if( this->is_active() ) {
+						elapsedTime = duration_t::zero();
+					}
+					return r;
+				}
+
+				/**
+				 * Stops the watch, sets the elapsed time to 0 and starts it again.
+				 *
+				 * @return the elapsed time accumulated before the reset, which includes the
+				 * 	period between the previous #start() and the #stop() internally called
+				*/
+				inline duration_nano_t restart() {
+					stop();
+					const duration_nano_t r = reset();
+					start();
+					return r;
+				}
+			};
+
+			/**
+			 * Specialization of Stopwatch<TelControllerType, enabled> for telemetry disabled at
+			 * compile time: it keeps no state and none of its methods measures anything,
+			 * all durations being reported as 0.
+			 */
+			template<
+				typename TelControllerType
+			> class Stopwatch< TelControllerType, false > :
+				public StopwatchBase, public TelemetryBase< TelControllerType, false > {
+			public:
+				Stopwatch( const TelControllerType & tt ) :
+					StopwatchBase(),
+					TelemetryBase< TelControllerType, false >( tt )
+				{}
+
+				Stopwatch( const Stopwatch< TelControllerType, false > & ) = delete;
+
+				/// Does nothing.
+				constexpr inline void start() {}
+
+				/// Measures nothing: always returns 0.
+				constexpr inline duration_nano_t stop() {
+					return duration_nano_t( 0 );
+				}
+
+				/// No time is ever accounted: always returns 0.
+				constexpr inline duration_nano_t getElapsedNano() const {
+					return duration_nano_t( 0 );
+				}
+
+				/// Nothing to reset: always returns 0.
+				constexpr inline duration_nano_t reset() {
+					return duration_nano_t( 0 );
+				}
+
+				/// Nothing to restart: always returns 0.
+				constexpr inline duration_nano_t restart() {
+					return duration_nano_t( 0 );
+				}
+
+			};
+
+			/**
+			 * Always active stopwatch, requiring no telemetry controller for construction.
+			 * Mainly for debugging purposes.
+			 *
+			 * The telemetry controller is a member of the object itself and is handed to the
+			 * base Stopwatch at construction.
+			 */
+			class ActiveStopwatch : public Stopwatch< TelemetryControllerAlwaysOn, true > {
+			public:
+
+				using base_t = Stopwatch< TelemetryControllerAlwaysOn, true >;
+
+				// NOTE(review): base classes are initialized before members, hence tt is passed
+				// to base_t before its own constructor has run; this is fine only as long as the
+				// base does not read the controller during construction -- TODO confirm
+				// against TelemetryBase
+				ActiveStopwatch():
+					base_t( tt ),
+					tt( true ) {}
+
+				ActiveStopwatch( const ActiveStopwatch & ) = delete;
+
+			private:
+				// controller handed to the base class at construction
+				TelemetryControllerAlwaysOn tt;
+			};
+
+		} // namespace telemetry
+	}     // namespace utils
+} // namespace grb
+
+#endif // _H_GRB_UTILS_TELEMETRY_STOPWATCH
diff --git a/include/graphblas/utils/telemetry/Telemetry.hpp b/include/graphblas/utils/telemetry/Telemetry.hpp
new file mode 100644
index 000000000..3da512b82
--- /dev/null
+++ b/include/graphblas/utils/telemetry/Telemetry.hpp
@@ -0,0 +1,51 @@
+
+/*
+ *   Copyright 2023 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @dir include/graphblas/utils/telemetry
+ * This folder contains all telemetry functionalities, i.e., those meant to measure
+ * and report code execution in detail. They are designed with three goals in mind:
+ *   -# <b>compile-time control</b>: all functionalities can be activated or deactivated
+ * 		at compile-time; if deactivated, they incur no runtime and memory cost
+ *   -# <b>fine granularity</b>: since telemetry is complex and very application-specific,
+ * 		they allow fine-grained measurement and reporting; hence, they are also meant
+ * 		to be conveniently integrated into an existing application at fine granularity
+ *   -# <b>no pre-processor cluttering</b>: multiple specializations may exist for the same functionality,
+ * 		for example to avoid memory or runtime costs if telemetry is deactivated; all
+ * 		implementations \b must compile against the same code paths, to avoid verbose
+ * 		insertion of #ifdef or similar directives on user's behalf.
+ *
+ * See the documentation of TelemetryController.hpp for some basic examples.
+ */
+
+/**
+ * @file Telemetry.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ *
+ * Convenience all-include header for all telemetry-related functionalities.
+ */
+
+#ifndef _H_GRB_UTILS_TELEMETRY_TELEMETRY
+#define _H_GRB_UTILS_TELEMETRY_TELEMETRY
+
+#include "TelemetryController.hpp"
+#include "Stopwatch.hpp"
+#include "Timeable.hpp"
+#include "CSVWriter.hpp"
+#include "OutputStream.hpp"
+
+#endif // _H_GRB_UTILS_TELEMETRY_TELEMETRY
diff --git a/include/graphblas/utils/telemetry/TelemetryBase.hpp b/include/graphblas/utils/telemetry/TelemetryBase.hpp
new file mode 100644
index 000000000..04773591a
--- /dev/null
+++ b/include/graphblas/utils/telemetry/TelemetryBase.hpp
@@ -0,0 +1,123 @@
+
+/*
+ *   Copyright 2023 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file TelemetryBase.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ *
+ * Definition for the TelemetryBase class.
+ */
+
+#ifndef _H_GRB_UTILS_TELEMETRY_TELEMETRY_BASE
+#define _H_GRB_UTILS_TELEMETRY_TELEMETRY_BASE
+
+#include "TelemetryController.hpp"
+
+namespace grb {
+	namespace utils {
+		namespace telemetry {
+
+			/**
+			 * Base class provided as a convenience, exposing whether the telemetry is active.
+			 *
+			 * Default construction is unavailable, because telemetry functionalities need an
+			 * underlying telemetry controller to know whether they are enabled and active.
+			 *
+			 * Instead, copy construction is available for inheriting classes to easily implement copy semantics
+			 * if needed; the copy shares the same telemetry controller of the original object via a reference.
+			 *
+			 * This implementation corresponds to enabled telemetry and stores a reference to the
+			 * telemetry controller, which is queried at every #is_active() call and must outlive this object.
+			 *
+			 * @tparam TelControllerType type of the underlying telemetry controller,
+			 * 	usually derived from TelemetryControllerBase
+			 * @tparam enabled whether the current type is enabled (usually equals to TelControllerType::enabled)
+			 */
+			template<
+				typename TelControllerType,
+				bool enabled = TelControllerType::enabled
+			> class TelemetryBase {
+				// the controller is only referenced, never copied
+				const TelControllerType & telemetry_Controller;
+
+			public:
+				static_assert( is_telemetry_controller< TelControllerType >::value,
+					"type TelControllerType does not implement Telemetry Controller interface" );
+
+				using self_t = TelemetryBase< TelControllerType, enabled >;
+				// binds this object to the given controller
+				TelemetryBase( const TelControllerType & tt ): telemetry_Controller( tt ) {}
+				// the copy refers to the same controller as the original
+				TelemetryBase( const self_t & tb ) : telemetry_Controller( tb.telemetry_Controller ) {}
+				// copy assignment is unavailable
+				self_t & operator=( const self_t & ) = delete;
+				// forwards to the controller, hence it reflects its current runtime state
+				bool is_active() const { return telemetry_Controller.is_active(); }
+			};
+
+			/**
+			 * Template specialization for disabled telemetry: no controller is stored and #is_active() is always \a false.
+			 *
+			 * @tparam TelControllerType type of the telemetry controller (checked via is_telemetry_controller, otherwise unused)
+			 */
+			template <
+				typename TelControllerType
+			> class TelemetryBase< TelControllerType, false > {
+			public:
+				static_assert( is_telemetry_controller< TelControllerType >::value,
+					"type TelControllerType does not implement Telemetry Controller interface" );
+
+				using self_t = TelemetryBase< TelControllerType, false >;
+				// in this specialization default construction is available, as there is no controller to bind
+				TelemetryBase() = default;
+				// the controller is accepted for API compliance and discarded
+				TelemetryBase( const TelControllerType & ) {}
+				// copying is trivial, since there is no state
+				TelemetryBase( const self_t & ) = default;
+				// copy assignment is unavailable
+				self_t & operator=( const self_t & ) = delete;
+				// never active, whatever the controller says
+				constexpr bool is_active() const { return false; }
+			};
+
+			/**
+			 * Specialization of TelemetryBase for the always-on telemetry controller,
+			 * mainly for debugging purposes: #is_active() always returns \a true.
+			 *
+			 * For API compliance, it accepts an always-on telemetry controller, but does not store it.
+			 */
+			template<> class TelemetryBase< TelemetryControllerAlwaysOn, true > {
+			public:
+				static_assert( is_telemetry_controller< TelemetryControllerAlwaysOn >::value,
+					"type TelControllerType does not implement Telemetry Controller interface" );
+
+				using self_t = TelemetryBase< TelemetryControllerAlwaysOn, true >;
+				// the controller is discarded without being read
+				TelemetryBase( const TelemetryControllerAlwaysOn & tt ) { (void) tt; }
+				// copying is trivial, since there is no state
+				TelemetryBase( const self_t & tb ) = default;
+				// copy assignment is unavailable
+				self_t & operator=( const self_t & ) = delete;
+				// always active, no controller is consulted
+				constexpr bool is_active() const { return true; }
+			};
+
+		}
+	}
+}
+
+#endif // _H_GRB_UTILS_TELEMETRY_TELEMETRY_BASE
diff --git a/include/graphblas/utils/telemetry/TelemetryController.hpp b/include/graphblas/utils/telemetry/TelemetryController.hpp
new file mode 100644
index 000000000..f32c9ca21
--- /dev/null
+++ b/include/graphblas/utils/telemetry/TelemetryController.hpp
@@ -0,0 +1,328 @@
+
+/*
+ *   Copyright 2023 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file TelemetryController.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ *
+ * This file defines the basic functionalities for <b>Telemetry Controllers</b>, i.e.,
+ * objects that enable/disable telemetry at compile-time and runtime.
+ *
+ * A telemetry controller can be \b enabled (at compile-time) to produce the code for telemetry and must be
+ * \b activated at runtime to emit actual telemetry information. Activation depends on runtime information
+ * (e.g., user's input) and may change dynamically \a after the controller is instantiated.
+ * If a controller is \b disabled, no telemetry code is generated at compile-time in any compliant telemetry functionality;
+ * hence, any (de)activation of a disabled telemetry controller is simply ignored and produces no result.
+ * In any case, the code must compile under all conditions, in order to avoid verbose
+ * pre-processing \a #if conditions.
+ *
+ * A typical instantiation of a telemetry controller in a user's application looks as follows:
+ *
+ * \code{.cpp}
+ * ENABLE_TELEMETRY_CONTROLLER( my_controller_t )
+ * DEFINE_TELEMETRY_CONTROLLER( my_controller_t )
+ *
+ * int main() {
+ * 		my_controller_t my_controller( true );
+ * 		if( my_controller.is_active() ) {
+ * 			std::cout << "my_controller is active";
+ * 		} else {
+ * 			std::cout << "my_controller is NOT active";
+ * 			if( !my_controller_t::enabled ) {
+ * 				std::cout << ", because it was deactivated at compile-time";
+ * 			}
+ * 		}
+ * 		std::cout << std::endl;
+ * 		return 0;
+ * }
+ * \endcode
+ *
+ * where the enabling directive \a ENABLE_TELEMETRY_CONTROLLER is present only if the controller
+ * is to be enabled. Users should indeed comment/uncomment this directive to disable/enable telemetry
+ * while debugging, or may add extra pre-processing logic to control it during compilation, like
+ *
+ * \code{.cpp}
+ * #ifdef __I_WANT_my_controller_t_ENABLED__
+ * 		ENABLE_TELEMETRY_CONTROLLER( my_controller_t )
+ * #endif
+ * DEFINE_TELEMETRY_CONTROLLER( my_controller_t )
+ * \endcode
+ *
+ * Note that the \a ENABLE_TELEMETRY_CONTROLLER directive (if present) must come \b before the
+ * \a DEFINE_TELEMETRY_CONTROLLER directive, otherwise compilation errors occur.
+ */
+
+#ifndef _H_GRB_UTILS_TELEMETRY_TELEMETRY_CONTROLLER
+#define _H_GRB_UTILS_TELEMETRY_TELEMETRY_CONTROLLER
+
+#include <type_traits>
+#include <utility> // std::declval< T >()
+
+namespace grb {
+	namespace utils {
+		namespace telemetry {
+
+			/**
+			 * Returns whether a telemetry controller is enabled <b>at compile-time</b>. By default
+			 * it is \b not; the ENABLE_TELEMETRY_CONTROLLER macro specializes this function to return \a true.
+			 *
+			 * @tparam T enabler type associated to the telemetry controller
+			 * @return true never in this generic version
+			 * @return false always in this generic version
+			 */
+			template< typename T > constexpr bool is_controller_enabled() { return false; }
+
+			/**
+			 * Class that encapsulates the logic to enable/disable telemetry at compile-time
+			 * or at runtime.
+			 *
+			 * Telemetry can be completely disabled at compile-time (e.g., to avoid any code generation
+			 * and overhead) or can be controlled at runtime, based on external conditions (e.g.,
+			 * user's input, cluster node number, ...).
+			 *
+			 * In the following, the field #enabled encodes the compile-time information, while
+			 * the field \a active (if present) and the corresponding getter #is_active() tell
+			 * whether the controller is \a active at runtime. Hence, users of telemetry controllers should always
+			 * use the #is_active() method to check whether telemetry is active, while implementations
+			 * of telemetry controllers should implement this method also based on the value of the #enabled
+			 * field, possibly "short-circuiting" when #enabled is \a false. This implementation is the
+			 * enabled one: #is_active() returns the runtime state set at construction or via #set_active().
+			 *
+			 * Copy assignment is not available, because a controller stores just one piece of information
+			 * (whether it is active) and a copy would essentially behave as a new object.
+			 * Therefore, users should rather create new controllers themselves or pass around references
+			 * to the same controller, in order to centralize control via a single controller object.
+			 *
+			 * Also move semantics is not available, since an "empty" controller makes no sense.
+			 *
+			 * This implementation assumes \p en = \a true, because a specialization for
+			 * \p en = \a false exists (hence #enabled is set as \a true at compile-time).
+			 *
+			 * @tparam en whether telemetry is enabled (\p en = \a false has a
+			 * dedicated template specialization)
+			 */
+			template< bool en > class TelemetryControllerBase {
+			public:
+				using self_t = TelemetryControllerBase< en >;
+
+				/**
+				 * Construct a new Telemetry Controller Base object, specifying the \a active state.
+				 *
+				 * @param _active whether the controller is \a active or not
+				 */
+				TelemetryControllerBase( bool _active ) : active( _active ) {}
+
+				TelemetryControllerBase() = delete;
+				// NOTE(review): copy construction is defaulted here, while the specialization for en = false deletes it
+				TelemetryControllerBase( const self_t & ) = default;
+
+				TelemetryControllerBase& operator=( const self_t & ) = delete;
+
+				/**
+				 * Tells whether the controller is \a active at runtime.
+				*/
+				bool is_active() const { return this->active; }
+
+				/**
+				 * Set the \a active status of the controller at runtime.
+				 *
+				 * @param _active whether to activate the controller
+				 */
+				void inline set_active( bool _active ) {
+					this->active = _active;
+				}
+
+				/**
+				 * Whether telemetry is compile-time enabled (here always).
+				*/
+				static constexpr bool enabled = true;
+
+			protected:
+				bool active;
+			};
+
+			/**
+			 * Template specialization for compile-time disabled telemetry,
+			 * whose functionalities are all disabled.
+			 *
+			 * The controller stores no state: it is never \a active, and modifications to
+			 * its \a active status are ignored.
+			 */
+			template< > class TelemetryControllerBase< false > {
+			public:
+				using self_t = TelemetryControllerBase< false >;
+
+				/**
+				 * Construct a new Telemetry Controller Base object with runtime information.
+				 *
+				 * Here, runtime information is ignored, as this implementation disables any telemetry.
+				 *
+				 * @param _enabled whether telemetry should be active at runtime (ignored here)
+				 */
+				TelemetryControllerBase( bool _enabled ) {
+					(void) _enabled;
+				}
+
+				TelemetryControllerBase() = delete;
+				// neither copy construction nor copy assignment is available
+				TelemetryControllerBase( const self_t & ) = delete;
+
+				TelemetryControllerBase& operator=( const self_t & ) = delete;
+
+				/**
+				 * Whether telemetry is runtime-active.
+				 *
+				 * @return true never here
+				 * @return false always
+				 */
+				constexpr bool inline is_active() const { return false; }
+
+				/**
+				 * Set the active status of the telemetry controller.
+				 *
+				 * This \a disabled implementation ignores the input \p _active.
+				 */
+				void inline set_active( bool ) {}
+
+				/**
+				 * Whether telemetry is compile-time enabled (never here).
+				 */
+				static constexpr bool enabled = false;
+			};
+
+			/**
+			 * Convenience definition for an always-off telemetry controller.
+			 */
+			using TelemetryControllerAlwaysOff = TelemetryControllerBase< false >;
+
+			/**
+			 * Always active controller, useful especially for prototyping scenarios: it stores no state.
+			 */
+			class TelemetryControllerAlwaysOn {
+			public:
+				TelemetryControllerAlwaysOn( bool _enabled ) {
+					(void) _enabled;
+				}
+				// unlike TelemetryControllerBase, default construction is available
+				TelemetryControllerAlwaysOn() = default;
+
+				TelemetryControllerAlwaysOn( const TelemetryControllerAlwaysOn & ) = default;
+
+				TelemetryControllerAlwaysOn& operator=( const TelemetryControllerAlwaysOn & ) = delete;
+
+				/**
+				 * Tells whether the controller is \a active, which is in this case always true.
+				*/
+				constexpr bool is_active() const { return true; }
+
+				/**
+				 * Set the active status of the telemetry controller.
+				 *
+				 * This \a always-on implementation ignores the input: the controller stays active.
+				 */
+				void inline set_active( bool ) {}
+
+				/**
+				 * Whether telemetry is compile-time enabled (here always).
+				 */
+				static constexpr bool enabled = true;
+			};
+
+			/**
+			 * SFINAE-based structure to check whether \p T is a telemetry controller, i.e.
+			 *   - it has a static field named \a enabled of (decayed) type \a bool; constexpr-ness is not checked
+			 *   - it has an \a is_active() method returning (decayed) \a bool
+			 *   - it has a \a set_active(bool) method returning \a void
+			 */
+			template< typename T > struct is_telemetry_controller {
+			private:
+				template< typename U > static constexpr bool has_enabled_field(
+					typename std::enable_if<
+						std::is_same< typename std::decay< decltype( U::enabled ) >::type, bool >::value,
+							bool * >::type
+					) {
+						return true;
+					}
+				// fallback, selected when the overload above is removed by SFINAE
+				template< typename U > static constexpr bool has_enabled_field( ... ) { return false; }
+				// viable only if is_active() can be called on a U and its decayed return type is bool
+				template< typename U > static constexpr bool has_is_active_method(
+					typename std::enable_if<
+						std::is_same< typename std::decay<decltype( std::declval< U >().is_active() )
+							>::type, bool >::value, bool * >::type
+				) {
+					return true;
+				}
+				// fallback, selected when the overload above is removed by SFINAE
+				template< typename U > static constexpr bool has_is_active_method( ... ) { return false; }
+				// viable only if set_active( true ) can be called on a U and returns exactly void
+				template< typename U > static constexpr bool has_set_active_method(
+					typename std::enable_if<
+						std::is_same< decltype( std::declval< U >().set_active( true ) ), void >::value,
+						bool * >::type
+				) {
+					return true;
+				}
+				// fallback, selected when the overload above is removed by SFINAE
+				template< typename U > static constexpr bool has_set_active_method( ... ) { return false; }
+
+			public:
+				static constexpr bool value = has_enabled_field< T >( nullptr )
+					&& has_is_active_method< T >( nullptr ) && has_set_active_method< T >( nullptr ) ;
+			};
+		}
+
+	}
+}
+
+// Name of the Controller Enabler for 'name' (__<name>_Enabler), i.e., a type that controls whether a telemetry controller is enabled
+#define __TELEMETRY_CONTROLLER_ENABLER_NAME( name ) __ ## name ## _Enabler
+// NOTE(review): identifiers containing a double underscore are reserved to the C++ implementation
+// Name of the Telemetry Controller class for 'name' (<name>_cls)
+#define __TELEMETRY_CONTROLLER_NAME( name ) name ## _cls
+
+/**
+ * Defines a telemetry controller type \p name (an alias of the class <name>_cls), derived from
+ * TelemetryControllerBase; its only constructor takes the runtime \a active state as a \a bool.
+ * This macro also defines the associated controller enabler type, which controls
+ * whether the controller is enabled at compile-time; the controller is by default \b disabled.
+ */
+#define DEFINE_TELEMETRY_CONTROLLER( name ) 											\
+	class __TELEMETRY_CONTROLLER_ENABLER_NAME( name ) {};								\
+	using name = class __TELEMETRY_CONTROLLER_NAME( name ) :							\
+		public grb::utils::telemetry::TelemetryControllerBase<							\
+			grb::utils::telemetry::is_controller_enabled<								\
+				__TELEMETRY_CONTROLLER_ENABLER_NAME( name ) >() > {						\
+	public:																				\
+		using base_t = grb::utils::telemetry::TelemetryControllerBase<					\
+			grb::utils::telemetry::is_controller_enabled<								\
+				__TELEMETRY_CONTROLLER_ENABLER_NAME( name ) >() >;						\
+		__TELEMETRY_CONTROLLER_NAME( name )( bool _enabled ) : base_t( _enabled ) {}	\
+	};
+
+/**
+ * Enables a telemetry controller at compile-time: it forward-declares the associated enabler type
+ * and specializes grb::utils::telemetry::is_controller_enabled for it to return \a true.
+ * Once enabled, the controller can be activated at runtime.
+ */
+#define ENABLE_TELEMETRY_CONTROLLER( name ) class __TELEMETRY_CONTROLLER_ENABLER_NAME( name );	\
+	namespace grb { namespace utils { namespace telemetry {										\
+		template<> constexpr bool is_controller_enabled<										\
+			__TELEMETRY_CONTROLLER_ENABLER_NAME( name ) >() { return true; } 					\
+	} } }
+
+#endif // _H_GRB_UTILS_TELEMETRY_TELEMETRY_CONTROLLER
diff --git a/include/graphblas/utils/telemetry/Timeable.hpp b/include/graphblas/utils/telemetry/Timeable.hpp
new file mode 100644
index 000000000..2ffb97723
--- /dev/null
+++ b/include/graphblas/utils/telemetry/Timeable.hpp
@@ -0,0 +1,129 @@
+
+/*
+ *   Copyright 2023 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file Timeable.hpp
+ * @author Alberto Scolari (alberto.scolari@huawei.com)
+ *
+ * Definition for the Timeable class.
+ */
+
+#ifndef _H_GRB_UTILS_TELEMETRY_TIMEABLE
+#define _H_GRB_UTILS_TELEMETRY_TIMEABLE
+
+#include "Stopwatch.hpp"
+
+namespace grb {
+	namespace utils {
+		namespace telemetry {
+
+			/**
+			 * Facility for inheriting classes that want to time internal operations:
+			 * this class provides protected methods to measure elapsed time and public methods to expose
+			 * elapsed time and allow resetting the internal elapsed time.
+			 * This generic version is a no-op: nothing is measured and all durations are zero.
+			 * @tparam TelControllerType type of telemetry controller
+			 * @tparam enabled whether telemetry is enabled (\a true has a dedicated specialization)
+			 */
+			template<
+				typename TelControllerType,
+				bool enabled = TelControllerType::enabled
+			> class Timeable {
+			public:
+				using self_t = Timeable< TelControllerType, enabled >;
+				// the controller is accepted for API compliance and discarded
+				Timeable( const TelControllerType & tt ) {
+					(void) tt;
+				}
+
+				Timeable( const self_t & ) = default;
+
+				Timeable& operator=( const self_t & ) = delete;
+
+				/**
+				 * Get the elapsed time, in nanoseconds: here always zero.
+				 */
+				constexpr inline duration_nano_t getElapsedNano() const {
+					return static_cast< duration_nano_t >( 0 );
+				}
+
+				/**
+				 * Reset the internal value of elapsed time: here a no-op returning zero.
+				 */
+				constexpr inline duration_nano_t reset() {
+					return static_cast< duration_nano_t >( 0 );
+				}
+
+			protected:
+
+				/**
+				 * Starts measuring the elapsed time: here a no-op.
+				 */
+				inline void start() {}
+
+				/**
+				 * Stops measuring elapsed time: here a no-op returning zero.
+				 */
+				constexpr inline duration_nano_t stop() {
+					return static_cast< duration_nano_t >( 0 );
+				}
+
+			};
+
+			/**
+			 * Implementation of Timeable for enabled telemetry: all methods forward to an internal Stopwatch.
+			 *
+			 * @tparam TelControllerType type of telemetry controller.
+			 */
+			template< typename TelControllerType > class Timeable< TelControllerType, true > {
+			public:
+				using self_t = Timeable< TelControllerType, true >;
+				// the controller is handed over to the internal stopwatch
+				Timeable( const TelControllerType & tt ) : swatch( tt ) {}
+
+				Timeable( const self_t & ) = default;
+
+				Timeable& operator=( const self_t & ) = delete;
+				// elapsed time as reported by the internal stopwatch
+				inline duration_nano_t getElapsedNano() const {
+					return swatch.getElapsedNano();
+				}
+				// resets the internal stopwatch and returns what its reset() returns
+				inline duration_nano_t reset() {
+					return swatch.reset();
+				}
+
+			protected:
+				inline void start() {
+					swatch.start();
+				}
+				// stops the internal stopwatch and returns what its stop() returns
+				inline duration_nano_t stop() {
+					return swatch.stop();
+				}
+
+			private:
+				Stopwatch< TelControllerType > swatch; // NOTE(review): relies on the default of Stopwatch's second template argument; presumably TelControllerType::enabled, confirm
+			};
+			// Timeable bound to the always-on telemetry controller, selecting the enabled specialization
+			using StaticTimeable = Timeable< TelemetryControllerAlwaysOn, true >;
+
+		}
+	}
+}
+
+#endif // _H_GRB_UTILS_TELEMETRY_TIMEABLE
diff --git a/tests/smoke/hpcg.cpp b/tests/smoke/hpcg.cpp
index d84c157e0..e2f5644c2 100644
--- a/tests/smoke/hpcg.cpp
+++ b/tests/smoke/hpcg.cpp
@@ -1,6 +1,6 @@
 
 /*
- *   Copyright 2021 Huawei Technologies Co., Ltd.
+ *   Copyright 2022 Huawei Technologies Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,404 +18,650 @@
 /**
  * @file hpcg_test.cpp
  * @author Alberto Scolari (alberto.scolari@huawei.com)
- * @brief Test for HPCG simulations on N-dimensional physical problems.
+ * Test for HPCG simulations on N-dimensional physical problems.
  *
  * This test strictly follows the parameter and the formulation of the reference HPCG
  * benchmark impementation in https://github.com/hpcg-benchmark/hpcg.
- *
- * @date 2021-04-30
  */
 
+#include <algorithm>
 #include <array>
 #include <cassert>
 #include <cmath>
 #include <cstdlib>
+#include <cstring>
+#include <iomanip>
 #include <iostream>
+#include <locale>
 #include <memory>
 #include <type_traits>
 
 #include <graphblas.hpp>
-#include <graphblas/algorithms/hpcg/hpcg.hpp>
 #include <graphblas/algorithms/hpcg/system_building_utils.hpp>
-
-// here we define a custom macro and do not use NDEBUG since the latter is not defined for smoke tests
-#ifdef HPCG_PRINT_STEPS
-
-// HPCG_PRINT_STEPS requires defining the following symbols
-
-/**
- * @brief simply prints \p args on a dedicated line.
- */
-#define DBG_println( args ) std::cout << args << std::endl;
-
-// forward declaration for the tracing facility
-template< typename T,
-	class Ring = grb::Semiring< grb::operators::add< T >, grb::operators::mul< T >, grb::identities::zero, grb::identities::one >
->
-void print_norm( const grb::Vector< T > &r, const char * head, const Ring &ring = Ring() );
-
-/**
- * @brief prints \p head and the norm of \p r.
- */
-#define DBG_print_norm( vec, head ) print_norm( vec, head )
-#endif
-
-#include <graphblas/utils/Timer.hpp>
+#include <graphblas/algorithms/multigrid/multigrid_building_utils.hpp>
+#include <graphblas/algorithms/multigrid/multigrid_cg.hpp>
+#include <graphblas/algorithms/multigrid/multigrid_v_cycle.hpp>
+#include <graphblas/algorithms/multigrid/red_black_gauss_seidel.hpp>
+#include <graphblas/algorithms/multigrid/single_matrix_coarsener.hpp>
+#include <graphblas/utils/telemetry/Telemetry.hpp>
 
 #include <utils/argument_parser.hpp>
 #include <utils/assertions.hpp>
 #include <utils/print_vec_mat.hpp>
 
 //========== MAIN PROBLEM PARAMETERS =========
-// values modifiable via cmd line args: default set as in reference HPCG
-constexpr size_t PHYS_SYSTEM_SIZE_DEF{ 16UL };
-constexpr size_t PHYS_SYSTEM_SIZE_MIN{ 4UL };
-constexpr size_t DEF_COARSENING_LEVELS{ 1U };
-constexpr size_t MAX_COARSENING_LEVELS{ 4U };
-constexpr size_t MAX_ITERATIONS_DEF{ 56UL };
-constexpr size_t SMOOTHER_STEPS_DEF{ 1 };
-
-// internal values
-constexpr double SYSTEM_DIAG_VALUE { 26.0 };
-constexpr double SYSTEM_NON_DIAG_VALUE { -1.0 };
-constexpr size_t BAND_WIDTH_3D { 13UL };
-constexpr size_t HALO_RADIUS { 1U };
+// default simulation parameters, set as in reference HPCG
+// users can input different ones via the cmd line
+constexpr size_t PHYS_SYSTEM_SIZE_DEF = 16UL; // presumably the default size of each physical dimension; TODO confirm
+constexpr size_t PHYS_SYSTEM_SIZE_MIN = 2UL; // presumably the smallest size allowed per dimension; TODO confirm
+constexpr size_t MAX_COARSENING_LEVELS = 3UL;
+constexpr size_t MAX_ITERATIONS_DEF = 56UL;
+constexpr size_t SMOOTHER_STEPS_DEF = 1;
+
+// internal values defining the simulated physical system
+constexpr double SYSTEM_DIAG_VALUE = 26.0;
+constexpr double SYSTEM_NON_DIAG_VALUE = -1.0;
+constexpr size_t BAND_WIDTH_3D = 13UL;
+constexpr size_t HALO_RADIUS = 1U;
+constexpr double MAX_NORM = 4.0e-14; // NOTE(review): looks like a tolerance on a norm rather than a system parameter; confirm
 //============================================
 
-constexpr double MAX_NORM { 4.0e-14 };
-
 using namespace grb;
 using namespace algorithms;
 
 static const char * const TEXT_HIGHLIGHT = "===> ";
-#define thcout ( std::cout << TEXT_HIGHLIGHT )
-#define thcerr ( std::cerr << TEXT_HIGHLIGHT )
 
+// default scalar type, used for all the value types of the simulation
+using value_t = double;
+// bundle of value types and algebraic structures, passed as a single template argument to the runner types
+struct HPCGTypes {
+	using IOType = value_t;
+	using NonzeroType = value_t;
+	using InputType = value_t;
+	using ResidualType = value_t;
+	using Ring = Semiring< grb::operators::add< NonzeroType >,grb::operators::mul< NonzeroType >,
+		grb::identities::zero, grb::identities::one >; // plus-times semiring over NonzeroType
+	using Minus = operators::subtract< NonzeroType >;
+	using Divide = operators::divide< NonzeroType >;
+};
 
-/**
- * @brief Container for system parameters to create the HPCG problem.
- */
-struct system_input {
-	size_t nx, ny, nz;
-	size_t max_coarsening_levels;
+using IOType = typename HPCGTypes::IOType;
+using NonzeroType = typename HPCGTypes::NonzeroType;
+using InputType = typename HPCGTypes::InputType;
+using ResidualType = typename HPCGTypes::ResidualType;
+using Ring = typename HPCGTypes::Ring;
+
+using coord_t = size_t;
+// descriptor passed as template argument to all the runner types below
+constexpr Descriptor hpcg_desc = descriptors::dense;
+
+// telemetry control: controllers and output stream types for telemetry
+// they can be (de)activated at compile-time by (un)commenting the respective ENABLE_TELEMETRY_CONTROLLER() macro
+ENABLE_TELEMETRY_CONTROLLER( dist_controller_t )
+DEFINE_TELEMETRY_CONTROLLER( dist_controller_t )
+using DistStream = grb::utils::telemetry::OutputStream< dist_controller_t >;
+// controller passed to the CG runner (hpcg_runner_t) and to its CSV writer (hpcg_csv_t)
+ENABLE_TELEMETRY_CONTROLLER( hpcg_controller_t )
+DEFINE_TELEMETRY_CONTROLLER( hpcg_controller_t )
+// controller passed to the multi-grid, smoother and coarsener runners and to mg_csv_t
+ENABLE_TELEMETRY_CONTROLLER( mg_controller_t )
+DEFINE_TELEMETRY_CONTROLLER( mg_controller_t )
+// debug controller: disabled at compile-time, because its enabling directive is commented out
+// ENABLE_TELEMETRY_CONTROLLER( dbg_controller_t )
+DEFINE_TELEMETRY_CONTROLLER( dbg_controller_t )
+using DBGStream = grb::utils::telemetry::OutputStream< dbg_controller_t >;
+// CSV writer types; NOTE(review): the trailing template arguments are presumably the column types, confirm
+using duration_t = utils::telemetry::duration_nano_t;
+using hpcg_csv_t = utils::telemetry::CSVWriter< hpcg_controller_t, hpcg_controller_t::enabled,
+	size_t, duration_t >;
+using mg_csv_t = utils::telemetry::CSVWriter< mg_controller_t, mg_controller_t::enabled,
+	size_t, size_t, duration_t, duration_t >;
+
+// assembled types for simulation runners and input/output structures
+using smoother_runner_t = grb::algorithms::RedBlackGSSmootherRunner< HPCGTypes,
+	mg_controller_t, hpcg_desc >;
+using smoothing_data_t = typename smoother_runner_t::SmootherDataType;
+
+using coarsener_runner_t = grb::algorithms::SingleMatrixCoarsener< HPCGTypes,
+	mg_controller_t, hpcg_desc >;
+using coarsening_data_t = typename coarsener_runner_t::CoarseningDataType;
+// the multi-grid runner is assembled from the smoother and coarsener runners above
+using mg_runner_t = MultiGridRunner< HPCGTypes, smoother_runner_t, coarsener_runner_t,
+	mg_controller_t, hpcg_desc, DBGStream >;
+using mg_data_t = typename mg_runner_t::MultiGridInputType;
+// the CG runner takes the multi-grid runner type as a template argument
+using hpcg_runner_t = MultiGridCGRunner< HPCGTypes, mg_runner_t, hpcg_controller_t,
+	hpcg_desc, DBGStream >;
+using hpcg_data_t = typename hpcg_runner_t::HPCGInputType;
+
+// Stopwatch type, to measure various setup phases
+using Stw = utils::telemetry::ActiveStopwatch;
+
+
+// allow DBGStream to print grb::Vector's in a lazy way (i.e., no code generated if deactivated): the functor computes dot( v, v )
+struct dotter  {
+	const grb::Vector< IOType > & v; // only referenced: the vector must outlive this functor
+	// returns the dot product of v with itself under Ring; NOTE(review): the return code of grb::dot is ignored
+	ResidualType operator()() const {
+		Ring ring;
+		ResidualType r = 0;
+		grb::dot( r, v, v, ring );
+		return r;
+	}
+};
 
-/**
- * @brief Container for the parameters for the HPCG simulation.
- */
-struct simulation_input : public system_input {
-	size_t test_repetitions;
+static inline DBGStream & operator<<( DBGStream & stream, const grb::Vector< IOType > & v ) { // streams v as the value computed by dotter
+	stream << std::setprecision( 7 ); // NOTE: the precision is set on the stream and not restored afterwards
+	return stream << DBGStream::makeLazy( dotter{ v } ); // the functor is wrapped via makeLazy() before being streamed
+}
+
+// zeros of Ring (as returned by Ring::getZero), one for each value type of the simulation
+static const IOType io_zero = Ring().template getZero< IOType >();
+static const NonzeroType nz_zero = Ring().template getZero< NonzeroType >();
+static const InputType input_zero = Ring().template getZero< InputType >();
+static const ResidualType residual_zero = Ring().template getZero< ResidualType >();
+
+// input/output structure (serializable for distributed execution),
+// with the parameters for the HPCG simulation
+static constexpr size_t MAX_CSV_PATH_LENGTH = 255;
+
+struct simulation_input {
+	// physical parameters for the multi-grid
+	size_t nx, ny, nz;
+	size_t max_coarsening_levels;
+	// solver options
+	bool use_average_coarsener;
+	size_t inner_test_repetitions;
 	size_t max_iterations;
 	size_t smoother_steps;
 	bool evaluation_run;
 	bool no_preconditioning;
+	// logging options: fixed-size char buffers (MAX_CSV_PATH_LENGTH characters plus one for '\0'), serializable for launcher invocation
+	std::array< char, MAX_CSV_PATH_LENGTH + 1 > hpcg_csv;
+	std::array< char, MAX_CSV_PATH_LENGTH + 1 > mg_csv;
+	bool hpcg_log;
+	bool mg_log;
+	// sets both CSV paths to the empty string; NOTE(review): all other fields are left uninitialized here
+	simulation_input() {
+		hpcg_csv[ 0 ] = '\0';
+		mg_csv[ 0 ] = '\0';
+	}
+
+	simulation_input( const simulation_input & ) = default;
 };
 
-/**
- * @brief Containers for test outputs.
- */
 struct output {
-	RC error_code;
-	size_t test_repetitions;
-	size_t performed_iterations;
-	double residual;
+	RC error_code = SUCCESS; // SUCCESS unless overwritten
+	size_t inner_test_repetitions = 0;
 	grb::utils::TimerResults times;
-	std::unique_ptr< PinnedVector< double > > pinnedVector;
-	double square_norm_diff;
-
-	output() {
-		error_code = SUCCESS;
-		test_repetitions = 0;
-		performed_iterations = 0;
-		residual = 0.0;
-	}
+	std::unique_ptr< PinnedVector< IOType > > pinnedVector; // empty (null) until assigned
+	NonzeroType square_norm_diff = nz_zero;
+	CGOutInfo< NonzeroType > cg_out = { 0, nz_zero }; // presumably iteration count and residual of the CG solver; TODO confirm against CGOutInfo
 };
 
-/**
- * @brief Returns the closets power of 2 bigger or equal to \p n .
- */
-template< typename T = size_t >
-T static next_pow_2( T n ) {
-	static_assert( std::is_integral< T >::value, "Integral required." );
-	--n;
-	n |= ( n >> 1 );
-	for( unsigned i = 1; i <= sizeof( T ) * 4; i *= 2 ) {
-		const unsigned shift = static_cast< T >( 1U ) << i;
-		n |= ( n >> shift );
+#ifdef HPCG_PRINT_SYSTEM
+// prints the finest system matrix, then each coarsening matrix followed by the next coarser system matrix
+static void print_system( const std::vector< std::unique_ptr< mg_data_t > > & system_levels,
+	const std::vector< std::unique_ptr< coarsening_data_t > > & coarsener_levels ) {
+	assert( spmd<>::nprocs() == 1 ); // distributed printing of the system is not implemented
+	print_matrix( system_levels[ 0 ]->A, 70, "A" );
+	for( size_t i = 0; i < coarsener_levels.size(); i++ ) {
+		print_matrix( coarsener_levels[ i ]->coarsening_matrix, 50, "COARSENING MATRIX" );
+		print_matrix( system_levels[ i + 1 ]->A, 50, "COARSER SYSTEM MATRIX" );
 	}
-	return n + 1;
 }
+#endif
 
 /**
- * @brief Builds and initializes a 3D system for an HPCG simulation according to the given 3D system sizes.
- * @return RC grb::SUCCESS if the system initialization within GraphBLAS succeeded
+ * Allocates the data structure input to the various simulation steps (CG, multi-grid, coarsening, smoothing)
+ * for each level of the multi-grid. The input is the vector of system sizes \p mg_sizes, with sizes in
+ * monotonically \b decreasing order (finest system first).
+ *
+ * This routine is algorithm-agnostic, as long as the constructors of the data types meet the requirements
+ * explained in \ref multigrid_allocate_data().
  */
-static RC build_3d_system( std::unique_ptr< hpcg_data< double, double, double > > & holder, const system_input & in ) {
-	const std::array< size_t, 3 > physical_sys_sizes { in.nx, in.ny, in.nz };
-	struct hpcg_system_params< 3, double > params {
-		physical_sys_sizes, HALO_RADIUS, BAND_WIDTH_3D * 2 + 1, SYSTEM_DIAG_VALUE, SYSTEM_NON_DIAG_VALUE, PHYS_SYSTEM_SIZE_MIN, in.max_coarsening_levels, 2
-	};
-
-	return build_hpcg_system< 3, double >( holder, params );
-}
-
-#ifdef HPCG_PRINT_SYSTEM
-static void print_system( const hpcg_data< double, double, double > & data ) {
-	print_matrix( data.A, 70, "A" );
-	multi_grid_data< double, double > * coarser = data.coarser_level;
-	while( coarser != nullptr ) {
-		print_matrix( coarser->coarsening_matrix, 50, "COARSENING MATRIX" );
-		print_matrix( coarser->A, 50, "COARSER SYSTEM MATRIX" );
-		coarser = coarser->coarser_level;
-	}
+static void allocate_system_structures(
+	std::vector< std::unique_ptr< mg_data_t > > & system_levels,
+	std::vector< std::unique_ptr< coarsening_data_t > > & coarsener_levels,
+	std::vector< std::unique_ptr< smoothing_data_t > > & smoother_levels,
+	std::unique_ptr< hpcg_data_t > & cg_system_data,
+	const std::vector< size_t > & mg_sizes,
+	const mg_controller_t & mg_controller,
+	DistStream & logger
+) {
+	Stw timer;
+
+	hpcg_data_t * data = new hpcg_data_t( mg_sizes[ 0 ] );
+	cg_system_data = std::unique_ptr< hpcg_data_t >( data );
+	logger << "allocating data for the MultiGrid simulation...";
+	timer.start();
+	multigrid_allocate_data( system_levels, coarsener_levels, smoother_levels, mg_sizes, mg_controller );
+	logger << " time (ms) " << Stw::nano2Milli( timer.restart() ) << std::endl;
+
+	// zero all vectors
+	logger << "zeroing all vectors...";
+	grb::RC rc = data->init_vectors( io_zero );
+	ASSERT_RC_SUCCESS( rc );
+	std::for_each( system_levels.begin(), system_levels.end(),
+		[]( std::unique_ptr< mg_data_t > & s ) {
+		ASSERT_RC_SUCCESS( s->init_vectors( io_zero ) );
+	} );
+	std::for_each( coarsener_levels.begin(), coarsener_levels.end(),
+		[]( std::unique_ptr< coarsening_data_t > & s ) {
+		ASSERT_RC_SUCCESS( s->init_vectors( io_zero ) );
+	} );
+	std::for_each( smoother_levels.begin(), smoother_levels.end(),
+		[]( std::unique_ptr< smoothing_data_t > & s ) {
+		ASSERT_RC_SUCCESS( s->init_vectors( io_zero ) );
+	} );
+	logger << " time (ms) " << Stw::nano2Milli( timer.stop() ) << std::endl;
 }
-#endif
 
-#ifdef HPCG_PRINT_STEPS
-template< typename T,
-		class Ring = Semiring< grb::operators::add< T >, grb::operators::mul< T >, grb::identities::zero, grb::identities::one >
-	>
-void print_norm( const grb::Vector< T > & r, const char * head, const Ring & ring ) {
-	T norm;
-	RC ret = grb::dot( norm, r, r, ring ); // residual = r' * r;
-	(void)ret;
-	assert( ret == SUCCESS );
-	std::cout << ">>> ";
-	if( head != nullptr ) {
-		std::cout << head << ": ";
+/**
+ * Builds and initializes a 3D system for an HPCG simulation according to the given 3D system sizes.
+ * It allocates the data structures and populates them according to the algorithms chosen for HPCG.
+ */
+static void build_3d_system(
+	std::vector< std::unique_ptr< mg_data_t > > & system_levels,
+	std::vector< std::unique_ptr< coarsening_data_t > > & coarsener_levels,
+	std::vector< std::unique_ptr< smoothing_data_t > > & smoother_levels,
+	std::unique_ptr< hpcg_data_t > & cg_system_data,
+	const simulation_input & in,
+	const mg_controller_t & tt,
+	DistStream & logger
+) {
+	constexpr size_t DIMS = 3;
+	using builder_t = grb::algorithms::HPCGSystemBuilder< DIMS, coord_t, NonzeroType >;
+	Stw timer;
+
+	HPCGSystemParams< DIMS, NonzeroType > params = { { in.nx, in.ny, in.nz }, HALO_RADIUS,
+		SYSTEM_DIAG_VALUE, SYSTEM_NON_DIAG_VALUE, PHYS_SYSTEM_SIZE_MIN, in.max_coarsening_levels, 2 };
+
+	std::vector< builder_t > mg_generators;
+	logger << "building HPCG generators for " << ( in.max_coarsening_levels + 1 ) << " levels...";
+	timer.start();
+	// construct the builder_t generator for each grid level, which depends on the system physics
+	hpcg_build_multigrid_generators( params, mg_generators );
+	logger << " time (ms) " << Stw::nano2Milli( timer.stop() ) << std::endl;
+	logger << "built HPCG generators for " << mg_generators.size() << " levels" << std::endl;
+
+	// extract the size for each level
+	std::vector< size_t > mg_sizes;
+	std::transform( mg_generators.cbegin(), mg_generators.cend(), std::back_inserter( mg_sizes ),
+		[]( const builder_t & b ) {
+		return b.system_size();
+	} );
+	// given the sizes, allocate the data structures for all the inputs of the algorithms
+	allocate_system_structures( system_levels, coarsener_levels, smoother_levels,
+		cg_system_data, mg_sizes, tt, logger );
+	assert( mg_generators.size() == system_levels.size() );
+	assert( mg_generators.size() == smoother_levels.size() );
+	assert( mg_generators.size() - 1 == coarsener_levels.size() ); // coarsener acts between two levels
+
+	// for each grid level, populate the data structures according to the specific algorithm
+	// and track the time for diagnostics purposes
+	for( size_t i = 0; i < mg_generators.size(); i++ ) {
+		logger << "SYSTEM LEVEL " << i << std::endl;
+		auto & sizes = mg_generators[ i ].get_generator().get_sizes();
+		logger << " sizes: ";
+		for( size_t s = 0; s < DIMS - 1; s++ ) {
+			logger << sizes[ s ] << " x ";
+		}
+		logger << sizes[ DIMS - 1 ] << std::endl;
+		logger << " populating system matrix: ";
+		timer.start();
+		grb::RC rc = hpcg_populate_system_matrix( mg_generators[ i ],
+			system_levels.at( i )->A, logger );
+		ASSERT_RC_SUCCESS( rc );
+		logger << " time (ms) " << Stw::nano2Milli( timer.restart() ) << std::endl;
+
+		logger << " populating smoothing data: ";
+		rc = hpcg_populate_smoothing_data( mg_generators[ i ], *smoother_levels[ i ],
+			logger );
+		logger << " time (ms) " << Stw::nano2Milli( timer.stop() ) << std::endl;
+		ASSERT_RC_SUCCESS( rc );
+
+		if( i > 0 ) {
+			logger << " populating coarsening data: ";
+			timer.start();
+			if( ! in.use_average_coarsener ) {
+				rc = hpcg_populate_coarsener( mg_generators[ i - 1 ], mg_generators[ i ],
+					*coarsener_levels[ i - 1 ] );
+			} else {
+				rc = hpcg_populate_coarsener_avg( mg_generators[ i - 1 ], mg_generators[ i ],
+					*coarsener_levels[ i - 1 ] );
+			}
+			logger << " time (ms) " << Stw::nano2Milli( timer.stop() ) << std::endl;
+			ASSERT_RC_SUCCESS( rc );
+		}
 	}
-	std::cout << norm << std::endl;
 }
-#endif
 
 /**
- * @brief Main test, building an HPCG problem and running the simulation closely following the
+ * Main test, building an HPCG problem and running the simulation closely following the
  * parameters in the reference HPCG test.
  */
 void grbProgram( const simulation_input & in, struct output & out ) {
 	// get user process ID
-	assert( spmd<>::pid() < spmd<>::nprocs() );
-	grb::utils::Timer timer;
-	timer.reset();
+	const size_t pid = spmd<>::pid();
+	Stw timer;
+
+	// standard logger: active only on master node
+	dist_controller_t dist( pid == 0 );
+	// separate thousands when printing integers
+	class IntegerSeparation : public std::numpunct< char > {
+		char do_thousands_sep() const override {
+			return '\'';
+		}
+		std::string do_grouping() const override {
+			return "\03";
+		}
+	};
+	std::locale old_locale = std::cout.imbue( std::locale( std::cout.getloc(), new IntegerSeparation ) );
+	DistStream logger( dist, std::cout );
 
-	// assume successful run
-	out.error_code = SUCCESS;
-	RC rc { SUCCESS };
+	logger << "beginning input generation..." << std::endl;
 
 	// wrap hpcg_data inside a unique_ptr to forget about cleaning chores
-	std::unique_ptr< hpcg_data< double, double, double > > hpcg_state;
-	rc = build_3d_system( hpcg_state, in );
+	std::unique_ptr< hpcg_data_t > hpcg_state;
+
+	// measure HPCG execution time by default on master
+	hpcg_controller_t hpcg_controller( pid == 0 );
+	// measure MG and smoother only if the user requested it
+	mg_controller_t mg_controller( pid == 0 && in.mg_log );
+
+	// trace execution of CG and MG only on master
+	dbg_controller_t dbg_controller( pid == 0 );
+	DBGStream dbg_stream( dbg_controller, std::cout );
+
+	// define the main runners and initialize the options of its components
+	coarsener_runner_t coarsener;
+	smoother_runner_t smoother;
+	smoother.presmoother_steps = smoother.postsmoother_steps = in.smoother_steps;
+	smoother.non_recursive_smooth_steps = 1UL;
+	mg_runner_t mg_runner( smoother, coarsener, dbg_stream );
+	hpcg_runner_t hpcg_runner( hpcg_controller, mg_runner, dbg_stream );
+	hpcg_runner.tolerance = residual_zero;
+	hpcg_runner.with_preconditioning = ! in.no_preconditioning;
+
+	timer.start();
+	// build the entire multi-grid system
+	build_3d_system( mg_runner.system_levels, coarsener.coarsener_levels, smoother.levels,
+		hpcg_state, in, mg_controller, logger );
+	logger << "input generation time (ms): " << Stw::nano2Milli( timer.restart() ) << std::endl;
 
-	if( rc != SUCCESS ) {
-		std::cerr << "Failure to generate the system (" << toString( rc ) << ")." << std::endl;
-		out.error_code = rc;
-		return;
-	}
 #ifdef HPCG_PRINT_SYSTEM
-	if( spmd<>::pid() == 0 ) {
-		print_system( *hpcg_state );
+	if( pid == 0 ) {
+		print_system( mg_runner.system_levels, coarsener.coarsener_levels );
 	}
 #endif
 
-	Matrix< double > & A { hpcg_state->A };
-	Vector< double > & x { hpcg_state->x };
-	Vector< double > & b { hpcg_state->b };
+	Matrix< NonzeroType > & A = mg_runner.system_levels[ 0 ]->A;
+	Vector< IOType > & x = hpcg_state->x;
+	Vector< NonzeroType > & b = hpcg_state->b;
 
 	// set vectors as from standard HPCG benchmark
-	set( x, 1.0 );
-	set( b, 0.0 );
-	rc = grb::mxv( b, A, x, grb::Semiring< grb::operators::add< double >, grb::operators::mul< double >, grb::identities::zero, grb::identities::one >() );
-	set( x, 0.0 );
+	RC rc = set( x, 1.0 );
+	ASSERT_RC_SUCCESS( rc );
+	rc = set( b, nz_zero );
+	ASSERT_RC_SUCCESS( rc );
+	rc = grb::mxv( b, A, x, Ring() );
+	ASSERT_RC_SUCCESS( rc );
+	rc = set( x, io_zero );
+	ASSERT_RC_SUCCESS( rc );
 
 #ifdef HPCG_PRINT_SYSTEM
-	if( spmd<>::pid() == 0 ) {
+	if( pid == 0 ) {
 		print_vector( x, 50, "X" );
 		print_vector( b, 50, "B" );
 	}
 #endif
 
-	out.times.preamble = timer.time();
+	out.times.preamble = Stw::nano2Milli( timer.restart() );
 
-	const bool with_preconditioning = ! in.no_preconditioning;
-	if( in.evaluation_run ) {
-		out.test_repetitions = 0;
-		timer.reset();
-		rc = hpcg( *hpcg_state, with_preconditioning, in.smoother_steps, in.smoother_steps, in.max_iterations, 0.0, out.performed_iterations, out.residual );
-		double single_time = timer.time();
-		if( rc == SUCCESS ) {
-			rc = collectives<>::reduce( single_time, 0, operators::max< double >() );
-		}
-		out.times.useful = single_time;
-		out.test_repetitions = static_cast< size_t >( 1000.0 / single_time ) + 1;
-	} else {
-		// do benchmark
-		timer.reset();
-		for( size_t i = 0; i < in.test_repetitions && rc == SUCCESS; ++i ) {
-			rc = set( x, 0.0 );
-			assert( rc == SUCCESS );
-			rc = hpcg( *hpcg_state, with_preconditioning, in.smoother_steps, in.smoother_steps, in.max_iterations, 0.0, out.performed_iterations, out.residual );
-			out.test_repetitions++;
-			if( rc != SUCCESS ) {
-				break;
-			}
-		}
-		double time_taken { timer.time() };
-		out.times.useful = time_taken / static_cast< double >( out.test_repetitions );
-		// sleep( 1 );
-	}
+	mg_data_t & grid_base = *mg_runner.system_levels[ 0 ];
 
-	if( spmd<>::pid() == 0 ) {
-		if( rc == SUCCESS ) {
-			if( in.evaluation_run ) {
-				std::cout << "Info: cold HPCG completed within " << out.performed_iterations << " iterations. Last computed residual is " << out.residual << ". Time taken was " << out.times.useful
-						  << " ms. Deduced inner repetitions parameter of " << out.test_repetitions << " to take 1 second or more per inner benchmark." << std::endl;
-			} else {
-				std::cout << "Average time taken for each of " << out.test_repetitions << " HPCG calls (hot start): " << out.times.useful << std::endl;
-			}
-		} else {
-			std::cerr << "Failure: call to HPCG did not succeed (" << toString( rc ) << ")." << std::endl;
+	// do a cold run to warm the system up
+	logger << TEXT_HIGHLIGHT << "beginning cold run..." << std::endl;
+	hpcg_runner.max_iterations = 1;
+	rc = hpcg_runner( grid_base, *hpcg_state, out.cg_out );
+	logger << " time (ms): " << Stw::nano2Milli( timer.restart() ) << std::endl;
+	ASSERT_RC_SUCCESS( rc );
+
+	// restore CG options to user-given values
+	hpcg_runner.max_iterations = in.max_iterations;
+	logger << TEXT_HIGHLIGHT << "beginning solver..." << std::endl;
+	out.inner_test_repetitions = 0;
+	out.times.useful = 0.0;
+
+	// initialize CSV writers (if activated)
+	hpcg_csv_t hpcg_csv( hpcg_controller, { "repetition", "time" } );
+	mg_csv_t mg_csv( mg_controller, { "repetition", "level", "mg time", "smoother time" } );
+	timer.reset();
+
+	// do benchmark
+	for( size_t i = 0; i < in.inner_test_repetitions; ++i ) {
+		rc = set( x, io_zero );
+		ASSERT_RC_SUCCESS( rc );
+		logger << TEXT_HIGHLIGHT << "beginning iteration: " << i << std::endl;
+		rc = hpcg_runner( grid_base, *hpcg_state, out.cg_out );
+		ASSERT_RC_SUCCESS( rc );
+		hpcg_csv.add_line( i, hpcg_runner.getElapsedNano() );
+		logger << "repetition,duration (ns): " << hpcg_csv.last_line() << std::endl;
+		for( const auto & mg_level : mg_runner.system_levels ) {
+			mg_csv.add_line( i, mg_level->level, mg_level->mg_stopwatch.getElapsedNano(),
+				mg_level->sm_stopwatch.getElapsedNano() );
+			mg_level->mg_stopwatch.reset();
+			mg_level->sm_stopwatch.reset();
 		}
+		hpcg_runner.reset();
+
+		out.inner_test_repetitions++;
+	}
+	timer.stop();
+	out.times.useful += Stw::nano2Milli( timer.getElapsedNano() );
+	if( in.evaluation_run ) {
+		std::cout.imbue( old_locale ); // restore previous output options: the code below is skipped
+		out.error_code = collectives<>::reduce( out.times.useful, 0, operators::max< double >() );
+		return; // out.times.useful went through a max-reduction among processes; its outcome is in out.error_code
 	}
+	out.times.useful /= static_cast< double >( in.inner_test_repetitions );
+
+	logger << TEXT_HIGHLIGHT << "repetitions,average time (ms): " << out.inner_test_repetitions
+		<< ", " << out.times.useful << std::endl;
+	// restore previous output options
+	std::cout.imbue( old_locale );
 
 	// start postamble
-	timer.reset();
-	// set error code
+	timer.restart();
+	// set error code to caller
 	out.error_code = rc;
 
-	Semiring< grb::operators::add< double >, grb::operators::mul< double >, grb::identities::zero, grb::identities::one > ring;
 	grb::set( b, 1.0 );
-	out.square_norm_diff = 0.0;
-	grb::eWiseMul( b, -1.0, x, ring );
-	grb::dot( out.square_norm_diff, b, b, ring );
+	grb::eWiseMul( b, -1.0, x, Ring() );
+	out.square_norm_diff = nz_zero;
+	grb::dot( out.square_norm_diff, b, b, Ring() );
 
 	// output
-	out.pinnedVector = std::unique_ptr< PinnedVector< double > >( new PinnedVector< double >( x, SEQUENTIAL ) );
+	out.pinnedVector.reset( new PinnedVector< IOType >( x, SEQUENTIAL ) ); // x is a Vector< IOType >
 	// finish timing
-	const double time_taken { timer.time() };
-	out.times.postamble = time_taken;
+
+	// write measurements into CSV files
+	if( in.hpcg_log ) {
+		hpcg_csv.write_to_file( in.hpcg_csv.data() );
+	}
+	if( in.mg_log ) {
+		mg_csv.write_to_file( in.mg_csv.data() );
+	}
+	out.times.postamble = Stw::nano2Milli( timer.stop() );
 }
 
+#define thcout ( std::cout << TEXT_HIGHLIGHT )
+
 /**
- * @brief Parser the command-line arguments to extract the simulation information and checks they are valid.
+ * Parses the command-line arguments to extract the simulation information and checks that they are valid.
  */
 static void parse_arguments( simulation_input &, size_t &, double &, int, char ** );
 
 int main( int argc, char ** argv ) {
 	simulation_input sim_in;
 	size_t test_outer_iterations;
-	double max_residual_norm;
+	double max_diff_norm;
 
-	parse_arguments( sim_in, test_outer_iterations, max_residual_norm, argc, argv );
+	parse_arguments( sim_in, test_outer_iterations, max_diff_norm, argc, argv );
 	thcout << "System size x: " << sim_in.nx << std::endl;
 	thcout << "System size y: " << sim_in.ny << std::endl;
 	thcout << "System size z: " << sim_in.nz << std::endl;
+	thcout << "Coarsener: " << ( sim_in.use_average_coarsener ? "average" :
+		"single point sampler" ) << std::endl;
 	thcout << "System max coarsening levels " << sim_in.max_coarsening_levels << std::endl;
-	thcout << "Test repetitions: " << sim_in.test_repetitions << std::endl;
+	thcout << "Test repetitions: " << sim_in.inner_test_repetitions << std::endl;
 	thcout << "Max iterations: " << sim_in.max_iterations << std::endl;
-	thcout << "Direct launch: " << std::boolalpha << sim_in.evaluation_run << std::noboolalpha << std::endl;
-	thcout << "No conditioning: " << std::boolalpha << sim_in.no_preconditioning << std::noboolalpha << std::endl;
+	thcout << "Is evaluation run: " << std::boolalpha << sim_in.evaluation_run
+		<< std::noboolalpha << std::endl;
+	thcout << "Conditioning: " << std::boolalpha << !sim_in.no_preconditioning
+		<< std::noboolalpha << std::endl;
 	thcout << "Smoother steps: " << sim_in.smoother_steps << std::endl;
 	thcout << "Test outer iterations: " << test_outer_iterations << std::endl;
-	thcout << "Maximum norm for residual: " << max_residual_norm << std::endl;
+	thcout << "Maximum norm for difference: " << max_diff_norm << std::endl;
 
 	// the output struct
 	struct output out;
 
 	// set standard exit code
-	grb::RC rc { SUCCESS };
+	grb::RC rc = SUCCESS;
 
 	// launch estimator (if requested)
 	if( sim_in.evaluation_run ) {
 		grb::Launcher< AUTOMATIC > launcher;
+		// run just one inner iteration for evaluation purposes
+		sim_in.inner_test_repetitions = 1;
+		thcout << "beginning evaluation run..." << std::endl;
 		rc = launcher.exec( &grbProgram, sim_in, out, true );
-		if( rc == SUCCESS ) {
-			sim_in.test_repetitions = out.test_repetitions;
-		} else {
-			thcout << "launcher.exec returns with non-SUCCESS error code " << grb::toString( rc ) << std::endl;
-			std::exit( -1 );
-		}
+		ASSERT_RC_SUCCESS( rc );
+		ASSERT_EQ( out.inner_test_repetitions, 1 );
+		// compute number of inner repetitions to achieve at least 1s duration
+		sim_in.inner_test_repetitions = static_cast< size_t >( 1000.0 / out.times.useful ) + 1;
+		thcout << "Evaluation run" << std::endl
+				<< "  computed residual: " << out.cg_out.norm_residual << std::endl
+				<< "  iterations: " << out.cg_out.iterations << std::endl
+				<< "  time taken (ms): " << out.times.useful << std::endl
+				<< "  deduced inner repetitions for 1s duration: " << sim_in.inner_test_repetitions
+				<< std::endl;
 	}
 
 	// launch full benchmark
 	grb::Benchmarker< AUTOMATIC > benchmarker;
+	thcout << "beginning test run..." << std::endl;
 	rc = benchmarker.exec( &grbProgram, sim_in, out, 1, test_outer_iterations, true );
 	ASSERT_RC_SUCCESS( rc );
-	thcout << "Benchmark completed successfully and took " << out.performed_iterations << " iterations to converge with residual " << out.residual << std::endl;
-
-	if( ! out.pinnedVector ) {
-		thcerr << "no output vector to inspect" << std::endl;
-	} else {
-		const PinnedVector< double > &solution { *out.pinnedVector };
-		thcout << "Size of x is " << solution.size() << std::endl;
-		if( solution.size() > 0 ) {
-			print_vector( solution, 30, "SOLUTION" );
-		} else {
-			thcerr << "ERROR: solution contains no values" << std::endl;
-		}
-	}
-
 	ASSERT_RC_SUCCESS( out.error_code );
-
-	double residual_norm { sqrt( out.square_norm_diff ) };
-	thcout << "Residual norm: " << residual_norm << std::endl;
-
-	ASSERT_LT( residual_norm, max_residual_norm );
+	thcout << "completed successfully!" << std::endl
+		   << "  final residual: " << out.cg_out.norm_residual << std::endl
+		   << "  solver iterations: " << out.cg_out.iterations << std::endl
+		   << "  total time (ms): " << out.times.useful << std::endl;
+
+	// check result vector, stored inside a pinned vector
+	ASSERT_TRUE( out.pinnedVector );
+	const PinnedVector< IOType > & solution = *out.pinnedVector; // same value type as output::pinnedVector
+	ASSERT_EQ( solution.size(), sim_in.nx * sim_in.ny * sim_in.nz );
+
+	// check norm of solution w.r.t. expected solution (i.e. vector of all 1)
+	double diff_norm = sqrt( out.square_norm_diff );
+	thcout << "Norm of difference vector: |<exact solution> - <actual solution>| = "
+		<< diff_norm << std::endl;
+	ASSERT_LT( diff_norm, max_diff_norm );
 
 	thcout << "Test OK" << std::endl;
 	return 0;
 }
 
-static void parse_arguments( simulation_input & sim_in, size_t & outer_iterations, double & max_residual_norm, int argc, char ** argv ) {
+static const char * const empty = "";
 
+static void parse_arguments( simulation_input & sim_in, size_t & outer_iterations,
+	double & max_diff_norm, int argc, char ** argv ) {
 	argument_parser parser;
+	const char *hpcg_csv, *mg_csv;
+
 	parser.add_optional_argument( "--nx", sim_in.nx, PHYS_SYSTEM_SIZE_DEF, "physical system size along x" )
 		.add_optional_argument( "--ny", sim_in.ny, PHYS_SYSTEM_SIZE_DEF, "physical system size along y" )
 		.add_optional_argument( "--nz", sim_in.nz, PHYS_SYSTEM_SIZE_DEF, "physical system size along z" )
-		.add_optional_argument( "--max_coarse-levels", sim_in.max_coarsening_levels, DEF_COARSENING_LEVELS,
-			"maximum level for coarsening; 0 means no coarsening; note: actual "
-			"level may be limited"
+		.add_optional_argument( "--max-coarse-levels", sim_in.max_coarsening_levels, MAX_COARSENING_LEVELS,
+			"maximum level for coarsening; 0 means no coarsening; note: actual level may be limited"
 			" by the minimum system dimension" )
-		.add_optional_argument( "--test-rep", sim_in.test_repetitions, grb::config::BENCHMARKING::inner(), "consecutive test repetitions before benchmarking" )
-		.add_optional_argument( "--init-iter", outer_iterations, grb::config::BENCHMARKING::outer(), "test repetitions with complete initialization" )
-		.add_optional_argument( "--max_iter", sim_in.max_iterations, MAX_ITERATIONS_DEF, "maximum number of HPCG iterations" )
-		.add_optional_argument( "--max-residual-norm", max_residual_norm, MAX_NORM,
-			"maximum norm for the residual to be acceptable (does NOT limit "
-			"the execution of the algorithm)" )
-		.add_optional_argument( "--smoother-steps", sim_in.smoother_steps, SMOOTHER_STEPS_DEF, "number of pre/post-smoother steps; 0 disables smoothing" )
+		.add_optional_argument( "--inner-iterations", sim_in.inner_test_repetitions, 1,
+			"consecutive test repetitions before benchmarking" )
+		.add_optional_argument( "--outer-iterations", outer_iterations, 1,
+			"test repetitions with complete initialization" )
+		.add_optional_argument( "--max-cg-iterations", sim_in.max_iterations, MAX_ITERATIONS_DEF,
+			"maximum number of CG iterations" )
+		.add_optional_argument( "--max-difference-norm", max_diff_norm, MAX_NORM, "maximum acceptable"
+			" norm |<exact solution> - <actual solution>| (does NOT limit the execution of the algorithm)" )
+		.add_optional_argument( "--smoother-steps", sim_in.smoother_steps, SMOOTHER_STEPS_DEF,
+			"number of pre/post-smoother steps; 0 disables smoothing" )
 		.add_option( "--evaluation-run", sim_in.evaluation_run, false,
-			"launch single run directly, without benchmarker (ignore "
-			"repetitions)" )
-		.add_option( "--no-preconditioning", sim_in.no_preconditioning, false, "do not apply pre-conditioning via multi-grid V cycle" );
+			"launch single run directly, without benchmarker (ignore repetitions)" )
+		.add_option( "--no-preconditioning", sim_in.no_preconditioning, false,
+			"do not apply pre-conditioning via multi-grid V cycle" )
+		.add_optional_argument( "--hpcg-csv", hpcg_csv, empty,
+			"file for HPCG run measurements (overwrites any previous)" )
+		.add_optional_argument( "--mg-csv", mg_csv, empty,
+			"file for Multigrid run measurements (overwrites any previous)" )
+		.add_option( "--use-average-coarsener", sim_in.use_average_coarsener, false,
+			"coarsen by averaging instead of by sampling a single point (slower, but more accurate)" );
 
 	parser.parse( argc, argv );
 
-	// check for valid values
-	size_t ssize { std::max( next_pow_2( sim_in.nx ), PHYS_SYSTEM_SIZE_MIN ) };
-	if( ssize != sim_in.nx ) {
-		std::cout << "Setting system size x to " << ssize << " instead of " << sim_in.nx << std::endl;
-		sim_in.nx = ssize;
-	}
-	ssize = std::max( next_pow_2( sim_in.ny ), PHYS_SYSTEM_SIZE_MIN );
-	if( ssize != sim_in.ny ) {
-		std::cout << "Setting system size y to " << ssize << " instead of " << sim_in.ny << std::endl;
-		sim_in.ny = ssize;
-	}
-	ssize = std::max( next_pow_2( sim_in.nz ), PHYS_SYSTEM_SIZE_MIN );
-	if( ssize != sim_in.nz ) {
-		std::cout << "Setting system size z to " << ssize << " instead of " << sim_in.nz << std::endl;
-		sim_in.nz = ssize;
-	}
 	if( sim_in.max_coarsening_levels > MAX_COARSENING_LEVELS ) {
-		std::cout << "Setting max coarsening level to " << MAX_COARSENING_LEVELS << " instead of " << sim_in.max_coarsening_levels << std::endl;
-		sim_in.max_coarsening_levels = MAX_COARSENING_LEVELS;
+		std::cerr << "ERROR: max coarsening level is " << sim_in.max_coarsening_levels <<
+			"; at most " << MAX_COARSENING_LEVELS << " is allowed" << std::endl;
+		std::exit( -1 );
 	}
-	if( sim_in.test_repetitions == 0 ) {
-		std::cerr << "ERROR no test runs selected: set \"--test-rep >0\"" << std::endl;
+	if( sim_in.inner_test_repetitions == 0 ) {
+		std::cerr << "ERROR no test runs selected: set \"--inner-iterations\" > 0" << std::endl;
 		std::exit( -1 );
 	}
 	if( sim_in.max_iterations == 0 ) {
-		std::cout << "Setting number of iterations to 1" << std::endl;
-		sim_in.max_iterations = 1;
+		std::cerr << "ERROR no CG iterations selected: set \"--max-cg-iterations > 0\"" << std::endl;
+		std::exit( -1 );
+	}
+
+	// check sizes: each one must be divisible by 2^levels, with a coarsest size that is large enough and even
+	const size_t max_system_divider = static_cast< size_t >( 1 ) << sim_in.max_coarsening_levels;
+	for( size_t s : { sim_in.nx, sim_in.ny, sim_in.nz } ) {
+		std::lldiv_t div_res = std::div( static_cast< long long >( s ),
+			static_cast< long long >( max_system_divider ) );
+		if( div_res.rem != 0 ) {
+			std::cerr << "ERROR: system size " << s << " cannot be coarsened " << sim_in.max_coarsening_levels
+				<< " times because it is not exactly divisible" << std::endl;
+			std::exit( -1 );
+		}
+		if( div_res.quot < static_cast< long long >( PHYS_SYSTEM_SIZE_MIN ) ) {
+			std::cerr << "ERROR: system size " << s << " cannot be coarsened " << sim_in.max_coarsening_levels
+				<< " times because it is too small" << std::endl;
+			std::exit( -1 );
+		}
+		if( div_res.quot % 2 != 0 ) {
+			std::cerr << "ERROR: the coarsest size " << div_res.quot << " is not even" << std::endl;
+			std::exit( -1 );
+		}
 	}
-}
 
+	// check output CSV file names and copy them with their terminator (buffers hold MAX_CSV_PATH_LENGTH + 1 chars)
+	size_t len = std::strlen( hpcg_csv );
+	if( ( sim_in.hpcg_log = len > 0 ) ) {
+		if( len > MAX_CSV_PATH_LENGTH ) {
+			std::cerr << "HPCG CSV file name is too long!" << std::endl;
+			std::exit( -1 );
+		}
+		std::strncpy( sim_in.hpcg_csv.data(), hpcg_csv, MAX_CSV_PATH_LENGTH + 1 );
+	}
+	len = std::strlen( mg_csv );
+	if( ( sim_in.mg_log = len > 0 ) ) {
+		if( len > MAX_CSV_PATH_LENGTH ) {
+			std::cerr << "MG CSV file name is too long!" << std::endl;
+			std::exit( -1 );
+		}
+		std::strncpy( sim_in.mg_csv.data(), mg_csv, MAX_CSV_PATH_LENGTH + 1 );
+	}
+}
diff --git a/tests/utils/matrix_generators.hpp b/tests/utils/matrix_generators.hpp
index be45890c6..65fe789be 100644
--- a/tests/utils/matrix_generators.hpp
+++ b/tests/utils/matrix_generators.hpp
@@ -35,6 +35,7 @@
 #include <iterator>
 #include <algorithm>
 
+#include <graphblas/utils/iterators/utils.hpp>
 
 namespace grb {
 
@@ -114,28 +115,6 @@ namespace grb {
 
 		namespace internal {
 
-			/**
-			 * Computes the difference between \a a and \a b and returns it as the given
-			 * type \a DiffT.
-			 *
-			 * Raises an exception if \a DiffT cannot store the difference.
-			 */
-			template<
-				typename SizeT,
-				typename DiffT
-			>
-			DiffT compute_distance(
-				const SizeT a,
-				const SizeT b
-			) {
-				const SizeT diff = std::max( a, b ) - std::min( a, b );
-				if( diff > static_cast< SizeT >( std::numeric_limits< DiffT >::max() ) ) {
-					throw std::range_error( "cannot represent difference" );
-				}
-				DiffT result = static_cast< DiffT >( diff );
-				return a >= b ? result : -result ;
-			}
-
 			/**
 			 * Stores the coordinate for a generator of diagonal matrices.
 			 */
@@ -240,9 +219,8 @@ namespace grb {
 				typename SelfType::difference_type operator-(
 					const SelfType &other
 				) const {
-					return internal::compute_distance<
-						size_t, typename SelfType::difference_type
-					>( this->_v.coord, other._v.coord );
+					return compute_signed_distance< typename SelfType::difference_type,
+						size_t >( this->_v.coord, other._v.coord );
 				}
 
 				typename SelfType::pointer operator->() { return &_v; }
@@ -461,9 +439,8 @@ namespace grb {
 					const size_t this_position = coords_to_linear( _v.size, _v.row, _v.col );
 					const size_t other_position =
 						coords_to_linear( other._v.size, other._v.row, other._v.col );
-					return internal::compute_distance<
-						size_t, typename SelfType::difference_type
-					>( this_position, other_position );
+					return compute_signed_distance< typename SelfType::difference_type,
+						size_t >( this_position, other_position );
 				}
 
 				typename SelfType::pointer operator->() { return &_v; }
@@ -584,9 +561,8 @@ namespace grb {
 				typename SelfType::difference_type operator-(
 					const SelfType &other
 				) const {
-					return internal::compute_distance<
-						size_t, typename SelfType::difference_type
-					>( this->_v.offset, other._v.offset );
+					return compute_signed_distance< typename SelfType::difference_type,
+						size_t >( this->_v.offset, other._v.offset );
 				}
 
 				typename SelfType::pointer operator->() { return &_v; }