diff --git a/include/graphblas/algorithms/simulated_annealing_re.hpp b/include/graphblas/algorithms/simulated_annealing_re.hpp new file mode 100644 index 000000000..21f9d90fb --- /dev/null +++ b/include/graphblas/algorithms/simulated_annealing_re.hpp @@ -0,0 +1,810 @@ +/* + * Copyright 2025 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Provides a Simulated Annealing-Replica Exchange QUBO optimizator. + * + * @author Giovanni Gaio + * @date TODO 2025 + */ + +#ifndef _H_GRB_ALGORITHMS_SA_RE +#define _H_GRB_ALGORITHMS_SA_RE + +#include +#include +#include +#include +#include +#include +#include + +#ifndef NDEBUG +#include +#endif + +#include + +#define ISCLOSE(a,b) (std::abs((b)-(a))/std::abs(a) < 1e-4) || (std::abs((b)-(a)) < 1e-4) + +namespace grb { + namespace algorithms { + + /* + * Do a Parallel Tempering pass. + * This means exchanging states at low temperature with states at higher temperature. + * + * TODO: Complete this documentation. + * + * @param[in,out] states On input: initial states. + * @param[in,out] energies The initial energy of each state. + * @param[in,out] betas Inverse temperature of each state. + * The betas may be permuted. + * + * @tparam StateType The state variable type. + * @tparam EnergyType The energy type. + * @tparam TempType The inverse temperature type. + * + * This implementation of parallel tempering does not use any spmd characteristics. + */ + template< + Backend backend, + typename StateType, + typename EnergyType, + typename TempType + > + typename std::enable_if< + (grb::_GRB_BACKEND != grb::BSP1D) || (backend == grb::BSP1D), + grb::RC >::type + pt( + std::vector< grb::Vector< StateType, backend > > &states, + grb::Vector< EnergyType, backend > &energies, + const grb::Vector< TempType, backend > &betas, + const int seed = 42 + ){ + + const size_t n_replicas = states.size(); + // const size_t s = spmd<>::pid(); + // const size_t nprocs = spmd<>::nprocs(); + grb::RC rc = grb::SUCCESS; + std::minstd_rand rng ( seed ); + std::exponential_distribution< EnergyType > rand ( 1.0 ); + + for( size_t i = n_replicas - 1 ; i > 0 ; --i ){ + const EnergyType de = ( energies[ i ] - energies[ i-1 ]) * (betas[ i ] - betas[ i-1 ]); + + if( -rand( rng ) < de ){ + std::swap( states[i], states[i-1] ); + std::swap( energies[i], energies[i-1] ); + } + } + + return rc; + } + + /* + * Implementation of parallel tempering using spmd. 
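+ *
+ * Both overloads decide a swap between neighbouring replicas i-1 and i by
+ * the same rule: with de = ( E_i - E_{i-1} ) * ( beta_i - beta_{i-1} ), a
+ * sample X ~ Exp(1) is drawn and the swap is accepted whenever -X < de,
+ * i.e. with probability min( 1, exp( de ) ), the usual replica-exchange
+ * Metropolis criterion. In this overload the pair of replicas that
+ * straddles a process boundary is additionally exchanged by broadcasting
+ * the border states, energies, and random draws between the two processes.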
+ */ + template< + Backend backend, + typename StateType, + typename EnergyType, + typename TempType + > + typename std::enable_if< + (grb::_GRB_BACKEND == grb::BSP1D) && (backend != grb::BSP1D), + grb::RC >::type + pt( + std::vector< grb::Vector< StateType, backend > > &states, + grb::Vector< EnergyType, backend > &energies, + const grb::Vector< TempType, backend > &betas, + const int seed = 42 + ){ + static_assert( backend != grb::BSP1D ); + // static_assert( grb::_GRB_BACKEND == grb::BSP1D ); + + const size_t n = grb::size( states[0] ); + const size_t n_replicas = states.size(); + const size_t s = spmd<>::pid(); + const size_t nprocs = spmd<>::nprocs(); + grb::RC rc = grb::SUCCESS; + +#ifndef NDEBUG + assert( grb::size(energies) == n_replicas ); + assert( grb::size(betas) == n_replicas ); +#endif + struct data { + EnergyType e; + TempType b; + EnergyType r; + }; + // TODO: should these two be static? Probably. + grb::Vector< StateType, backend > s0 ( n ); + grb::Vector< StateType, backend > s1 ( n ); + grb::set( s0, static_cast< StateType >( 0 ) ); + grb::set( s1, static_cast< StateType >( 0 ) ); + + struct data msg[ 2 ]; + rc = rc ? rc : grb::resize( s0, n ); + rc = rc ? rc : grb::resize( s1, n ); + if( rc != grb::SUCCESS ) return rc; + + std::minstd_rand rng; + std::exponential_distribution< EnergyType > rand ( 1.0 ); + + rng.seed( seed + s ); + const EnergyType myrand = -rand( rng ); + + for( size_t si = nprocs ; rc == grb::SUCCESS && si > 0; --si ){ + if( si-1 == s ){ + for( size_t i = n_replicas - 1 ; i > 0 ; --i ){ + const EnergyType de = ( energies[ i ] - energies[ i-1 ]) * (betas[ i ] - betas[ i-1 ]); + + if( -rand( rng ) < de ){ + std::swap( states[i], states[i-1] ); + std::swap( energies[i], energies[i-1] ); + } + } + grb::set( s1, states[0] ); + msg[ 1 ].e = energies[ 0 ]; + msg[ 1 ].b = betas[0]; + msg[ 1 ].r = myrand; + }else if( si-2 == s ){ + grb::set( s0, states[ n_replicas - 1 ] ); + msg[ 0 ].e = energies[ n_replicas - 1 ]; + msg[ 0 ].b = betas[ n_replicas - 1 ]; + msg[ 0 ].r = myrand; + } + if( si == 1 ) continue; + +#ifdef _GRB_WITH_LPF + rc = rc ? rc : grb::collectives<>::broadcast( msg[ 0 ].e, si-2 ); + rc = rc ? rc : grb::collectives<>::broadcast( msg[ 0 ].b, si-2 ); + rc = rc ? rc : grb::collectives<>::broadcast( msg[ 0 ].r, si-2 ); + rc = rc ? rc : grb::collectives<>::broadcast( msg[ 1 ].e, si-1 ); + rc = rc ? rc : grb::collectives<>::broadcast( msg[ 1 ].b, si-1 ); + rc = rc ? rc : grb::collectives<>::broadcast( msg[ 1 ].r, si-1 ); +#else + assert( false ); // this should never run +#endif + +#ifndef NDEBUG + + if( rc != grb::SUCCESS ){ + std::cerr << "\n\t Error in a collective broadcast " << rc << " : " << + grb::toString( rc ) << std::endl; + } + assert( rc == grb::SUCCESS ); +#endif + + const EnergyType de = ( msg[ 1 ].e - msg[ 0 ].e ) * ( msg[ 1 ].b - msg[ 0 ].b ); + + if( rc == grb::SUCCESS && ( msg[ 1 ].r < de ) ){ +#ifdef _GRB_WITH_LPF + rc = rc ? rc : grb::internal::broadcast( s1, si-1 ); + rc = rc ? rc : grb::internal::broadcast( s0, si-2 ); + assert( grb::nnz(s0) == n ); // state has to be dense! + assert( grb::nnz(s1) == n ); // state has to be dense! +#else + assert( false ); // this should never run +#endif + if( si-1 == s ){ + rc = rc ? rc : grb::set( states[ 0 ], s0 ); + rc = rc ? rc : grb::setElement( energies, msg[ 0 ].e, 0 ); + }else if( si-2 == s ){ + rc = rc ? rc : grb::set( states[ n_replicas - 1 ], s1 ); + rc = rc ? 
rc : grb::setElement( energies, msg[ 1 ].e, n_replicas - 1 ); + } + } + } + return rc; + } + + + /* + * Estimate a solution to a given optimization problem. The solution is found + * using Simulated Annealing-Replica Exchange (also known as Parallel Tempering). + * + * The state will be optimized to minimize the value of an energy function $U(x)$, + * where $x$ is the state vector. Energies will be changed when changing the + * states, so that each energy is the actual energy of the relative state. + * + * The parameter sweep is a user-defined function that changes a given state + * (possibly randomly) and returns the variation of energy made from its + * changes of the state. It should take three parameters: a state vector, the + * inverse temperature (a scalar) and sweep_data. + * + * @param[in] sweep The sweeping function. + * Should return the energy variation relative to the changes that it + * made on the state. + * @param[in] sweep_data Additional data to be passed to the sweep function. + * @param[in,out] states On input: initial states. + * On output: optimized states. + * @param[in,out] energies The initial energy of each state. + * @param[in,out] betas Inverse temperature of each state. + * @param[in,out] best_state The state with the minimum energy found by the algorithm. + * @param[in,out] best_energy The minimum value of an energy found. + * @param[in] n_sweeps Number of Simulated Annealing iterations. + * @param[in] use_pt Whether to use Parallel Tampering or not. + * + * @tparam backend The backend used for the single objects + * @tparam StateType The state variable type. + * @tparam EnergyType The energy type. + * @tparam TempType The inverse temperature type. + * @tparam SweepDataType Type of data to be passed on to the sweep function + * (e.g. a tuple of references to temporary vectors). + * @tparam SweepFuncType The type of the function. + * The default value suggests the signature that the function should have. 
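+ *
+ * For illustration only (MySweepData and the chosen scalar types are
+ * placeholders, not part of this header), a conforming sweep callable
+ * could look like:
+ *
+ * \code
+ * auto my_sweep = []( grb::Vector< bool > &state, const double &beta,
+ *                     MySweepData &data ) -> double {
+ *     // propose (possibly random) changes to state at inverse temperature
+ *     // beta, using the scratch space carried in data, and return the
+ *     // resulting change in energy
+ *     return 0.0;
+ * };
+ * \endcode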
+ * + */ + template< + Backend backend, + typename StateType, // type of state, possibly 0/1 + typename EnergyType, + typename TempType, + typename SweepDataType, // type of data to be passed through to the sweep function + typename SweepFuncType = std::function< + EnergyType( + grb::Vector< StateType, backend >&, + const TempType&, + SweepDataType& + ) + > + > + grb::RC simulated_annealing_RE( + const SweepFuncType &sweep, + SweepDataType& sweep_data, + std::vector< grb::Vector< StateType, backend > > &states, + grb::Vector< EnergyType, backend > &energies, + grb::Vector< TempType, backend > &betas, + grb::Vector< StateType, backend > &best_state, + EnergyType &best_energy, + const size_t &n_sweeps, + const EnergyType &goal = 0, + const bool &use_pt = false, + const size_t &seed = 42 + ){ + + const size_t s = spmd<>::pid(); + const size_t n_procs = spmd<>::nprocs(); + const size_t n_replicas = states.size(); + const size_t n = grb::size(states[0]); + (void) n; + (void) s; + + grb::RC rc = grb::SUCCESS; + +#ifndef NDEBUG + assert( n_replicas > 0 ); + assert( n_replicas == grb::size( betas ) ); + + for(size_t i = 0; i < n_replicas ; ++i ){ + assert( n == grb::size( states[ i ] ) ); + } + if( s == 0 ) { + std::cerr << "DEBUG: Called simulated_annealing_RE with parameters: " + << "\n\t n = " << n + << "\n\t n_replicas = " << n_replicas + << "\n\t n_sweeps = " << n_sweeps + << "\n\t goal = " << goal + << "\n\t use_pt = " << use_pt + << "\n\t seed = " << seed + << std::endl; + } + assert( grb::size(best_state) == n ); +#endif + + best_energy = std::numeric_limits< EnergyType >::max(); + + for( size_t i_sweep = 0 ; rc == grb::SUCCESS && i_sweep < n_sweeps ; ++i_sweep ){ + for( size_t j = 0 ; j < n_replicas ; ++j ){ + + energies[j] += sweep( states[j], betas[j], sweep_data ); + grb::wait(); + + // update_best state and energy + if( energies[j] < best_energy ){ + best_energy = energies[j]; + best_state = states[j]; + } + } // n_replicas + + // TODO: find a better way than this, to avoid a sync at each iteration + rc = rc ? rc : grb::collectives<>::allreduce( + best_energy, grb::operators::min< EnergyType >() ); + + if( rc == SUCCESS && use_pt ){ + // do a Parallel Tempering move + rc = pt( states, energies, betas, seed + i_sweep ); + } + +#ifndef NDEBUG + if( s == 0 ) { + std::cerr << "Energy at iteration " << i_sweep << " = " << best_energy << std::endl; + } +#endif + if( goal < -1 && best_energy <= goal ) i_sweep = n_sweeps; + } // n_sweeps + +#ifndef NDEBUG + if( rc != grb::SUCCESS ){ + std::cerr << "ERROR at line " << __LINE__ << " in " + << __FILE__ << ": " << grb::toString( rc ) << "\n"; + } +#endif + if( rc == SUCCESS ){ + rc = rc ? rc : grb::collectives<>::allreduce( + best_energy, grb::operators::min< EnergyType >() ); + // TODO: update best state to match best energy + } + + return rc; + } + + /* + * Create a set of independent masks. + * + * Uses a graph coloring algorithm. + * Adapted from Alg. 2 of `Graph Coloring on the GPU, M. Osama, M. Truong, C. Yang, A. Buluc, J.D. Owens`. + * + * @param[out] masks The vector of constructed masks. + * @param[in] A The square (symmetric) A matrix. It should not contain any explicit zeros! + * @param[in] frontier Temporary (dense) vector used in the function + * @param[in] w Temporary (dense) vector used in the function + * @param[in] seed seed for randomization + * + * @tparam descr grb::Descriptor for matrix operations. 
Should probably not be used + * @tparam backend The backend of GraphBLAS to be used + * @tparam MaskType The state variable type. + * @tparam AType The matrix values' type. + * + */ + template< + grb::Descriptor descr = grb::descriptors::no_operation, + Backend backend, + typename MaskType, + typename AType, + typename RSI, typename CSI, typename NZI + > + grb::RC matrix_partition( + std::vector< grb::Vector< MaskType, backend > > &masks, + const grb::Matrix< AType, backend, RSI, CSI, NZI > &A, + grb::Vector< AType, backend > &frontier, + grb::Vector< AType, backend > &w, + const int seed = 42 + ) { + masks.clear(); + grb::RC rc = grb::SUCCESS; + const size_t n = grb::nrows( A ); + const size_t s = spmd<>::pid(); + assert( n == grb::ncols( A ) ); // A needs to be square + // assert( grb::is_symmetric( A ) ); + (void) s; + + grb::resize( frontier, n ); + grb::resize( w, n ); + + std::minstd_rand rng ( seed ); + + // random shuffle w + for( size_t i = 0 ; i < n ; ++i ){ + rc = rc ? rc : grb::setElement( w, i+1, i ); + } + for( size_t i = 0 ; i < n ; ++i ){ + std::uniform_int_distribution< size_t > rand ( i, n-1 ); + const auto j = rand( rng ); + const auto a = w[i]; + const auto b = w[j]; + rc = rc ? rc : grb::setElement( w, b, i ); + rc = rc ? rc : grb::setElement( w, a, j ); + } + + const grb::Semiring< + grb::operators::max< AType >, grb::operators::right_assign< AType >, + grb::identities::negative_infinity, grb::identities::zero + > maxTimesRing; + const grb::Monoid< grb::operators::add< AType >, grb::identities::zero > addMonoid; + const grb::operators::greater_than< AType > gtOp; + const grb::Monoid< grb::operators::right_assign< AType >, grb::identities::zero > right_assign; + + for( size_t i = 0; rc == grb::SUCCESS && i < n ; ++i ) { + // find max of neighbors + rc = rc ? rc : grb::set< descr >( frontier, static_cast< AType >( 0 ) ); + rc = rc ? rc : grb::mxv< descr | grb::descriptors::dense >( frontier, A, w, maxTimesRing ); + rc = rc ? rc : grb::foldl< descr | grb::descriptors::dense >( frontier, w, gtOp ); + + // is there any new node? + AType succ = static_cast< AType >( 0 ); + rc = rc ? rc : grb::foldl< descr >( succ, frontier, addMonoid ); + if( succ <= 0 ){ + break; + } + + // add new mask + masks.emplace_back( grb::Vector< bool, backend >( n ) ); + auto &new_mask = masks.at(i); + rc = rc ? rc : grb::resize( new_mask, n ); + rc = rc ? rc : grb::set< descr >( new_mask, frontier, static_cast< MaskType >(true) ); + + // do not consider the weights of used nodes + rc = rc ? rc : grb::foldl< descr >( w, new_mask, + static_cast< AType >( 0 ), right_assign ); + } + assert( rc == grb::SUCCESS ); + +#ifndef NDEBUG + if( rc != grb::SUCCESS) { + std::cerr << "Error in matrix_partition: " << rc << " " << grb::toString(rc) << std::endl; + + } + size_t cnt = 0; + if( s == 0 ) { + std::cerr << "Final masks: \n"; + } + for(const auto&mask : masks ){ + for( const auto &x : mask ){ + if( x.second ){ + if( s == 0 ) { + std::cerr << x.first << ", "; + } + cnt++; + } + } + if( s == 0 ) { + std::cerr << std::endl; + } + } + if( s == 0 ){ + assert( cnt == n ); + } +#endif + return rc; + } + + /* + * Estimate a solution to a given Ising problem. The solution is found + * using the Simulated Annealing-Replica Exchange function above. + * + * The function minimized is $U(x) = x^T(\frac{1}{2}Jx + h)$, where $J$ is the supplied + * couplings matrix and $h$ is the local_fields vector. The solution is searched + * in the space of vectors $x$ with entries $0$ or $1$. 
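+ *
+ * Since the diagonal of $J$ is zero, flipping a single entry $x_i \to 1-x_i$
+ * changes the energy by $\Delta U_i = (1 - 2x_i)\left( (Jx)_i + h_i \right)$.
+ * The sweep used by this function proposes such flips on independent sets of
+ * variables (computed via matrix_partition below) and accepts each proposed
+ * flip with probability $\min(1, e^{-\beta \Delta U_i})$.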
+ * + * states should be a vector of already initialized and filled dense grb::Vector. + * + * Warning: This function allocates $O(n)$ memory for temporary vectors. + * + * @param[in,out] states On input: initial (dense) states. + * On output: optimized (dense) states. + * @param[in] couplings The square (symmetric) couplings matrix. + * The diagonal has to be zero! + * @param[in] local_fields The vector of local fields. + * @param[in,out] energies The initial energy of each state. + * @param[in,out] betas Inverse temperature of each state. + * @param[in] n_sweeps Number of Simulated Annealing iterations. + * @param[in] use_pt Whether to use Parallel Tampering or not. + * @param[in] seed Seed to use for internal randomization (must be the same for all processees); + * + * @tparam StateType The state variable type. + * @tparam QType The matrix values' type. + * @tparam EnergyType The energy type. + * @tparam TempType The inverse temperature type. + * @tparam SweepDataType Type of data to be passed on to the sweep function + * (e.g. a tuple of references to temporary vectors). + * + */ + template< + Backend backend, + grb::Descriptor descr = grb::descriptors::no_operation, + bool empty_local_fields = false, + typename StateType, // type of state, possibly 0/1 + typename QType, // type of coupling matrix values + typename EnergyType, + typename TempType, + typename RSI, typename CSI, typename NZI, + class Ring = Semiring< + grb::operators::add< QType >, grb::operators::mul< QType >, + grb::identities::zero, grb::identities::one + > + > + grb::RC simulated_annealing_RE_Ising( + const grb::Matrix< QType, backend, RSI, CSI, NZI > &couplings, + const grb::Vector< QType, backend> &local_fields, + std::vector< grb::Vector< StateType, backend > > &states, + grb::Vector< EnergyType, backend > &energies, + grb::Vector< TempType, backend > &betas, + grb::Vector< StateType, backend > &best_state, + EnergyType &best_energy, + const size_t &n_sweeps, + const EnergyType &goal = 0, + const bool &use_pt = false, + const int seed = 42, + const Ring &ring = Ring() + ){ + const size_t n = grb::size( states[0] ); + const size_t n_replicas = grb::size(betas); + const size_t s = spmd<>::pid(); + (void) s; + grb::RC rc = grb::SUCCESS; + + assert( grb::nnz(states[0]) == n ); // state is dense + assert( states.size() == n_replicas ); + // assert( grb::is_symmetric( couplings ) ); + + assert( empty_local_fields || ( grb::size( local_fields ) == n ) ); + assert( empty_local_fields || ( grb::nnz(local_fields) == n ) ); + EnergyType energy; + grb::Vector< EnergyType, backend > tmp_calc_energy ( n ); + + const auto get_energy = [&couplings, &local_fields, &tmp_calc_energy, &ring, &n]( + EnergyType &energy, const grb::Vector< StateType, backend > &state + ){ + assert( n == grb::size( state ) ); + assert( n == grb::ncols( couplings ) ); + assert( n == grb::nrows( couplings ) ); + grb::RC rc = grb::SUCCESS; + constexpr auto dense_descr = descr | grb::descriptors::dense; + + grb::set( tmp_calc_energy, static_cast( 0.0 ) ); + rc = rc ? rc : grb::mxv< dense_descr >( tmp_calc_energy, couplings, state, ring ); + rc = rc ? rc : grb::foldl< dense_descr >( tmp_calc_energy, static_cast< EnergyType >( 0.5 ), + ring.getMultiplicativeMonoid() ); + if( !empty_local_fields) { + rc = rc ? rc : grb::foldl< dense_descr >( tmp_calc_energy, local_fields, ring.getAdditiveMonoid() ); + } + rc = rc ? 
rc : grb::dot< dense_descr >( energy, tmp_calc_energy, state, ring ); + return rc; + }; + + // it is reasonable to allow the energies to be allocated and evaluated by this function + if( grb::nnz(energies) == 0 ){ + grb::resize( energies, n_replicas ); + + for(size_t i = 0 ; i < n_replicas ; ++i){ + energy = static_cast< EnergyType >( 0.0 ); + rc = rc ? rc : get_energy( energy, states[i] ); + grb::setElement( energies, energy, i ); + } + } + + grb::Vector< QType, backend > h ( n ); + grb::Vector< QType, backend > rand ( n ); + grb::Vector< QType, backend > delta ( n ); + grb::Vector< QType, backend > dn ( n ); + grb::Vector< bool, backend > accept ( n ); + std::minstd_rand rng ( seed ); // minstd_rand or std::mt19937 + + rc = rc ? rc : grb::resize( h, n ); + rc = rc ? rc : grb::resize( rand, n ); + rc = rc ? rc : grb::resize( delta, n ); + rc = rc ? rc : grb::resize( dn, n ); + rc = rc ? rc : grb::resize( accept, n ); + + std::vector< grb::Vector< bool, backend > > masks ; + rc = rc ? rc : matrix_partition< descr >( masks, couplings, h, rand, seed ); + rc = rc ? rc : grb::clear(h); + constexpr auto dense_descr = descr | grb::descriptors::dense; + + auto sweep_data = std::tie( + (const decltype(couplings)&) couplings, + (const decltype(local_fields)&) local_fields, + (const decltype(masks)&) masks, + h, + rand, + delta, + dn, + accept, + rng, + (const decltype(ring)&) ring + ); + +#ifdef NDEBUG + const auto ising_sweep = []( +#else + const auto ising_sweep = [&get_energy]( +#endif + grb::Vector< StateType, backend > &state, + const TempType &beta, + decltype(sweep_data) &data + ){ + const size_t s = spmd<>::pid(); + (void) s; + + const auto &couplings = std::get<0>(data); + const auto &local_fields = std::get<1>(data); + const auto &masks = std::get<2>(data); + auto &h = std::get<3>(data); + auto &rand = std::get<4>(data); + auto &delta = std::get<5>(data); + auto &dn = std::get<6>(data); + auto &accept = std::get<7>(data); + auto &rng = std::get<8>(data); + const auto &ring = std::get<9>(data); + + const size_t n = grb::size( state ); + EnergyType delta_energy = static_cast< EnergyType >(0.0); + grb::RC rc = grb::SUCCESS; + + assert( grb::nnz(state) == n ); // state has to be dense! + + if( !empty_local_fields) { + rc = rc ? rc : grb::set< descr >( h, local_fields ); + }else { + rc = rc ? rc : grb::set< descr >( h, static_cast< QType >( 0.0 ) ); + } + rc = rc ? rc : grb::mxv< dense_descr >( h, couplings, state , ring ); + + std::exponential_distribution< EnergyType > rand_gen ( beta ); + for( size_t i = 0 ; i < n; ++i ){ + const auto rnd = -rand_gen( rng ); + rc = rc ? rc : grb::setElement( rand, rnd, i ); + } + + const grb::operators::leq< EnergyType > leq_operator; + const grb::operators::right_assign< EnergyType > right_assign_op; + const grb::operators::not_equal< EnergyType > neq_operator; +#ifndef NDEBUG + const grb::Vector< StateType, backend > old_state = state; +#endif + rc = rc ? rc : grb::wait(); + for(const auto &mask : masks ){ + // dn = (2*state_slice - 1) * h_slice + rc = rc ? rc : grb::set< descr >( dn, mask, state ); + rc = rc ? rc : grb::foldl< descr | grb::descriptors::invert_mask >( dn, state, static_cast< QType >( -1 ), right_assign_op ); + rc = rc ? 
rc : grb::foldl< descr >( dn, h, ring.getMultiplicativeMonoid() ); + assert( grb::nnz( dn ) == grb::nnz( mask ) ); +#ifndef NDEBUG + for( const auto x : dn ){ + assert( mask[x.first] == 1 ); + assert( (2*int(state[x.first])-1)*h[x.first] == x.second ); + } + const auto dn0 = dn; +#endif + + // Choose which changes to accept + // ( dn >= 0 ) | ( rand/beta < dn ) + rc = rc ? rc : grb::foldl< descr >( dn, rand, leq_operator ); + rc = rc ? rc : grb::set< descr >( accept, dn, mask ); + assert( grb::nnz( accept ) <= grb::nnz( mask ) ); +#ifndef NDEBUG + size_t cnt = 0; + for( const auto x : dn0 ){ + const size_t i = x.first; + assert( mask[x.first] == 1 ); + assert( x.second ); + if( x.second >= rand[i] ){ + assert( dn[i] == 1 ); + assert( accept[i] == 1 ); + cnt++; + }else{ + assert( dn[i] == 0 ); + } + } + assert( grb::nnz( accept ) == cnt ); +#endif + + // new_state = np.where(accept, 1 - old, old) + rc = rc ? rc : grb::foldl< descr >( state, accept, static_cast< StateType >( 1 ), neq_operator ); + + // delta = new - old ==> delta[accept] = 2*new_state[accept]-1 + rc = rc ? rc : grb::set< descr >( delta, accept, state ); + rc = rc ? rc : grb::foldl< descr | grb::descriptors::invert_mask >( delta, delta, static_cast< QType >( -1 ), right_assign_op ); + + // Update delta_energy -= dot(dn, accept) + rc = rc ? rc : grb::dot< descr >( delta_energy, delta, h, ring ); + + // update h + rc = rc ? rc : grb::mxv< descr >( h, couplings, delta, ring ); + } + rc = rc ? rc : grb::wait(); + +#ifndef NDEBUG + if( rc != grb::SUCCESS ){ + std::cerr << "\n\t Error in some GraphBLAS function of ising_sweep " << rc << " : " << grb::toString( rc ) << std::endl; + abort(); + } + assert( rc == grb::SUCCESS ); + const auto new_state = state; + rc = rc ? rc : grb::wait(); + + EnergyType e1 = static_cast< EnergyType >( 0.0 ), + e2 = static_cast< EnergyType >( 0.0 ); + get_energy(e1, old_state); + get_energy(e2, new_state); + const auto real_delta = e2 - e1; + if( s == 0 ){ + std::cerr << "\n\t Delta_energy: " << delta_energy; + std::cerr << "\n\t Real delta: " << real_delta; + std::cerr << "\n\t Discrepancy: " << real_delta - delta_energy; + std::cerr << std::endl; + } + assert( ISCLOSE(real_delta, delta_energy ) ); +#endif + + return delta_energy; + }; + + return simulated_annealing_RE( + ising_sweep, sweep_data, states, energies, betas, best_state, best_energy, n_sweeps, goal, use_pt, seed + ); + } + + /* + * Estimate a solution to a given QUBO problem. The solution is found + * using the Simulated Annealing-Replica Exchange function above. + * + * The function optimized is $U(x) = \frac{1}{2}x^TQx$, with the constraint that $x$ is a + * 0/1 vector. + * + * TODO: expand and complete documentation + * + * Warning: This function allocates O(n*n_replicas) memory for temporary vectors. + * + * @param[in,out] states On input: initial (dense) states. + * On output: optimized (dense) states. + * @param[in] Q The square (symmetric) Q matrix. + * @param[in,out] energies The initial energy of each state. + * @param[in,out] betas Inverse temperature of each state. + * @param[in] n_replicas Number of replicas to run concurrently. + * @param[in] n_sweeps Number of Simulated Annealing iterations. + * @param[in] use_pt Whether to use Parallel Tampering or not. + * + * @tparam StateType The state variable type. + * @tparam QType The matrix values' type. + * @tparam EnergyType The energy type. + * @tparam TempType The inverse temperature type. 
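+ *
+ * A minimal call sketch (illustrative only; the types, the construction of
+ * the dense 0/1 state vectors, and all error checking are placeholders):
+ *
+ * \code
+ * grb::Matrix< double > Q( n, n );              // symmetric QUBO matrix
+ * std::vector< grb::Vector< int8_t > > states;  // n_replicas dense 0/1 states
+ * grb::Vector< double > energies( n_replicas ), betas( n_replicas );
+ * grb::Vector< int8_t > best_state( n );
+ * double best_energy;
+ * grb::RC rc = grb::algorithms::simulated_annealing_RE_QUBO(
+ *     Q, states, energies, betas, best_state, best_energy, n_sweeps );
+ * \endcode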
+ * + */ + template< + Backend backend, + grb::Descriptor descr = grb::descriptors::no_operation, + typename StateType, // type of state, possibly 0/1 + typename QType, // type of coupling matrix values + typename EnergyType, + typename TempType, + typename RSI, typename CSI, typename NZI, + class Ring = Semiring< + grb::operators::add< QType >, grb::operators::mul< QType >, + grb::identities::zero, grb::identities::one + > + > + grb::RC simulated_annealing_RE_QUBO( + const grb::Matrix< QType, backend, RSI, CSI, NZI > &Q, + std::vector< grb::Vector< StateType, backend > > &states, + grb::Vector< EnergyType, backend > &energies, + grb::Vector< TempType, backend > &betas, + grb::Vector< StateType, backend > &best_state, + EnergyType &best_energy, + const size_t &n_sweeps, + const EnergyType &goal = 0, + const bool &use_pt = false, + const int seed = 42, + const Ring &ring = Ring() + ){ + grb::Vector< QType > empty_local_fields ( 0 ); + + return simulated_annealing_RE_Ising< backend, descr, true >( + Q, empty_local_fields, states, energies, betas, best_state, best_energy, n_sweeps, goal, use_pt, seed, ring + ); + } + } // namespace algorithms +} // end namespace grb +#undef ISCLOSE + +#endif // end _H_GRB_ALGORITHMS_SA-RE + + diff --git a/tests/smoke/CMakeLists.txt b/tests/smoke/CMakeLists.txt index 7e7f2af4a..cfb1d8506 100644 --- a/tests/smoke/CMakeLists.txt +++ b/tests/smoke/CMakeLists.txt @@ -144,6 +144,21 @@ add_grb_executables( conjugate_gradient_complex conjugate_gradient.cpp COMPILE_DEFINITIONS _CG_COMPLEX ) +add_grb_executables( simulated_annealing_re_from_mpi simulated_annealing_re_from_mpi.cpp + BACKENDS bsp1d + ADDITIONAL_LINK_LIBRARIES test_utils_headers +) + +add_grb_executables( simulated_annealing_re_ising simulated_annealing_re_ising.cpp + BACKENDS reference reference_omp hyperdags nonblocking + ADDITIONAL_LINK_LIBRARIES test_utils_headers +) + +add_grb_executables( simulated_annealing_re_planted_sol simulated_annealing_re_planted_sol.cpp + BACKENDS reference reference_omp hyperdags nonblocking bsp1d + ADDITIONAL_LINK_LIBRARIES test_utils_headers +) + add_grb_executables( gmres gmres.cpp BACKENDS reference reference_omp bsp1d hybrid hyperdags nonblocking ADDITIONAL_LINK_LIBRARIES test_utils_headers diff --git a/tests/smoke/simulated_annealing_re_from_mpi.cpp b/tests/smoke/simulated_annealing_re_from_mpi.cpp new file mode 100644 index 000000000..1c84c5078 --- /dev/null +++ b/tests/smoke/simulated_annealing_re_from_mpi.cpp @@ -0,0 +1,756 @@ +/* + Minimal scaffold adapted from ising_machine_sb.cpp to drive a replica-exchange + simulated-annealing (RE-SA) solver. Algorithmic parts are intentionally left + unimplemented (stubs). This file mirrors the existing IO / launcher / + program structure and replaces numpy arrays with grb::Vector and lists of + numpy vectors with std::vector< grb::Vector<...> >. Sparse matrices are + represented as grb::Matrix< JType >. + + Purpose: allow running internal tests or an external-run mode while the RE-SA + algorithm is implemented separately. 
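+
+  Example invocation (illustrative; the exact binary name and MPI launcher
+  depend on the local build):
+
+    mpirun -np 2 ./simulated_annealing_re_from_mpi --use-default-data \
+        --n-replicas 3 --nsweeps 2 --use-pt 1 --seed 8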
+*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +const int LPF_MPI_AUTO_INITIALIZE = 0; + +using namespace grb; + +// #define DEBUG_SARE 1 +constexpr size_t MAX_FN_SIZE = 255; + +// Types +using IOType = int8_t; // scalar/vector element type +using JType = float; // coupling (matrix) value type +using EnergyType = double; // energy value type + +template< typename T1, typename T2 > +inline bool ISCLOSE( const T1 &a, const T2 &b, const double tol = 1e-4){ + return (std::abs((b)-(a))/std::abs(a) < tol) || (std::abs((b)-(a)) < tol); +} + +// Backend to be used inside each process +constexpr grb::Backend internal_backend = grb::reference; + +/** Parser type */ +typedef grb::utils::MatrixFileReader< + JType, + std::conditional< + (sizeof(grb::config::RowIndexType) > sizeof(grb::config::ColIndexType)), + grb::config::RowIndexType, + grb::config::ColIndexType + >::type +> Parser; + +/** Nonzero type */ +typedef internal::NonzeroStorage< + grb::config::RowIndexType, + grb::config::ColIndexType, + JType +> NonzeroT; + +/** In-memory storage type using tuple */ +typedef grb::utils::Singleton< + std::tuple< + size_t, // n (rows/columns) + size_t, // nz (nonzeros) + size_t, // nsweeps + size_t, // n_replicas + bool, // use_pt + unsigned, // seed + std::vector, // matrix data + std::vector // h vector + > +> Storage; + +namespace test_data { + constexpr size_t n = 16; + constexpr size_t nsweeps = 2; + constexpr size_t n_replicas = 3; + constexpr bool use_pt = true; + constexpr unsigned seed = 8; + + const std::vector< std::pair< std::pair< grb::config::RowIndexType, grb::config::ColIndexType >, JType > > j_matrix_data = { + {{0, 1}, -0.2752300610319546}, + {{1, 0}, -0.2752300610319546}, + {{1, 2}, -0.10636508505639508}, + {{2, 1}, -0.10636508505639508}, + {{2, 3}, 0.3961450048806352}, + {{3, 2}, 0.3961450048806352}, + {{3, 4}, -0.15453838800213293}, + {{3, 5}, 0.4847494372852713}, + {{4, 3}, -0.15453838800213293}, + {{4, 5}, -0.4712679510367046}, + {{5, 3}, 0.4847494372852713}, + {{5, 4}, -0.4712679510367046}, + {{5, 6}, -0.1483152637298799}, + {{6, 5}, -0.1483152637298799}, + {{7, 8}, -0.11904111079614699}, + {{8, 7}, -0.11904111079614699}, + {{9, 10}, -0.18031020353297234}, + {{10, 9}, -0.18031020353297234}, + {{10, 11}, -0.22985425840853468}, + {{11, 10}, -0.22985425840853468}, + {{11, 12}, 0.30105588632639446}, + {{11, 13}, 0.13823880612312134}, + {{12, 11}, 0.30105588632639446}, + {{13, 11}, 0.13823880612312134}, + {{13, 14}, 0.10364447636911123}, + {{14, 13}, 0.10364447636911123}, + {{14, 15}, 0.2955745584289766}, + {{15, 14}, 0.2955745584289766}, + }; + + + const size_t nnz = j_matrix_data.size(); + + const std::vector< JType > h_array_data = { + -0.08910436, 0.58034508, 0.97719304, 0.16792909, + -0.9221754 , -0.10715418, -0.62365497, 0.25411129, + -0.5693644 , -0.69805978, 0.07228861, -0.79922641, + 0.46231686 , 0.87930208 , 0.88663637, -0.25052299, + }; + +} +// --- New, minimal runner configuration and result types --- +struct input { + bool use_default_data = false; + size_t n_replicas = test_data::n_replicas; + size_t nsweeps = test_data::nsweeps; + bool use_pt = test_data::use_pt; + unsigned seed = test_data::seed; + EnergyType reference_energy = 0.0; + bool verify = false; + char filename_Jmatrix [ MAX_FN_SIZE + 1 ]; + char filename_h [ MAX_FN_SIZE + 1 ]; + char filename_ref_solution [ MAX_FN_SIZE + 1 ]; + 
bool direct; + size_t rep = 0; + size_t outer = 1; +}; + +struct output { + int error_code = 0; + // TODO: remove itrations if not applicable + size_t iterations = 10; // total number of iterations performed does not make sense since the code does not have convergence criteria + EnergyType best_energy = std::numeric_limits< EnergyType >::max(); + size_t rep; + grb::utils::TimerResults times; + std::unique_ptr< PinnedVector< JType, internal_backend > > pinnedSolutionVector; + std::unique_ptr< PinnedVector< JType, internal_backend > > pinnedRefSolutionVector; + // other things like eg: best replicas ... +}; + +template< typename Dtype > +void read_matrix_data(const std::string &filename, std::vector &data, bool direct) { + // Implementation for reading matrix data from file + try { + Parser parser( filename, direct ); + assert( parser.m() == parser.n() ); + std::get<0>(Storage::getData()) = parser.n(); + try { + std::get<1>(Storage::getData()) = parser.nz(); + } catch( ... ) { + std::get<1>(Storage::getData()) = parser.entries(); + } + /* Once internal issue #342 is resolved this can be re-enabled + for( + auto it = parser.begin( PARALLEL ); + it != parser.end( PARALLEL ); + ++it + ) { + data.push_back( *it ); + }*/ + for( + auto it = parser.begin( SEQUENTIAL ); + it != parser.end( SEQUENTIAL ); + ++it + ) { + data.push_back( Dtype( *it ) ); +#ifdef DEBUG_SARE + if( spmd<>::pid() == 0 ){ + // print last data element from std::vector data + std::cout << "readmatrix_data: " << data.back().first.first << ", " + << data.back().first.second << ", " << data.back().second << "\n"; + } +#endif + } + } catch( std::exception &e ) { + std::cerr << "I/O program failed: " << e.what() << "\n"; + return; + } +} + +template< typename NonzeroT, typename IType, typename VType > +void read_matrix_data_from_array( + const std::vector, VType > > &array, + std::vector &data +) { + // Implementation for reading matrix data from array + try { + for (const auto &entry : array) { + data.emplace_back( + NonzeroT( entry.first.first, entry.first.second, entry.second ) + ); +#ifdef DEBUG_SARE + if( spmd<>::pid() < 2 ){ + // print last data element from std::vector data + std::cout << "read_matrix_data_from_array: " << data.back().first.first << ", " + << data.back().first.second << ", " << data.back().second << "\n"; + } +#endif + } + std::get<0>(Storage::getData()) = test_data::n; + std::get<1>(Storage::getData()) = data.size(); + } catch (const std::exception &e) { + std::cerr << "Failed to read matrix data from array: " << e.what() << "\n"; + return; + } +} + +template< typename Dtype > +void read_vector_data(const std::string &filename, std::vector &data) { + // Implementation for reading vector data from file + try { + std::ifstream file( filename ); + if( !file.is_open() ) { + std::cerr << "Failed to open vector file: " << filename << "\n"; + return; + } + std::string line; + while( std::getline( file, line ) ) { + if( line.empty() ) continue; // skip empty lines + std::istringstream iss( line ); + Dtype v; + if( !(iss >> v) ) { + throw std::runtime_error( "Failed to parse line in vector file" ); + } + data.push_back( v ); + } + } catch( std::exception &e ) { + std::cerr << "I/O program failed: " << e.what() << "\n"; + return; + } +} + +template< typename Dtype > +void read_vector_data_from_array( + const std::vector &array, std::vector &data +) { + // Implementation for reading vector data from array + try { + for (size_t i = 0; i < array.size(); ++i) { + data.push_back(array[i]); + } + } catch (const 
std::exception &e) { + std::cerr << "Failed to read vector data from array: " << e.what() << "\n"; + return; + } +} + +template< + Backend backend, + grb::Descriptor descr = grb::descriptors::no_operation, + class Ring = Semiring< + grb::operators::add< JType >, grb::operators::mul< JType >, + grb::identities::zero, grb::identities::one + >, + typename Ttmp + > +EnergyType get_energy( + const grb::Matrix< JType, backend >& couplings, + const grb::Vector< JType, backend > &local_fields, + const grb::Vector< IOType,backend > &state, + grb::Vector< Ttmp, backend > &tmp, + const Ring &ring = Ring() + ){ + const size_t n = grb::size( local_fields ); + assert( n == grb::size( state ) ); + assert( n == grb::ncols( couplings ) ); + assert( n == grb::nrows( couplings ) ); + grb::RC rc = grb::SUCCESS; + rc = rc ? rc : grb::resize( tmp, n ); + EnergyType energy = 0.0; + constexpr auto dense_descr = descr | grb::descriptors::dense; + + rc = rc ? rc : grb::set< descr >( tmp, 0.0 ); + rc = rc ? rc : grb::mxv< dense_descr >( tmp, couplings, state, ring ); + rc = rc ? rc : grb::foldl< dense_descr >( tmp, static_cast< JType >( 0.5 ), ring.getMultiplicativeMonoid() ); + rc = rc ? rc : grb::foldl< dense_descr >( tmp, local_fields, ring.getAdditiveMonoid() ); + rc = rc ? rc : grb::dot< dense_descr >( energy, tmp, state, ring ); + assert( rc == grb::SUCCESS ); + + return energy; +} + +void ioProgram( const struct input &data_in, bool &success ) { + + using namespace test_data; + success = false; + + const size_t s = spmd<>::pid(); + (void) s; + assert( s < spmd<>::nprocs() ); + + try { + // Parse and store matrix in singleton class + // Map Storage tuple fields to meaningful names and wire up default data + auto &storage = Storage::getData(); + auto &n = std::get<0>(storage); // n (rows/cols) + auto &nnz = std::get<1>(storage); // nz (nonzeros) + auto &nsweeps_st = std::get<2>(storage); // nsweeps + auto &n_replicas_st = std::get<3>(storage); // n_replicas + auto &use_pt = std::get<4>(storage); // use_pt + auto &seed_st = std::get<5>(storage); // seed + auto &Jdata = std::get<6>(storage); // std::vector + auto &h = std::get<7>(storage); // std::vector + + // Initialize metadata from input (allow CLI to override defaults) + (void) n; + (void) nnz; + nsweeps_st = data_in.nsweeps; + n_replicas_st = data_in.n_replicas; + use_pt = data_in.use_pt; + seed_st = data_in.seed; + + + if ( data_in.use_default_data ) { + // if no file provided, use default data from file_content + read_matrix_data_from_array( test_data::j_matrix_data, Jdata ); + read_vector_data_from_array( test_data::h_array_data, h ); + // other data + } else { + // read from files if provided + read_matrix_data( data_in.filename_Jmatrix, Jdata, data_in.direct ); + if( std::strlen( data_in.filename_h ) > 0 ) { + read_vector_data( data_in.filename_h, h ); + }else{ + h.resize( n ); + std::fill( h.begin(), h.end(), static_cast< JType >( 0 ) ); + } + if( data_in.verify ) { + if( std::strlen(data_in.filename_ref_solution) == 0 ) { + std::cerr << "Reference solution file not provided for verification\n"; + return; + } + } + } + } catch( std::exception &e ) { + std::cerr << "I/O program failed: " << e.what() << "\n"; + return; + } + + success = true; +} + + +void grbProgram( + const struct input &data_in, + struct output &out +) { + std::cout<< "grbProgram: running simulated-annealing RE solver (stub)\n"; + + // get user process ID + const size_t s = spmd<>::pid(); + const size_t nprocs = spmd<>::nprocs(); + (void) nprocs; + + grb::utils::Timer timer; + 
timer.reset(); + + /* --- Problem setup --- */ + const size_t n = std::get<0>(Storage::getData()); + (void) n; + if( s == 0 ){ + std::cout << "problem size n = " << n << "\n"; + } + grb::Vector< JType, internal_backend > h( n ); + + // populate J with test (random) values + grb::RC rc = grb::SUCCESS; + + // load into GraphBLAS + grb::Matrix< JType, internal_backend > J( n, n ); + { + const auto &data = std::get<6>(Storage::getData()); + RC io_rc = buildMatrixUnique( + J, + utils::makeNonzeroIterator< + grb::config::RowIndexType, grb::config::ColIndexType, JType + >( data.cbegin() ), + utils::makeNonzeroIterator< + grb::config::RowIndexType, grb::config::ColIndexType, JType + >( data.cend() ), + SEQUENTIAL + ); + /* Once internal issue #342 is resolved this can be re-enabled + RC io_rc = buildMatrixUnique( + J, + utils::makeNonzeroIterator< + grb::config::RowIndexType, grb::config::ColIndexType, JType + >( data.cbegin() ), + utils::makeNonzeroIterator< + grb::config::RowIndexType, grb::config::ColIndexType, JType + >( data.cend() ), + PARALLEL + );*/ + io_rc = io_rc ? io_rc : wait(); + if( io_rc != SUCCESS ) { + std::cerr << "Failure: call to buildMatrixUnique did not succeed " + << "(" << toString( io_rc ) << ")." << std::endl; + out.error_code = 5; + return; + } + +#ifdef DEBUG_SARE + if( s == 0 && grb::ncols( J ) < 40 ) { + std::cout << "Matrix J:\n"; + print_matrix( J ); + } +#endif + } + + // build vector h with data from singleton + { + const auto &h_data = std::get<7>(Storage::getData()); + rc = rc ? rc : buildVector( + h, + h_data.cbegin(), + h_data.cend(), + SEQUENTIAL + ); + } + + // seed RNGs (C and C++ engines) using requested seed (hardcoded default 8 if not provided) + std::minstd_rand rng ( data_in.seed + s ); // rng or std::mt19937 + + // create states storage and initialize with random 1/0 values + const size_t n_replicas = data_in.n_replicas; + std::vector< grb::Vector< IOType, internal_backend > > states0; + std::vector< grb::Vector< IOType, internal_backend > > states; + std::vector< IOType > rand_data (n); + for ( size_t r = 0; r < n_replicas; ++r ) { + // initialize with random values + std::uniform_int_distribution< IOType > randint(0,1); + // we use buildvectorUnique with a random set of indices + for ( size_t i = 0; i < n; ++i ) { + rand_data[i] = randint( rng ); + } + + states.emplace_back( n ); + states0.emplace_back( n ); + rc = rc ? rc : grb::buildVector( + states0.back(), + rand_data.cbegin(), + rand_data.cend(), + SEQUENTIAL + ); + rc = rc ? rc : grb::set( states.back(), states0.back() ); + } + + // also make betas vector of size n_replicas and initialize with a geometric gradient + grb::Vector< JType, internal_backend > betas( n_replicas ); + grb::Vector< EnergyType, internal_backend > energies( n_replicas ); + grb::Vector< EnergyType, internal_backend > energies0( n_replicas ); + grb::Vector< EnergyType, internal_backend > tmp_energy( n ); + for ( size_t r = 0; rc == grb::SUCCESS && r < n_replicas; ++r ) { + rc = rc ? rc : grb::setElement( betas, static_cast< JType >( ( 1.0 ) * std::pow< JType >( 1.5, ( n_replicas * s + r ) ) ), r ); + rc = rc ? rc : grb::setElement( energies0, get_energy( J, h, states[r], tmp_energy ), r ); + } + rc = rc ? 
rc : grb::set( energies, energies0 ); + + #ifdef DEBUG_SARE + if( s == 0 ) { + for ( size_t r = 0; r < n_replicas; ++r ) { + std::cout << "Process " << s << ": "; + std::cout << "Initial state replica " << r << ":\n"; + print_vector( states[r], 30 ,"states values" ); + std::cout << "With energy " << energies0[r] << "\n"; + std::cout << std::endl; + } + } + #endif + grb::Vector< IOType, internal_backend > best_state ( n ); + rc = rc ? rc : wait(); + + out.rep = data_in.rep; + // time a single call + if( out.rep == 0 ) { + timer.reset(); + rc = grb::algorithms::simulated_annealing_RE_Ising( + J, h, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.reference_energy, data_in.use_pt, data_in.seed + ); + + rc = rc ? rc : wait(); + double single_time = timer.time(); + if( !(rc == SUCCESS || rc == FAILED) ) { + std::cerr << "Failure: call to Simulated Annealing RE did not succeed (" + << toString( rc ) << ")." << std::endl; + out.error_code = 20; + } + if( rc == FAILED ) { + std::cout << "Warning: call to Simulated Annealing RE did not converge\n"; + } + if( rc == SUCCESS ) { + rc = collectives<>::reduce( single_time, 0, operators::max< double >() ); + + } + if( rc != SUCCESS ) { + out.error_code = 25; + } + out.times.useful = single_time; + out.rep = static_cast< size_t >( 1000.0 / single_time ) + 1; + if( rc == SUCCESS || rc == FAILED ) { + if( s == 0 ) { + if( rc == FAILED ) { + std::cout << "Info: cold Simulated Annealing RE did not converge within "; + } else { + std::cout << "Info: cold Simulated Annealing RE completed within "; + } + std::cout << out.iterations << " iterations. " + << "Time taken was " << single_time << " ms. " + << "Deduced inner repetitions parameter of " << out.rep << " " + << "to take 1 second or more per inner benchmark.\n"; + } + } + } else { + for( size_t i = 0; i < 2 ; ++i ){ + for ( size_t r = 0; r < n_replicas; ++r ) { + rc = rc ? rc : grb::set(states[r], states0[r]); + } + rc = rc ? rc : grb::set( energies, energies0 ); + + rc = grb::algorithms::simulated_annealing_RE_Ising( + J, h, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.reference_energy, data_in.use_pt, data_in.seed + ); + } + // do benchmark + double min_time = 1e9; + double max_time = 0; + double total_time = 0; + for( size_t i = 0; i < out.rep && rc == SUCCESS; ++i ) { + for ( size_t r = 0; r < n_replicas; ++r ) { + rc = rc ? rc : grb::set(states[r], states0[r]); + } + out.best_energy = std::numeric_limits< EnergyType >::max(); + rc = rc ? rc : grb::set( energies, energies0 ); + timer.reset(); + if( rc == SUCCESS ) { + out.iterations = data_in.nsweeps; + + rc = grb::algorithms::simulated_annealing_RE_Ising( + J, h, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.reference_energy, data_in.use_pt, data_in.seed + i + ); + grb::collectives<>::allreduce( out.best_energy, grb::operators::min< EnergyType >() ); + } + const double time_taken = timer.time(); + min_time = std::min(min_time, time_taken); + max_time = std::max(max_time, time_taken); + total_time += time_taken; + if(s == 0){ + std::cerr << n_replicas << "," << data_in.nsweeps << "," << time_taken << "," << out.best_energy << std::endl; + } + } + + out.times.useful = total_time / static_cast< double >( out.rep ); + // print timing at root process + if( s == 0 ) { + std::cout << "Average Time taken for " << out.rep << " " + << "Simulated Annealing RE calls (hot start): " << out.times.useful << ". 
" + << "Error code is " << grb::toString( rc ) << std::endl; + std::cout << "\tnumber of IM-SB iterations: " << out.rep << "\n"; std::cout << "\tmilliseconds per iteration: " + << ( out.times.useful / static_cast< double >( out.iterations ) ) << "\n";; + std::cout << "\tMin Time: " << min_time << "\n"; + std::cout << "\tMax Time: " << max_time << "\n"; + } + sleep( 1 ); + } + + // start postamble + timer.reset(); + + // set error code + if( rc == FAILED ) { + out.error_code = 30; + } else if( rc != SUCCESS ) { + std::cerr << "Benchmark run returned error: " << toString( rc ) << "\n"; + out.error_code = 35; + return; + } +} + + +// --- Simple help / CLI parser for the new runner (no backward compatibility) --- +void printhelp( char *progname ) { + std::cout << "Usage: " << progname << " [--use-default-data] [--j-matrix-fname STR] [--h-fname STR]\n" + << " [--n-replicas INT] [--nsweeps INT] [--seed INT]\n" + << " [--rep INT] [--goal INT] [--verify] [--ref-solution-fname STR] [--help]\n\n" + << "Options:\n" + << " --use-default-data Use embedded default test data\n" + << " --j-matrix-fname STR Path to J matrix file (matrix-market or supported)\n" + << " --h-fname STR Path to h (local fields) vector (whitespace separated), if not provided assume zero\n" + << " --n-replicas INT Number of replicas (default: 3)\n" + << " --nsweeps INT Number of sweeps (default: 2)\n" + << " --use-pt BOOL Use Parallel Tampering (default: 1)\n" + << " --seed INT RNG seed (default: 8)\n" + << " --rep INT Number of times to repeat the run of the algorithm (default: 1)\n" + << " --goal FLOAT The value of the energy to achieve before stopping (default: 0, no such check).\n" + << " --verify Verify output against reference solution\n" + << " --ref-solution-fname STR Reference solution file (required with --verify unless using default data)\n" + << " --help, -h Print this help message\n"; +} + +bool parse_arguments( input &in, int argc, char ** argv ) { + std::fill( in.filename_Jmatrix, in.filename_Jmatrix + MAX_FN_SIZE, '\0' ); + std::fill( in.filename_h, in.filename_h + MAX_FN_SIZE, '\0' ); + std::fill( in.filename_ref_solution, in.filename_ref_solution + MAX_FN_SIZE, '\0' ); + in.direct = true; + // map benchmarking configuration to the runner's fields + in.rep = grb::config::BENCHMARKING::inner(); + in.outer = grb::config::BENCHMARKING::outer(); + in.reference_energy = static_cast( 0.0 ); + // keep verify default (false) unless overridden via CLI + in.verify = false; + + for ( int i = 1; i < argc; ++i ) { + std::string a = argv[i]; + if ( a == "--use-default-data" ) { + in.use_default_data = true; + } else if ( a == "--j-matrix-fname" ) { + if ( i+1 >= argc ) { std::cerr << "--j-matrix-fname requires an argument\n"; return false; } + std::strncpy( in.filename_Jmatrix, argv[++i], MAX_FN_SIZE ); + } else if ( a == "--h-fname" ) { + if ( i+1 >= argc ) { std::cerr << "--h-fname requires an argument\n"; return false; } + std::strncpy( in.filename_h, argv[++i], MAX_FN_SIZE ); + } else if ( a == "--n-replicas" ) { + if ( i+1 >= argc ) { std::cerr << "--n-replicas requires an argument\n"; return false; } + in.n_replicas = static_cast( std::stoul(argv[++i]) ); + } else if ( a == "--nsweeps" ) { + if ( i+1 >= argc ) { std::cerr << "--nsweeps requires an argument\n"; return false; } + in.nsweeps = static_cast( std::stoul(argv[++i]) ); + } else if ( a == "--use-pt" ) { + if ( i+1 >= argc ) { std::cerr << "--use-pt requires an argument\n"; return false; } + in.use_pt = static_cast( std::stoul(argv[++i]) ); + } else if ( a == 
"--seed" ) { + if ( i+1 >= argc ) { std::cerr << "--seed requires an argument\n"; return false; } + in.seed = static_cast( std::stoul(argv[++i]) ); + } else if ( a == "--rep" ) { + if ( i+1 >= argc ) { std::cerr << "--rep requires an argument\n"; return false; } + in.rep = static_cast( std::stoul(argv[++i]) ); + } else if ( a == "--goal" ) { + if ( i+1 >= argc ) { std::cerr << "--goal requires an argument\n"; return false; } + in.reference_energy = std::stof(argv[++i]); + } else if ( a == "--verify" ) { + in.verify = true; + } else if ( a == "--ref-solution-fname" ) { + if ( i+1 >= argc ) { std::cerr << "--ref-solution-fname requires an argument\n"; return false; } + std::strncpy( in.filename_ref_solution, argv[++i], MAX_FN_SIZE ); + } else if ( a == "--help" || a == "-h" ) { + printhelp( argv[0] ); + return false; + } else { + std::cerr << "Unknown argument: " << a << "\n"; + return false; + } + } + + // basic validation + if ( !in.use_default_data ) { + if ( std::strlen( in.filename_Jmatrix ) == 0 ) { + std::cerr << "Either --use-default-data or --j-matrix-fname must be provided\n"; + return false; + } + } + if ( in.verify && !in.use_default_data + && std::strlen( in.filename_ref_solution ) == 0 ) { + std::cerr << "--ref-solution-fname required when --verify is used without --use-default-data\n"; + return false; + } + return true; +} + +// --- Minimal main that uses the existing ioProgram / grbProgram entrypoints --- +int main( int argc, char ** argv ) { + std::cout << "simulated_anealing_re runner\n"; + input in; + output out; + + // init MPI + if( MPI_Init( &argc, &argv ) != MPI_SUCCESS ) { + std::cerr << "MPI_Init returns with non-SUCCESS exit code." << std::endl; + return 10; + } + + if ( !parse_arguments( in, argc, argv ) ) { + printhelp( argv[0] ); + return 1; + } + + + std::cout << "seed=" << in.seed << " n_replicas=" << in.n_replicas << " nsweeps=" << in.nsweeps << " sweep=ising_sweep_spmd" << "\n"; + + // Run IO program (populates Storage or similar) + { + bool success = false; + grb::Launcher< FROM_MPI > launcher( MPI_COMM_WORLD ); + grb::RC rc = launcher.exec( &ioProgram, in, success, true ); + if ( rc != SUCCESS ) { + std::cerr << "I/O launcher failed: " << toString(rc) << "\n"; + return 2; + } + if ( !success ) { + std::cerr << "I/O program reported failure\n"; + return 3; + } + } + + // Run main GraphBLAS program that builds data and calls reSA stub + { + grb::Launcher< FROM_MPI > launcher( MPI_COMM_WORLD ); + grb::RC rc = launcher.exec( &grbProgram, in, out, true ); + if ( rc != SUCCESS ) { + std::cerr << "grbProgram launcher failed: " << toString(rc) << "\n"; + return 4; + } + } + + int s; + if( MPI_Comm_rank(MPI_COMM_WORLD, &s) != MPI_SUCCESS ) { + std::cerr << "MPI_Comm_rank returns with non-SUCCESS exit code." << std::endl; + return 51; + } + if( s == 0 ){ + std::cout << "Finished: error_code=" << out.error_code << " iterations=" << out.rep << " best_energy=" << out.best_energy << "\n"; + } + + // finalise MPI + if( MPI_Finalize() != MPI_SUCCESS ) { + std::cerr << "MPI_Finalize returns with non-SUCCESS exit code." << std::endl; + return 50; + } + + return out.error_code; +} diff --git a/tests/smoke/simulated_annealing_re_ising.cpp b/tests/smoke/simulated_annealing_re_ising.cpp new file mode 100644 index 000000000..e3f7794d6 --- /dev/null +++ b/tests/smoke/simulated_annealing_re_ising.cpp @@ -0,0 +1,739 @@ +/* + Minimal scaffold adapted from ising_machine_sb.cpp to drive a replica-exchange + simulated-annealing (RE-SA) solver. 
Algorithmic parts are intentionally left + unimplemented (stubs). This file mirrors the existing IO / launcher / + program structure and replaces numpy arrays with grb::Vector and lists of + numpy vectors with std::vector< grb::Vector<...> >. Sparse matrices are + represented as grb::Matrix< JType >. + + Purpose: allow running internal tests or an external-run mode while the RE-SA + algorithm is implemented separately. +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace grb; + +// #define DEBUG_SARE 1 +constexpr size_t MAX_FN_SIZE = 255; + +// Types +using IOType = int8_t; // scalar/vector element type +using JType = float; // coupling (matrix) value type +using EnergyType = double; // energy value type + +template< typename T1, typename T2 > +inline bool ISCLOSE( const T1 &a, const T2 &b, const double tol = 1e-4){ + return (std::abs((b)-(a))/std::abs(a) < tol) || (std::abs((b)-(a)) < tol); +} + +/** Parser type */ +typedef grb::utils::MatrixFileReader< + JType, + std::conditional< + (sizeof(grb::config::RowIndexType) > sizeof(grb::config::ColIndexType)), + grb::config::RowIndexType, + grb::config::ColIndexType + >::type +> Parser; + +/** Nonzero type */ +typedef internal::NonzeroStorage< + grb::config::RowIndexType, + grb::config::ColIndexType, + JType +> NonzeroT; + +/** In-memory storage type using tuple */ +typedef grb::utils::Singleton< + std::tuple< + size_t, // n (rows/columns) + size_t, // nz (nonzeros) + size_t, // nsweeps + size_t, // n_replicas + bool, // use_pt + unsigned, // seed + std::vector, // matrix data + std::vector // h vector + > +> Storage; + +namespace test_data { + constexpr size_t n = 16; + constexpr size_t nsweeps = 2; + constexpr size_t n_replicas = 3; + constexpr bool use_pt = true; + constexpr unsigned seed = 8; + + const std::vector< std::pair< std::pair< grb::config::RowIndexType, grb::config::ColIndexType >, JType > > j_matrix_data = { + {{0, 1}, -0.2752300610319546}, + {{1, 0}, -0.2752300610319546}, + {{1, 2}, -0.10636508505639508}, + {{2, 1}, -0.10636508505639508}, + {{2, 3}, 0.3961450048806352}, + {{3, 2}, 0.3961450048806352}, + {{3, 4}, -0.15453838800213293}, + {{3, 5}, 0.4847494372852713}, + {{4, 3}, -0.15453838800213293}, + {{4, 5}, -0.4712679510367046}, + {{5, 3}, 0.4847494372852713}, + {{5, 4}, -0.4712679510367046}, + {{5, 6}, -0.1483152637298799}, + {{6, 5}, -0.1483152637298799}, + {{7, 8}, -0.11904111079614699}, + {{8, 7}, -0.11904111079614699}, + {{9, 10}, -0.18031020353297234}, + {{10, 9}, -0.18031020353297234}, + {{10, 11}, -0.22985425840853468}, + {{11, 10}, -0.22985425840853468}, + {{11, 12}, 0.30105588632639446}, + {{11, 13}, 0.13823880612312134}, + {{12, 11}, 0.30105588632639446}, + {{13, 11}, 0.13823880612312134}, + {{13, 14}, 0.10364447636911123}, + {{14, 13}, 0.10364447636911123}, + {{14, 15}, 0.2955745584289766}, + {{15, 14}, 0.2955745584289766}, + }; + + const size_t nnz = j_matrix_data.size(); + + const std::vector< JType > h_array_data = { + -0.08910436, 0.58034508, 0.97719304, 0.16792909, + -0.9221754 , -0.10715418, -0.62365497, 0.25411129, + -0.5693644 , -0.69805978, 0.07228861, -0.79922641, + 0.46231686 , 0.87930208 , 0.88663637, -0.25052299, + }; +} +// --- New, minimal runner configuration and result types --- +struct input { + size_t n = test_data::n; + size_t n_replicas = test_data::n_replicas; + size_t nsweeps = test_data::nsweeps; + bool 
use_pt = test_data::use_pt; + unsigned seed = test_data::seed; + bool use_default_data = false; + char filename_Jmatrix [ MAX_FN_SIZE + 1 ]; + char filename_h [ MAX_FN_SIZE + 1 ]; + EnergyType reference_energy = 0.0; + bool verify = false; + char filename_ref_solution [ MAX_FN_SIZE + 1 ]; + bool direct; + size_t rep = 0; + size_t outer = 1; +}; + +struct output { + int error_code = 0; + // TODO: remove itrations if not applicable + size_t iterations = 10; // total number of iterations performed does not make sense since the code does not have convergence criteria + EnergyType best_energy = std::numeric_limits< EnergyType >::max(); + size_t rep; + grb::utils::TimerResults times; + std::unique_ptr< PinnedVector< JType > > pinnedSolutionVector; + std::unique_ptr< PinnedVector< JType > > pinnedRefSolutionVector; + // other things like eg: best replicas ... +}; + +template< typename Dtype > +void read_matrix_data(const std::string &filename, std::vector &data, bool direct) { + // Implementation for reading matrix data from file + try { + Parser parser( filename, direct ); + assert( parser.m() == parser.n() ); + std::get<0>(Storage::getData()) = parser.n(); + try { + std::get<1>(Storage::getData()) = parser.nz(); + } catch( ... ) { + std::get<1>(Storage::getData()) = parser.entries(); + } + /* Once internal issue #342 is resolved this can be re-enabled + for( + auto it = parser.begin( PARALLEL ); + it != parser.end( PARALLEL ); + ++it + ) { + data.push_back( *it ); + }*/ + for( + auto it = parser.begin( SEQUENTIAL ); + it != parser.end( SEQUENTIAL ); + ++it + ) { + data.push_back( Dtype( *it ) ); +#ifdef DEBUG_SARE + if( spmd<>::pid() == 0 ){ + // print last data element from std::vector data + std::cout << "readmatrix_data: " << data.back().first.first << ", " + << data.back().first.second << ", " << data.back().second << "\n"; + } +#endif + } + } catch( std::exception &e ) { + std::cerr << "I/O program failed: " << e.what() << "\n"; + return; + } +} + +template< typename NonzeroT, typename IType, typename VType > +void read_matrix_data_from_array( + const std::vector, VType > > &array, + std::vector &data +) { + // Implementation for reading matrix data from array + try { + for (const auto &entry : array) { + data.emplace_back( + NonzeroT( entry.first.first, entry.first.second, entry.second ) + ); +#ifdef DEBUG_SARE + if( spmd<>::pid() < 1 ){ + // print last data element from std::vector data + std::cout << "read_matrix_data_from_array: " << data.back().first.first << ", " + << data.back().first.second << ", " << data.back().second << "\n"; + } +#endif + } + std::get<0>(Storage::getData()) = test_data::n; + std::get<1>(Storage::getData()) = data.size(); + } catch (const std::exception &e) { + std::cerr << "Failed to read matrix data from array: " << e.what() << "\n"; + return; + } +} + +template< typename Dtype > +void read_vector_data(const std::string &filename, std::vector &data) { + // Implementation for reading vector data from file + try { + std::ifstream file( filename ); + if( !file.is_open() ) { + std::cerr << "Failed to open vector file: " << filename << "\n"; + return; + } + std::string line; + while( std::getline( file, line ) ) { + if( line.empty() ) continue; // skip empty lines + std::istringstream iss( line ); + Dtype v; + if( !(iss >> v) ) { + throw std::runtime_error( "Failed to parse line in vector file" ); + } + data.push_back( v ); + } + } catch( std::exception &e ) { + std::cerr << "I/O program failed: " << e.what() << "\n"; + return; + } +} + +template< typename Dtype > 
+void read_vector_data_from_array( + const std::vector &array, std::vector &data +) { + // Implementation for reading vector data from array + try { + for (size_t i = 0; i < array.size(); ++i) { + data.push_back(array[i]); + } + } catch (const std::exception &e) { + std::cerr << "Failed to read vector data from array: " << e.what() << "\n"; + return; + } +} + +template< + Backend backend, + grb::Descriptor descr = grb::descriptors::no_operation, + class Ring = Semiring< + grb::operators::add< JType >, grb::operators::mul< JType >, + grb::identities::zero, grb::identities::one + >, + typename Ttmp + > +EnergyType get_energy( + const grb::Matrix< JType, backend >& couplings, + const grb::Vector< JType, backend > &local_fields, + const grb::Vector< IOType, backend > &state, + grb::Vector< Ttmp, backend > &tmp, + const Ring &ring = Ring() + ){ + const size_t n = grb::size( local_fields ); + assert( n == grb::size( state ) ); + assert( n == grb::ncols( couplings ) ); + assert( n == grb::nrows( couplings ) ); + grb::resize( tmp, n ); + grb::RC rc = grb::SUCCESS; + EnergyType energy = 0.0; + constexpr auto dense_descr = descr | grb::descriptors::dense; + + rc = rc ? rc : grb::set< descr >( tmp, 0.0 ); + rc = rc ? rc : grb::mxv< dense_descr >( tmp, couplings, state, ring ); + rc = rc ? rc : grb::foldl< dense_descr >( tmp, static_cast< JType >( 0.5 ), ring.getMultiplicativeMonoid() ); + rc = rc ? rc : grb::foldl< dense_descr >( tmp, local_fields, ring.getAdditiveMonoid() ); + rc = rc ? rc : grb::dot< dense_descr >( energy, tmp, state, ring ); + assert( rc == grb::SUCCESS ); + + return energy; +} + +void ioProgram( const struct input &data_in, bool &success ) { + success = false; + + // Parse and store matrix in singleton class + // Map Storage tuple fields to meaningful names and wire up default data + auto &storage = Storage::getData(); + auto &n = std::get<0>(storage); // n (rows/cols) + auto &nnz = std::get<1>(storage); // nz (nonzeros) + auto &nsweeps_st = std::get<2>(storage); // nsweeps + auto &n_replicas_st = std::get<3>(storage); // n_replicas + auto &use_pt = std::get<4>(storage); // use_pt + auto &seed_st = std::get<5>(storage); // seed + auto &Jdata = std::get<6>(storage); // std::vector + auto &h = std::get<7>(storage); // std::vector + + try { + // Initialize metadata from input (allow CLI to override defaults) + (void) n; // initialized by read_matrix_* + (void) nnz; // initialized by read_matrix_* + nsweeps_st = data_in.nsweeps; + n_replicas_st = data_in.n_replicas; + use_pt = data_in.use_pt; + seed_st = data_in.seed; + + if ( data_in.use_default_data ) { + // if no file provided, use default data from file_content + read_matrix_data_from_array( test_data::j_matrix_data, Jdata ); + read_vector_data_from_array( test_data::h_array_data, h ); + // other data + } else { + // read from files if provided + read_matrix_data( data_in.filename_Jmatrix, Jdata, data_in.direct ); + if( std::strlen( data_in.filename_h ) > 0 ) { + read_vector_data( data_in.filename_h, h ); + }else{ + h.resize( n ); + std::fill( h.begin(), h.end(), static_cast< JType >( 0 ) ); + } + if(data_in.verify) { + if( std::strlen(data_in.filename_ref_solution) == 0 ) { + std::cerr << "Reference solution file not provided for verification\n"; + return; + } + } + } + } catch( std::exception &e ) { + std::cerr << "I/O program failed: " << e.what() << "\n"; + return; + } + + success = true; +} + +void grbProgram( + const struct input &data_in, + struct output &out +) { + std::cout<< "grbProgram: running simulated-annealing RE 
solver (stub)\n"; + + // get user process ID + const size_t s = spmd<>::pid(); + assert( s < spmd<>::nprocs() ); + + grb::utils::Timer timer; + timer.reset(); + + /* --- Problem setup --- */ + const size_t n = std::get<0>(Storage::getData()); + const size_t n_replicas = std::get<3>(Storage::getData()); + if( s == 0 ){ + std::cout << "problem size n = " << n << "\n"; + } + grb::Vector< JType > h( n ); + + // populate J with test (random) values + grb::RC rc = grb::SUCCESS; + + // load into GraphBLAS + grb::Matrix< JType > J( n, n ); + { + const auto &data = std::get<6>(Storage::getData()); + RC io_rc = buildMatrixUnique( + J, + utils::makeNonzeroIterator< + grb::config::RowIndexType, grb::config::ColIndexType, JType + >( data.cbegin() ), + utils::makeNonzeroIterator< + grb::config::RowIndexType, grb::config::ColIndexType, JType + >( data.cend() ), + SEQUENTIAL + ); + /* Once internal issue #342 is resolved this can be re-enabled + RC io_rc = buildMatrixUnique( + J, + utils::makeNonzeroIterator< + grb::config::RowIndexType, grb::config::ColIndexType, JType + >( data.cbegin() ), + utils::makeNonzeroIterator< + grb::config::RowIndexType, grb::config::ColIndexType, JType + >( data.cend() ), + PARALLEL + );*/ + io_rc = io_rc ? io_rc : wait(); + if( io_rc != SUCCESS ) { + std::cerr << "Failure: call to buildMatrixUnique did not succeed " + << "(" << toString( io_rc ) << ")." << std::endl; + out.error_code = 5; + return; + } + // make J symmetric + // grb::Matrix< JType > Jt ( n, n ); + // Jt = J; + // const grb::Monoid< grb::operators::add< JType >, grb::identities::zero > addMonoid; + // const grb::Monoid< grb::operators::mul< JType >, grb::identities::one > mulMonoid; + // grb::foldl< grb::descriptors::transpose_right >( J, Jt, addMonoid); // issue #210 + // grb::foldl<>( J, static_cast< JType >( 0.5 ), mulMonoid); + +#ifdef DEBUG_SARE + if( s == 0 && grb::ncols( J ) < 40 ) { + std::cout << "Matrix J:\n"; + print_matrix( J ); + } +#endif + } + + // build vector h with data from singleton + { + const auto &h_data = std::get<7>(Storage::getData()); + rc = rc ? rc : buildVector( + h, + h_data.cbegin(), + h_data.cend(), + SEQUENTIAL + ); + } + + // seed RNGs (C and C++ engines) using requested seed (hardcoded default 8 if not provided) + std::minstd_rand rng ( data_in.seed + s ); // rng or std::mt19937 + + // create states storage and initialize with random 1/0 values + std::vector< grb::Vector > states0; + std::vector< grb::Vector > states; + for ( size_t r = 0; r < n_replicas; ++r ) { + states0.emplace_back( grb::Vector(n) ); + states.emplace_back( grb::Vector(n) ); + // initialize with random values + std::uniform_int_distribution< unsigned short > randint(0,1); + // we use buildvectorUnique with a random set of indices + std::vector< IOType > rand_data; + for ( size_t i = 0; i < n; ++i ) { + rand_data.emplace_back( static_cast( + randint( rng ) ) ); + } + rc = rc ? rc : grb::buildVector( + states0.back(), + rand_data.cbegin(), + rand_data.cend(), + SEQUENTIAL + ); + rc = rc ? 
rc : grb::set( states.back(), states0.back() ); + } + + grb::Vector< EnergyType > tmp_energy ( n ); + EnergyType initial_energy = get_energy( J, h, states[0], tmp_energy ); + + for ( size_t r = 0; r < n_replicas; ++r ) { + const auto en = get_energy( J, h, states[r], tmp_energy ); + initial_energy = std::min( en, initial_energy ); + #ifdef DEBUG_SARE + if( s == 0 ) { + std::cout << "Initial state replica " << r << ":\n"; + print_vector( states[r], 30 ,"states values" ); + std::cout << "With energy " << en << "\n"; + std::cout << std::endl; + } + #endif + } + + // also make betas vector os size n_replicas and initialize with 10.0 + grb::Vector< JType > betas( n_replicas ); + grb::Vector< EnergyType > energies( n_replicas ); + for ( size_t r = 0; rc == grb::SUCCESS && r < n_replicas; ++r ) { + rc = rc ? rc : grb::setElement( betas, static_cast< JType >( (10.0) * std::pow( 2, r ) ), r ); + // rc = rc ? rc : grb::setElement( energies, get_energy( J, h, states[r], tmp_energy ), r ); + } + assert( rc == grb::SUCCESS ); + + grb::Vector< IOType > best_state ( n ); + + out.rep = data_in.rep; + // time a single call + if( out.rep == 0 ) { + timer.reset(); + rc = grb::algorithms::simulated_annealing_RE_Ising( + J, h, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.reference_energy, data_in.use_pt, data_in.seed + ); + + rc = rc ? rc : wait(); + double single_time = timer.time(); + if( !(rc == SUCCESS || rc == FAILED) ) { + std::cerr << "Failure: call to Simulated Annealing RE did not succeed (" + << toString( rc ) << ")." << std::endl; + out.error_code = 20; + } + if( rc == FAILED ) { + std::cout << "Warning: call to Simulated Annealing RE did not converge\n"; + } + if( rc == SUCCESS ) { + rc = collectives<>::reduce( single_time, 0, operators::max< double >() ); + } + if( rc != SUCCESS ) { + out.error_code = 25; + } + out.times.useful = single_time; + out.rep = static_cast< size_t >( 1000.0 / single_time ) + 1; + if( rc == SUCCESS || rc == FAILED ) { + if( s == 0 ) { + if( rc == FAILED ) { + std::cout << "Info: cold Simulated Annealing RE did not converge within "; + } else { + std::cout << "Info: cold Simulated Annealing RE completed within "; + } + std::cout << out.iterations << " iterations. " + << "Time taken was " << single_time << " ms. " + << "Deduced inner repetitions parameter of " << out.rep << " " + << "to take 1 second or more per inner benchmark.\n"; + } + } + } else { + for( size_t i = 0; i < 2 ; ++i ){ + for ( size_t r = 0; r < n_replicas; ++r ) { + rc = rc ? rc : grb::set(states[r], states0[r]); + } + out.best_energy = std::numeric_limits< EnergyType >::max(); + rc = rc ? rc : grb::clear( energies ); + + rc = grb::algorithms::simulated_annealing_RE_Ising( + J, h, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.reference_energy, data_in.use_pt, data_in.seed + i + ); + + assert( ISCLOSE( get_energy( J, h, best_state, tmp_energy ), out.best_energy) ); + } + // do benchmark + double min_time = 1e9; + double max_time = 0; + double total_time = 0; + for( size_t i = 0; i < out.rep && rc == SUCCESS; ++i ) { + for ( size_t r = 0; r < n_replicas; ++r ) { + rc = rc ? rc : grb::set(states[r], states0[r]); + } + out.best_energy = std::numeric_limits< EnergyType >::max(); + rc = rc ? 
rc : grb::clear( energies ); + timer.reset(); + if( rc == SUCCESS ) { + out.iterations = data_in.nsweeps; + + rc = grb::algorithms::simulated_annealing_RE_Ising( + J, h, states, energies, betas, best_state, out.best_energy, data_in.nsweeps, data_in.reference_energy, data_in.use_pt, data_in.seed + i + ); + } + if( grb::Properties<>::isNonblockingExecution ) { + rc = rc ? rc : wait(); + } + const double time_taken = timer.time(); + + assert( ISCLOSE( get_energy( J, h, best_state, tmp_energy ), out.best_energy) ); + min_time = std::min(min_time, time_taken); + max_time = std::max(max_time, time_taken); + total_time += time_taken; + std::cerr << n_replicas << "," << data_in.nsweeps << "," << time_taken << "," << out.best_energy << std::endl; + } + + out.times.useful = total_time / static_cast< double >( out.rep ); + // print timing at root process + if( s == 0 ) { + std::cout << "Average time taken for " << out.rep << " " + << "Simulated Annealing RE calls (hot start): " << out.times.useful << " ms. " + << "Error code is " << grb::toString( rc ) << std::endl; + std::cout << "\tnumber of SA-RE calls: " << out.rep << "\n"; + std::cout << "\tmilliseconds per iteration: " + << ( out.times.useful / static_cast< double >( out.iterations ) ) << "\n"; + std::cout << "\tMin Time: " << min_time << "\n"; + std::cout << "\tMax Time: " << max_time << "\n"; + + if( data_in.verify ){ + if( out.best_energy < initial_energy ){ + std::cout << "Test OK" << std::endl; + }else{ + std::cout << "Test FAILED" << std::endl; + } + } + } + sleep( 1 ); + } + + // start postamble + timer.reset(); + + // set error code + if( rc == FAILED ) { + out.error_code = 30; + } else if( rc != SUCCESS ) { + std::cerr << "Benchmark run returned error: " << toString( rc ) << "\n"; + out.error_code = 35; + return; + } +} + + +// --- Simple help / CLI parser for the new runner (no backward compatibility) --- +void printhelp( char *progname ) { + std::cout << "Usage: " << progname << " [--use-default-data] [--j-matrix-fname STR] [--h-fname STR]\n" + << " [--n-replicas INT] [--nsweeps INT] [--use-pt BOOL] [--seed INT]\n" + << " [--rep INT] [--goal FLOAT] [--verify] [--ref-solution-fname STR] [--help]\n\n" + << "Options:\n" + << " --use-default-data Use embedded default test data\n" + << " --j-matrix-fname STR Path to J matrix file (matrix-market or supported)\n" + << " --h-fname STR Path to h (local fields) vector (whitespace separated), if not provided assume zero\n" + << " --n-replicas INT Number of replicas (default: 3)\n" + << " --nsweeps INT Number of sweeps (default: 2)\n" + << " --use-pt BOOL Use Parallel Tempering (default: 1)\n" + << " --seed INT RNG seed (default: 8)\n" + << " --rep INT Number of times to repeat the run of the algorithm (default: 1)\n" + << " --goal FLOAT The value of the energy to achieve before stopping (default: 0, no such check).\n" + << " --verify Verify output against reference solution\n" + << " --ref-solution-fname STR Reference solution file (required with --verify unless using default data)\n" + << " --help, -h Print this help message\n"; +} + +bool parse_arguments( input &in, int argc, char ** argv ) { + std::fill( in.filename_Jmatrix, in.filename_Jmatrix + MAX_FN_SIZE, '\0' ); + std::fill( in.filename_h, in.filename_h + MAX_FN_SIZE, '\0' ); + std::fill( in.filename_ref_solution, in.filename_ref_solution + MAX_FN_SIZE, '\0' ); + in.direct = true; + // map benchmarking configuration to the runner's fields + in.rep = grb::config::BENCHMARKING::inner(); + in.outer = grb::config::BENCHMARKING::outer(); +
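+ // Note (clarifying comment): grbProgram interprets rep == 0 as a request to auto-calibrate, + // i.e. it times one cold call of the solver and derives the inner repetition count from that + // timing (see the out.rep == 0 branch above), so --rep 0 lets the benchmark size itself.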
in.reference_energy = static_cast( 0.0 ); + // keep verify default (false) unless overridden via CLI + in.verify = false; + + for ( int i = 1; i < argc; ++i ) { + std::string a = argv[i]; + if ( a == "--use-default-data" ) { + in.use_default_data = true; + } else if ( a == "--j-matrix-fname" ) { + if ( i+1 >= argc ) { std::cerr << "--j-matrix-fname requires an argument\n"; return false; } + std::strncpy( in.filename_Jmatrix, argv[++i], MAX_FN_SIZE ); + } else if ( a == "--h-fname" ) { + if ( i+1 >= argc ) { std::cerr << "--h-fname requires an argument\n"; return false; } + std::strncpy( in.filename_h, argv[++i], MAX_FN_SIZE ); + } else if ( a == "--n-replicas" ) { + if ( i+1 >= argc ) { std::cerr << "--n-replicas requires an argument\n"; return false; } + in.n_replicas = static_cast( std::stoul(argv[++i]) ); + } else if ( a == "--nsweeps" ) { + if ( i+1 >= argc ) { std::cerr << "--nsweeps requires an argument\n"; return false; } + in.nsweeps = static_cast( std::stoul(argv[++i]) ); + } else if ( a == "--use-pt" ) { + if ( i+1 >= argc ) { std::cerr << "--use-pt requires an argument\n"; return false; } + in.use_pt = static_cast( std::stoul(argv[++i]) ); + } else if ( a == "--rep" ) { + if ( i+1 >= argc ) { std::cerr << "--rep requires an argument\n"; return false; } + in.rep = static_cast( std::stoul(argv[++i]) ); + } else if ( a == "--seed" ) { + if ( i+1 >= argc ) { std::cerr << "--seed requires an argument\n"; return false; } + in.seed = static_cast( std::stoul(argv[++i]) ); + } else if ( a == "--goal" ) { + if ( i+1 >= argc ) { std::cerr << "--goal requires an argument\n"; return false; } + in.reference_energy = std::stof(argv[++i]); + } else if ( a == "--verify" ) { + in.verify = true; + } else if ( a == "--ref-solution-fname" ) { + if ( i+1 >= argc ) { std::cerr << "--ref-solution-fname requires an argument\n"; return false; } + std::strncpy( in.filename_ref_solution, argv[++i], MAX_FN_SIZE ); + } else if ( a == "--help" || a == "-h" ) { + printhelp( argv[0] ); + return false; + } else { + std::cerr << "Unknown argument: " << a << "\n"; + return false; + } + } + + // basic validation + if ( !in.use_default_data ) { + if ( std::strlen( in.filename_Jmatrix ) == 0 ) { + std::cerr << "Either --use-default-data or --j-matrix-fname must be provided\n"; + return false; + } + } + if ( in.verify && !in.use_default_data + && std::strlen( in.filename_ref_solution ) == 0 ) { + std::cerr << "--ref-solution-fname required when --verify is used without --use-default-data\n"; + return false; + } + return true; +} + +// --- Minimal main that uses the existing ioProgram / grbProgram entrypoints --- +int main( int argc, char ** argv ) { + std::cout << "simulated_annealing_re runner\n"; + input in; + output out; + + if ( !parse_arguments( in, argc, argv ) ) { + printhelp( argv[0] ); + return 1; + } + + + std::cout << "seed=" << in.seed << " n_replicas=" << in.n_replicas << " nsweeps=" << in.nsweeps << " sweep=ising_sweep" << "\n"; + + // Run IO program (populates Storage or similar) + { + bool success = false; + grb::Launcher< AUTOMATIC > launcher; + grb::RC rc = launcher.exec( &ioProgram, in, success, true ); + if ( rc != SUCCESS ) { + std::cerr << "I/O launcher failed: " << toString(rc) << "\n"; + return 2; + } + if ( !success ) { + std::cerr << "I/O program reported failure\n"; + return 3; + } + } + + // Run main GraphBLAS program that builds data and calls reSA stub + { + grb::Launcher< AUTOMATIC > launcher; + grb::RC rc = launcher.exec( &grbProgram, in, out, true ); + if ( rc != SUCCESS ) { +
std::cerr << "grbProgram launcher failed: " << toString(rc) << "\n"; + return 4; + } + } + + std::cout << "Finished: error_code=" << out.error_code << " iterations=" << out.rep << " best_energy=" << out.best_energy << "\n"; + return out.error_code; +} diff --git a/tests/smoke/simulated_annealing_re_planted_sol.cpp b/tests/smoke/simulated_annealing_re_planted_sol.cpp new file mode 100644 index 000000000..f6210806f --- /dev/null +++ b/tests/smoke/simulated_annealing_re_planted_sol.cpp @@ -0,0 +1,249 @@ +#include +#include +#include +#include +#include +#include + +#include +#include + +using QType = float; +using StateType = int8_t; +using EnergyType = double; + +template< grb::Backend backend > +void generate_sparse_planted_qubo( + int n, + int degree, + std::pair< QType, QType > weight_range, + grb::Vector< QType, backend > &Q_diag, + grb::Matrix< QType, backend > &Q_off, + grb::Vector< StateType, backend > &x_star, + double &E_star, + unsigned int seed = 0 +) { + std::minstd_rand rng( seed ); + std::uniform_int_distribution< StateType > int_dist(0, 1); + std::uniform_real_distribution< QType > weight_dist(weight_range.first, weight_range.second); + + std::vector< StateType > x ( n ); + for( auto &y : x ){ + y = int_dist( rng ); + } + grb::resize( x_star, n ); + grb::buildVector( x_star, x.begin(), x.end(), grb::SEQUENTIAL ); + + grb::clear( Q_diag ); + grb::clear( Q_off ); + E_star = 0.0; + + std::map< std::pair, QType > Q; + std::vector< QType > Qdiag ( n, 0 ); + + for (size_t i = 0; i < n; ++i) { + std::vector< size_t > neighbors; + for (size_t j = 0; j < n; ++j) { + if (j != i) neighbors.push_back(j); + } + std::shuffle( neighbors.begin(), neighbors.end(), rng ); + neighbors.resize( degree ); + + for (const auto&j : neighbors) { + if (j < i) continue; + + const double w = weight_dist( rng ); + const int b = x_star[i] ^ x_star[j]; + + if (b == 0) { + Qdiag[i] += w; + Qdiag[j] += w; + Q[{i, j}] -= 2*w; + Q[{j, i}] -= 2*w; + } else { + Qdiag[i] -= w; + Qdiag[j] -= w; + Q[{i, j}] += 2*w; + Q[{j, i}] += 2*w; + E_star += w; + } + } + } + std::vector< size_t > i, j; + std::vector< QType > v; + for(const auto &x : Q ){ + i.push_back( x.first.first ); + j.push_back( x.first.second ); + v.push_back( x.second ); + } + + grb::buildVector( Q_diag, Qdiag.begin(), Qdiag.end(), grb::SEQUENTIAL ); + grb::buildMatrixUnique( Q_off, + i.begin(), i.end(), + j.begin(), j.end(), + v.begin(), v.end(), + grb::SEQUENTIAL ); +} + +template< + grb::Backend backend, + grb::Descriptor descr = grb::descriptors::no_operation, + class Ring = grb::Semiring< + grb::operators::add< QType >, grb::operators::mul< QType >, + grb::identities::zero, grb::identities::one + >, + typename Ttmp + > +EnergyType get_energy( + const grb::Matrix< QType, backend >& couplings, + const grb::Vector< QType, backend > &local_fields, + const grb::Vector< StateType,backend > &state, + grb::Vector< Ttmp, backend > &tmp, + const Ring &ring = Ring() + ){ + const size_t n = grb::size( local_fields ); + assert( n == grb::size( state ) ); + assert( n == grb::ncols( couplings ) ); + assert( n == grb::nrows( couplings ) ); + grb::RC rc = grb::SUCCESS; + rc = rc ? rc : grb::resize( tmp, n ); + EnergyType energy = 0.0; + constexpr auto dense_descr = descr | grb::descriptors::dense; + + rc = rc ? rc : grb::set< descr >( tmp, 0.0 ); + rc = rc ? rc : grb::mxv< dense_descr >( tmp, couplings, state, ring ); + rc = rc ? rc : grb::foldl< dense_descr >( tmp, static_cast< QType >( 0.5 ), ring.getMultiplicativeMonoid() ); + rc = rc ? 
rc : grb::foldl< dense_descr >( tmp, local_fields, ring.getAdditiveMonoid() ); + rc = rc ? rc : grb::dot< dense_descr >( energy, tmp, state, ring ); + assert( rc == grb::SUCCESS ); + + return energy; +} + +template< grb::Backend backend > +bool brute_force_check( + const grb::Vector< QType, backend > &Q_diag, + const grb::Matrix< QType, backend > &Q_off, + const grb::Vector< StateType, backend > &x_star, + double E_star +) { + const int n = grb::size( x_star ); + EnergyType min_energy = 1e7; + std::vector< grb::Vector< StateType > > argmins; + assert( n < 8 * sizeof( int64_t ) ); + + grb::Vector< double, backend > tmp ( n ); + grb::Vector< StateType, backend > x ( n ); + for (int64_t bits = 0; bits < (1 << n); ++bits) { + for (int i = 0; i < n; ++i) { + grb::setElement( x, (bits >> i) & 1, i); + } + double E = get_energy( Q_off, Q_diag, x, tmp ); + + if (E < min_energy - 1e-9) { + min_energy = E; + argmins = {x}; + } else if (abs(E - min_energy) < 1e-9) { + argmins.push_back(x); + } + } + + std::cout << "Planted energy : " << -E_star << std::endl; + std::cout << "Minimum found : " << min_energy << std::endl; + std::cout << "# ground states : " << argmins.size() << std::endl; + + bool planted_ok = false; + for (const auto &x : argmins) { + if ( std::equal(x.begin(), x.end(), x_star.begin()) ) { + planted_ok = true; + break; + } + } + + std::cout << std::boolalpha; + std::cout << "planted_is_optimal: " << planted_ok << std::endl; + std::cout << "energy_matches: " << (std::abs(min_energy + E_star) < 1e-9) << std::endl; + std::cout << "degeneracy " << argmins.size() << std::endl; + return planted_ok; +} + +void grbProgram( const size_t&n, grb::RC &rc ) { + rc = grb::SUCCESS; + const int degree = 4; + const std::pair< QType, QType > weight_range = {1.0, 1.0}; + const unsigned int seed = 1; + + grb::Vector< QType > Q_diag ( n ); + grb::Matrix< QType > Q_off ( n, n ); + grb::Vector< StateType > x_star ( n ); + double E_star = 0.0; + + generate_sparse_planted_qubo( n, degree, weight_range, Q_diag, Q_off, x_star, E_star, seed ); + + const bool optimal = brute_force_check(Q_diag, Q_off, x_star, E_star); + // assert( optimal ); + + std::cout << "------------------ Test with SA-RE ----------------------" << std::endl; + grb::Vector< StateType > best_state ( n ); + EnergyType best_energy = 42; + constexpr bool use_pt = true; + constexpr EnergyType reference_energy = 0; + constexpr size_t nsweeps = 100; + constexpr size_t n_replicas = 16; + const size_t s = grb::spmd<>::pid(); + + std::minstd_rand rng ( seed + s ); // rng or std::mt19937 + + // create states storage and initialize with random 1/0 values + std::vector< grb::Vector< StateType > > states; + for ( size_t r = 0; r < n_replicas; ++r ) { + std::uniform_int_distribution< StateType > randint(0,1); + std::vector< StateType > rand_data; + for ( size_t i = 0; i < n; ++i ) { + rand_data.emplace_back( static_cast< StateType >( + randint( rng ) ) ); + } + states.emplace_back( n ); + rc = rc ? rc : grb::buildVector( + states.back(), + rand_data.cbegin(), + rand_data.cend(), + grb::SEQUENTIAL + ); + } + + // also make betas vector of size n_replicas and initialize with 10.0 + grb::Vector< QType > betas( n_replicas ); + grb::Vector< EnergyType > energies( n_replicas ); + grb::Vector< EnergyType > tmp_energy( n ); + for ( size_t r = 0; rc == grb::SUCCESS && r < n_replicas; ++r ) { + rc = rc ? rc : grb::setElement( betas, static_cast< QType >( (10.0) * std::pow( 2, r ) ), r ); + rc = rc ? 
rc : grb::setElement( energies, get_energy( Q_off, Q_diag, states[r], tmp_energy ), r ); + } + assert( rc == grb::SUCCESS ); + + rc = grb::algorithms::simulated_annealing_RE_Ising( + Q_off, Q_diag, states, energies, betas, best_state, best_energy, nsweeps, reference_energy, use_pt, seed + ); + assert( get_energy( Q_off, Q_diag, best_state, tmp_energy ) == best_energy ); + std::cout << "Optimized SA-RE value: " << best_energy << std::endl; + + if( best_energy != -E_star ){ + rc = grb::FAILED; + } +} + +int main() { + const size_t in = 18; + grb::RC out; + + grb::Launcher< grb::AUTOMATIC > launcher; + grb::RC rc = launcher.exec( &grbProgram, in, out, true ); + if ( rc != grb::SUCCESS ) { + std::cerr << "grbProgram launcher failed: " << toString(rc) << "\n"; + return 4; + } + std::cout << "Test " << (( out == grb::SUCCESS )? "OK" : "FAILED") << std::endl; + return 0; + +} diff --git a/tests/smoke/smoketests.sh b/tests/smoke/smoketests.sh index ce6c3ce6b..3e08561da 100755 --- a/tests/smoke/smoketests.sh +++ b/tests/smoke/smoketests.sh @@ -436,6 +436,19 @@ for BACKEND in ${BACKENDS[@]}; do grep "Test OK" ${TEST_OUT_DIR}/fuselets.log || echo "Test FAILED" echo " " fi + + if [ "$BACKEND" = "reference_omp" ] || [ "$BACKEND" = "reference" ] || [ "$BACKEND" = "hyperdags" ] || [ "$BACKEND" = "nonblocking" ]; then + echo ">>> [x] [ ] Testing Simulated Annealing-Replica Exchange on a" + echo " small 16x16 matrix." + echo "Functional test executable: ${TEST_BIN_DIR}/simulated_annealing_re_ising_${BACKEND}" + $runner ${TEST_BIN_DIR}/simulated_annealing_re_ising_${BACKEND} --use-default-data --verify &> ${TEST_OUT_DIR}/simulated_annealing_re_ising_${BACKEND}_${P}_${T}.log + ( grep "Test OK" ${TEST_OUT_DIR}/simulated_annealing_re_ising_${BACKEND}_${P}_${T}.log ) || printf 'Test FAILED.\n' + echo ">>> [x] [ ] Testing Simulated Annealing-Replica Exchange on an" + echo " 18x18 matrix with constructed optimal solution." + echo "Functional test executable: ${TEST_BIN_DIR}/simulated_annealing_re_planted_sol_${BACKEND}" + $runner ${TEST_BIN_DIR}/simulated_annealing_re_planted_sol_${BACKEND} &> ${TEST_OUT_DIR}/simulated_annealing_re_planted_sol_${BACKEND}_${P}_${T}.log + ( grep "Test OK" ${TEST_OUT_DIR}/simulated_annealing_re_planted_sol_${BACKEND}_${P}_${T}.log ) || printf 'Test FAILED.\n' + fi done done diff --git a/tests/smoke/test_qubo_parallel.py b/tests/smoke/test_qubo_parallel.py new file mode 100644 index 000000000..b81ad862d --- /dev/null +++ b/tests/smoke/test_qubo_parallel.py @@ -0,0 +1,664 @@ +""" +Copyright © 2023, United States Government, as represented by the Administrator +of the National Aeronautics and Space Administration. All rights reserved. + +The PySA, a powerful tool for solving optimization problems is licensed under +the Apache License, Version 2.0 (the "License"); you may not use this file +except in compliance with the License. You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0. + +Unless required by applicable law or agreed to in writing, software distributed +under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +CONDITIONS OF ANY KIND, either express or implied. See the License for the +specific language governing permissions and limitations under the License.
+""" + +from more_itertools import distribute +from itertools import repeat +from multiprocessing import Pool +from os import cpu_count + +import numpy as np +from typing import List, Tuple, Any, Callable, Optional, NoReturn + + +Vector = List[float] +# Matrix = List[List[float]] instead for List[List[float]] we will use sparse matrix from scipy +from scipy.sparse import csr_matrix +# Define general type +dtype = 'float' +# set random seed +random_seed = 8 + +Vector = List[float] +Matrix = csr_matrix +State = List[float] +RefProbFun = Callable[[Vector, Optional[int]], float] +EnergyFunction = Callable[[Matrix, Vector, State], float] +UpdateSpinFunRef = Callable[ + [Matrix, Vector, Vector, int, float, float, RefProbFun], float] +UpdateSpinFunction = Callable[[Matrix, Vector, Vector, int, float, float], + float] +SweepFunction = Callable[[UpdateSpinFunction, Matrix, Vector, Vector, float], + float] + +def partition_rows_by_independent_sets(couplings: csr_matrix, *, method: str = "welsh_powell"): + """ + Partition the rows of a sparse symmetric matrix into independent sets (color classes) + so that within each set no two rows share a non-zero coupling (i.e. they are + pairwise non-adjacent in the row-interaction graph). + + This implements a greedy graph-coloring (Welsh-Powell) style algorithm on the + graph implied by the sparsity pattern of `couplings`. The intent is to produce + row blocks that can be updated in parallel (or in any order) without violating + sequential Metropolis dependencies. + + Parameters + - couplings: csr_matrix (shape [n, n]) sparse symmetric coupling matrix (only sparsity matters) + - method: currently only "welsh_powell" supported + + Returns + - blocks: list of lists, each inner list is a color class containing row indices + - colors: numpy array of length n with integer color for each row + + Complexity: O(n log n + m) where m is number of nonzeros (greedy ordering cost) + """ + + if not isinstance(couplings, csr_matrix): + couplings = csr_matrix(couplings) + + n = couplings.shape[0] + + # Build adjacency lists from the sparsity pattern (ignore diagonal) + indptr = couplings.indptr + indices = couplings.indices + + degrees = np.diff(indptr) + + # Order vertices by decreasing degree (Welsh-Powell) + order = np.argsort(-degrees) + + colors = -1 * np.ones(n, dtype=int) + blocks = [] + + for v in order: + if colors[v] != -1: + continue + + # Try to place v into the first existing color where it's independent + placed = False + for c_idx, block in enumerate(blocks): + # Check independence: v must not be adjacent to any node in block + # We can test by scanning neighbors of v and see if any have colors == c_idx + neigh = indices[indptr[v]:indptr[v+1]] + if not np.intersect1d(neigh, np.array(block)).size: + block.append(int(v)) + colors[v] = c_idx + placed = True + break + + if not placed: + # Create new color + colors[v] = len(blocks) + blocks.append([int(v)]) + + return blocks, colors + + +def compute_row_batches(couplings: csr_matrix, *, order: str = "color", return_row_blocks: bool = False): + """ + Compute a sequential list of batches (sizes) that cover all rows exactly once. + + The simplest use is to partition rows by independent sets using + `partition_rows_by_independent_sets` and then produce a list of batch sizes + (and optionally the row indices per batch) that should be executed + sequentially. This is useful for the lazy-accumulator sweep which wants + to update entire independent blocks before flushing or proceeding. 
+ + Parameters + - couplings: csr_matrix + - order: currently only 'color' supported (return blocks in color order) + - return_row_blocks: if True return (batches, row_blocks) else return batches + + Returns + - batches: list[int] of batch sizes whose sum equals n + - row_blocks (optional): list[list[int]] of row indices per batch + """ + + blocks, colors = partition_rows_by_independent_sets(couplings) + + # Optionally reorder blocks; for now 'color' is the natural order + row_blocks = blocks + batches = [len(b) for b in row_blocks] + + if return_row_blocks: + return batches, row_blocks + return batches + + + +def masked_row_dot(couplings: Matrix, D: np.ndarray, pos: int, bs: int) -> float: + """ + Compute the contribution of the accumulated deltas `D` to the local field + at row `pos`, i.e. return (J[row=pos] dot D). This models a masked-mxv + primitive that computes only the scalar contribution required for the + Metropolis decision at `pos` without materializing the whole h += J.dot(D). + + In Python we use the sparse row matvec; in C++ this should map to a + fast masked-mxv primitive or a pipeline read for the single scalar. + """ + # For CSR matrices getrow returns a 1xN sparse matrix; dot(D) returns a 1-elem array + # we use todense to simulate a fast masked-mxv primitive + return (couplings.todense()[pos:pos+bs, :].dot(D)) + + +def get_energy(couplings: Matrix, local_fields: Vector, state: State) -> float: + """ + Compute energy given couplings and local fields. + """ + # Ensure state is 0/1 + assert np.all((state == 0) | (state == 1)), "State must contain only 0 or 1 values." + # Ensure shapes are compatible + assert couplings.shape[0] == couplings.shape[1], "Couplings must be a square matrix." + assert couplings.shape[0] == state.shape[0], "State and couplings must have compatible shapes." + assert local_fields.shape[0] == state.shape[0], "Local fields and state must have compatible shapes." + return state.dot(couplings.dot(state) / 2 + local_fields) + + +def sequential_sweep_x(couplings: Matrix, local_fields: Vector, + state: State, beta: float) -> float: + """ + Metropolis sweep that preserves sequential Metropolis semantics but avoids + expensive per-row dense conversions by precomputing the local field + h = J.dot(state) + local_fields and updating h incrementally when a spin + flip is accepted. Works with CSR sparse `couplings`. + """ + + n = len(state) + + # Precompute local fields h = J.dot(state) + local_fields. In a lazy + # evaluation framework you'd typically start a pipeline here that + # represents h but we materialize once for correctness in the Python + # prototype. + h = couplings.dot(state) + local_fields + + # Random numbers (log uniform) + log_r = np.log(np.random.random(size=n)) + + delta_energy = 0.0 + + # D is the lazy accumulator vector: it holds pending 0/1 deltas that we + # haven't yet materialized into h. The lazy evaluator in your framework + # would instead record these into a pipeline and only execute when a + # reduction-to-scalar (dot, min, sum) requires it. + D = np.zeros(n, dtype=h.dtype) + + # iterate sequentially (Metropolis semantics). We allow D to accumulate + # multiple nonzeros. For each pos we compute the scalar contribution from + # D to h[pos] via a masked row dot (row J[pos,:] dot D) so we don't have + # to apply the whole matvec until we need to. 
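+ # Illustrative example (assumed toy values, not from the prototype run): with n = 4 and + # pending deltas D = [1, 0, 0, -1], the correction for row 2 is + # masked_row_dot(couplings, D, 2, 1) = J[2,0] - J[2,3], so only the row about to be + # visited reads the effect of earlier flips while h itself is not rewritten yet.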
+ + # Use precomputed batches/row_blocks attached to the matrix when available + # (compute_row_batches attaches them to the matrix as _row_blocks/_batches) + if hasattr(couplings, '_row_blocks') and hasattr(couplings, '_batches'): + row_blocks = getattr(couplings, '_row_blocks') + batches = getattr(couplings, '_batches') + else: + # Compute batches (row blocks) from the matrix sparsity pattern. Each + # block is an independent set and can be updated without internal + # dependencies. compute_row_batches returns sizes; request row blocks too. + batches, row_blocks = compute_row_batches(couplings, return_row_blocks=True) + + # Iterate sequentially over blocks. Inside each block rows are independent + # so they can be processed in any order (or in parallel) while preserving + # Metropolis semantics between blocks. + for block_idx, rows in enumerate(row_blocks): + # test_states for debugging + test_states = state.copy() + + # If there are pending deltas compute their effect on these rows only + # using a masked row dot over the block. masked_row_dot expects a + # starting pos and block size; we call it for each row in the block. + #row_contrib = np.asarray([masked_row_dot(couplings, D, r, 1).item() for r in rows]) + # rewritten as a dense matrix vector product for the block to simulate efficent code + print("rows:",rows) + row_contrib = couplings[rows, :].dot(D) + + # compute delta energies for the block + state_slice = np.array(state)[rows] + h_slice = np.array(h)[rows] + dn = (2.0 * state_slice - 1.0) * (h_slice + row_contrib) + + # Vectorized Metropolis decision for rows in this block + accept = (dn >= 0) | (log_r[rows] < beta * dn) + old = np.array(state)[rows] + new = np.where(accept, 1 - old, old) + delta_energy += -np.sum(dn * accept) + # Update state and D in place + for idx, r in enumerate(rows): + state[r] = new[idx] + D[r] += (new[idx] - old[idx]) + + + # Flush any remaining accumulated deltas into h before returning. This + # materializes the lazy pipeline; the framework could do this lazily at + # a later synchronization point instead. + # if np.any(D): + # apply_accumulated(couplings, D, h) + + return float(delta_energy) + + +def sequential_sweep_immediate(couplings: Matrix, local_fields: Vector, + state: State, beta: float, printinfo: bool = False) -> float: + """ + Immediate-update Metropolis sweep: on each accepted flip we update the + local-field vector `h` immediately by iterating the nonzeros of the + flipped row (neighbor updates). This is the standard efficient approach + for CSR matrices and is provided here for performance comparison against + the lazy `D`-accumulator approach. + + For simplicity in this prototype, we convert the sparse matrix to dense internally. + """ + n = len(state) + # Convert couplings to dense numpy array + dense_couplings = couplings.toarray() + h = dense_couplings.dot(state) + local_fields + log_r = np.log(np.random.random(size=n)) + delta_energy = 0.0 + + # Use same batching mechanism as sequential_sweep_x to improve locality. 
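+ # Illustrative note (not from the original prototype): the dense update below applies + # h += J[:, rows].dot(delta); e.g. if rows = [2, 5] and both 0 -> 1 flips are accepted, + # delta = [1, 1] and every h[i] gains J[i,2] + J[i,5] in one shot, which keeps later + # blocks consistent with the flips made in this one.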
+ if hasattr(couplings, '_row_blocks') and hasattr(couplings, '_batches'): + row_blocks = getattr(couplings, '_row_blocks') + batches = getattr(couplings, '_batches') + else: + batches, row_blocks = compute_row_batches(couplings, return_row_blocks=True) + + for block_idx, rows in enumerate(row_blocks): + # process rows in this independent block (vectorized) + if printinfo: + print("rows = ", rows) + rows = np.array(rows) + hi = h[rows] + si = state[rows] + dn = (2.0 * si - 1.0) * hi + + # Vectorized Metropolis decision + accept = (dn >= 0) | (log_r[rows] < beta * dn) + old = si + new = np.where(accept, 1 - old, old) + delta = new - old + + # Update state and delta_energy + state[rows] = new + delta_energy += -np.sum(dn * accept) + + # Update h for all spins (dense update) + if np.any(delta): + h += dense_couplings[:, rows].dot(delta) + + return float(delta_energy) + + + +def pt(states: List[State], energies: List[float], beta_idx: List[int], + betas: List[float]) -> NoReturn: + """ + Parallel tempering move. + states: [n_replicas, ...] Array of replicas + energies: [n_replicas] Array of energies of each replica + beta_idx: [n_replicas] The replica index currently assigned to each beta, + i.e. inverse temperature K is currently used for simulating replica beta_idx[K] + betas: [n_replicas] Sequential array of inverse temperatures. + + This function only modifies the order of beta_idx. + """ + print("pt(in):",energies[beta_idx], " beta(in):", betas[beta_idx]) + + # Get number of replicas + n_replicas = len(states) + + # Apply PT for each pair of replicas + for k in range(n_replicas - 1): + + # Get first index + k1 = n_replicas - k - 1 + + # Get second index + k2 = n_replicas - k - 2 + + # Compute delta energy + de = (energies[beta_idx[k1]] - energies[beta_idx[k2]]) * (betas[k1] - betas[k2]) + + # Accept/reject following Metropolis + if de >= 0 or np.random.random() < np.exp(de): + beta_idx[k1], beta_idx[k2] = beta_idx[k2], beta_idx[k1] + + print("pt(out):",energies[beta_idx], " beta(out):", betas[beta_idx]) + + + +def simulation_parallel_x( + sweep: SweepFunction, + couplings: Matrix, + local_fields: Vector, + states: List[State], + energies: List[float], + beta_idx: List[int], + betas: List[float], + n_sweeps: int, + get_part_fun: bool = False, + use_pt: bool = True) -> Tuple[State, float, int, int]: + """ + Apply simulation. + """ + + # Get number of replicas + n_replicas = len(states) + + # Best configuration/energy + _best_energy = np.copy(energies) + _best_state = np.copy(states) + _best_sweeps = np.zeros(n_replicas, dtype=np.int32) + betas_sorted = np.empty_like(betas) + log_omegas = np.zeros(n_sweeps) + + # For each run ... + for s in range(n_sweeps): + for k in range(n_replicas): + betas_sorted[beta_idx[k]] = betas[k] + # ... apply sweep for each replica ... + # interate k in random order to avoid bias + #print("betas_sorted: ",betas_sorted) + perm = np.random.permutation(n_replicas) + print("perm: ",perm) + for k in perm: # numba.prange(n_replicas): + + # Apply sweep + tmp = sweep(couplings, local_fields,states[k], betas_sorted[k]) + energies[k] += tmp + print("Replica ",k," energy=",energies[k]) + + # Store best state + if energies[k] < _best_energy[k]: + _best_energy[k] = energies[k] + _best_state[k] = np.copy(states[k]) + _best_sweeps[k] = s + + # ... and pt move. 
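+ # Exchange criterion applied by pt(): for adjacent temperature slots k1 = k2 + 1 it computes + # de = (E[beta_idx[k1]] - E[beta_idx[k2]]) * (betas[k1] - betas[k2]) and accepts the swap when + # de >= 0, or otherwise with probability exp(de) (the standard Metropolis rule for replica exchange).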
+ #print("beta_idx after sweep: ",beta_idx) + if use_pt: + pt(states, energies, beta_idx, betas) + #print("beta_idx after pt: ",beta_idx) + # Calculate the weights for the partition function + + # Get lowest energy + best_pos = np.argmin(_best_energy) + best_state = _best_state[best_pos] + best_energy = _best_energy[best_pos] + best_sweeps = _best_sweeps[best_pos] + + # Return states and energies + return ((states, energies, beta_idx, log_omegas), (best_state, best_energy, + best_sweeps, s + 1)) + + + +def get_min_energy(couplings: Matrix, local_fields: Vector): + + # Get number of variables + n_vars = couplings.shape[0] + # assert square matrix + assert couplings.shape[0] == couplings.shape[1], "Couplings must be a square matrix." + # assert compatible shapes + assert local_fields.shape[0] == n_vars, "Local fields must have the same number of variables as couplings." + min_energy = np.inf + best_state = np.array([0]*n_vars, dtype=dtype) + + # Find minimum energy by bruteforce + for state in range(2**n_vars): + # Transform state + spin_state_unsigned = np.array([int(x) for x in bin(state)[2:].zfill(n_vars)], dtype=dtype) + # Get energy for the state + assert np.all(np.isin(spin_state_unsigned, [0, 1])), "State contains values other than 0 and 1" + energy = get_energy(couplings, local_fields, spin_state_unsigned) + # Store only the minimum energy + if energy < min_energy: + min_energy = energy + best_state = np.copy(spin_state_unsigned) + + return min_energy, best_state + + +def gen_random_problem(n_vars: int, + dtype: Any = 'float', nzratio = 0.1, test_dense: bool = False, printinfo: bool = False) -> Tuple[Matrix, Vector]: + + # Generate random problem + if (test_dense): + couplings = 2 * np.random.random((n_vars, n_vars)).astype(dtype) - 1 + couplings = (couplings + couplings.T) / 2 + vals = couplings.flatten() + row = np.array([i for i in range(n_vars) for j in range(n_vars)]) + col = np.array([j for i in range(n_vars) for j in range(n_vars)]) + else: + # couplings are random sparse matrix instead of dense with nz none zero elements + nz = int(nzratio * n_vars * n_vars) + row = np.sort(np.random.randint(0, n_vars, nz)) + col = np.sort(np.random.randint(0, n_vars, nz)) + # make sure there are no duplicate entries ins same i,j pairs in (row,col) + unique = np.unique(np.array([row, col]).T) + row = row[unique] + col = col[unique] + vals = 2 * np.random.random(len(unique)).astype(dtype) - 1 + + couplings = csr_matrix((vals, (row, col)), shape=(n_vars, n_vars)) + couplings = (couplings + couplings.T) / 2 + diag_couplings = couplings.diagonal() + #set diagonal to zero + couplings = couplings - csr_matrix((diag_couplings, (np.arange(n_vars), np.arange(n_vars))), shape=(n_vars, n_vars)) + + if printinfo: + # print sparse matrix structure + # ie + # 0 0 0 1 0 + # 0 0 1 0 0 + # 0 1 0 0 0 + # print actual value for the unit test + print("Couplings matrix (indices and nonzero values in COO):") + print("Row indices:", couplings.nonzero()[0]) + print("Column indices:", couplings.nonzero()[1]) + print("Nonzero values:", couplings.data) + + print("Couplings matrix structure (*=nonzero, .=zero):") + dense_couplings = couplings.toarray() + for i in range(n_vars): + row_str = "" + for j in range(n_vars): + if dense_couplings[i,j] != 0: + row_str += "* " + else: + row_str += ". 
" + print("[{}]".format(i),"\t",row_str) + + # Split in couplings and local_fields + #local_fields = np.copy(np.diagonal(couplings)) + #make local_fields random instead of from diagonal + local_fields = 2 * np.random.random(n_vars).astype(dtype) - 1 + #print local_fields + if printinfo: + print("Local fields (random):", local_fields) + + return couplings, local_fields + + +def test_sequential_sweep_simulation_qubo(n_vars: int): + + n_replicas = 3 + print("n_vars =",n_vars) + print("n_replicas =",n_replicas) + + + # Generate random problem + couplings, local_fields = gen_random_problem(n_vars, dtype=dtype,printinfo=True) + + # Find minimum energy by bruteforce + min_energy,best_state_bruteforce = get_min_energy(couplings, local_fields) + + # Fix temperature + betas = np.array([10]*n_replicas, dtype=dtype) + print("Betas =",betas) + beta_idx = np.arange(n_replicas) + print("Initial beta_idx =",beta_idx) + # Get initial state + states = np.random.randint(2, size=(n_replicas, n_vars)).astype(dtype) + print("Initial states =") + for s in states: + print(s) + + # Compute energies + for s in states: + assert np.all(np.isin(s, [0, 1])), "State contains values other than 0 and 1" + energies = np.array( + [get_energy(couplings, local_fields, state) for state in states]) + print("Initial energies=",energies) + + # Simulate + print("beta_idx =",beta_idx) + nsweeps = 2 + (state, energy, _, _), (best_state, best_energy, _, _) = simulation_parallel_x( + sequential_sweep_immediate, + couplings, + local_fields, + states, + energies, + beta_idx, + betas, + nsweeps) + + # Check that best energy is correct + #ref_best_energy = -7.9322789708332255 # dense + ref_best_energy = -5.079571790854985 # dense + ref_nsweeps = 2 + ref_n_vars = 16 + ref_random_seed = 8 + # make sure parameters match reference + assert n_vars == ref_n_vars, f"n_vars {n_vars} does not match reference {ref_n_vars}" + assert nsweeps == ref_nsweeps, f"nsweeps {nsweeps} does not match reference {ref_nsweeps}" + assert random_seed == ref_random_seed, f"random_seed {random_seed} does not match reference {ref_random_seed}" + # best_energy + assert np.isclose(best_energy, ref_best_energy), f"best_energy {best_energy} does not match reference {ref_best_energy}" + + assert np.all(np.isin(best_state, [0, 1])), "State contains values other than 0 and 1" + qubo_energy = get_energy(couplings, local_fields, best_state) + + print("beta_idx(out) =",beta_idx) + + print("energy =",energy) + for e in energy: + print(e) + print("best_energy by solver = ",best_energy) + print("best_energy by bruteforce = ",min_energy) + + print("state(out) =") + for s in state: + print(s) + print("best state by solver = ",best_state) + print("best state by bruteforce = ",best_state_bruteforce) + + + assert (np.isclose(best_energy, + get_energy(couplings, local_fields, best_state))) + + # Best energy should be always larger than the minimum energy + assert (np.round(best_energy, 6) >= np.round(min_energy, 6)) + #assert (np.isclose(qubo_energy, ref_energy)) + + + +# ## test semantics of row-wise vs immediate update sweeps +# def test_rowwise_vs_immediate_equivalence( +# nzratio=0.25, +# seed0 = 12345, +# seed1 = 20241010, +# n_vars = 815, +# beta = 10.0): +# """ +# Cross-check that a row-wise sequential sweep implemented via +# `sequential_sweep_rowwise` (with an `update_spin` that follows the +# immediate-neighbor-update semantics) produces the same delta_energy and +# final state as `sequential_sweep_immediate` when both use the same RNG +# seed. 
+# """ +# def sequential_sweep_rowwise(couplings: Matrix, local_fields: Vector, +# state: State, beta: float) -> float: +# """ +# Metropolis update. +# """ +# def update_spin(couplings: Matrix, local_fields: Vector, state: State, pos: int, beta: float, log_r: float) -> float: +# """ +# Update spin accordingly to Metropolis update. +# """ +# # Ensure state is 0/1 +# assert np.all((state == 0) | (state == 1)), "State must contain only 0 or 1 values." +# # Ensure pos is valid +# assert 0 <= pos < state.shape[0], "pos index out of bounds." +# # Ensure shapes are compatible +# assert couplings.shape[0] == couplings.shape[1], "Couplings must be a square matrix." +# assert couplings.shape[0] == state.shape[0], "State and couplings must have compatible shapes." +# assert local_fields.shape[0] == state.shape[0], "Local fields and state must have compatible shapes." + +# # Get the negate delta energy (qubo) +# # delta_n_energy = (2. * state[pos] - 1.) * (couplings[pos].dot(state) + local_fields[pos]) +# # # we need to rewrite this for sparse matrix couplings, ie couplings[pos] for dense matrix becomes couplings.getrow(pos).toarray()[0] +# delta_n_energy = (2. * state[pos] - 1.) * (couplings.getrow(pos).toarray()[0].dot(state) + local_fields[pos]) + +# # Metropolis update +# if delta_n_energy >= 0 or log_r < beta * delta_n_energy: +# # Update spin (qubo) +# state[pos] = 0 if state[pos] else 1 +# # Return delta energy +# return -delta_n_energy +# else: +# # Otherwise, return no change in energy +# return 0. + +# # Get random numbers +# log_r = np.log(np.random.random(size=len(state))) + +# # Try to update every spin +# delta_energy = 0. +# for pos in range(len(state)): +# delta_energy += update_spin(couplings, local_fields, state, pos, beta,log_r[pos]) + +# return delta_energy + +# np.random.seed(seed0) +# couplings, local_fields = gen_random_problem(n_vars, dtype=dtype, nzratio=nzratio) + + +# # initial state +# init_state = np.random.randint(2, size=n_vars).astype(dtype) + +# # copy for both runs +# state_immediate = init_state.copy() +# state_rowwise = init_state.copy() + +# np.random.seed(seed1) +# delta_immediate = sequential_sweep_immediate(couplings, local_fields, state_immediate, beta) + +# # run rowwise with the same RNG sequence +# np.random.seed(seed1) +# delta_rowwise = sequential_sweep_rowwise(couplings, local_fields, state_rowwise, beta) + +# # Compare energies and final states +# assert np.isclose(delta_immediate, delta_rowwise), f"delta_energy differs: immediate={delta_immediate}, rowwise={delta_rowwise}" +# assert np.array_equal(state_immediate, state_rowwise), f"states differ after sweep: immediate={state_immediate}, rowwise={state_rowwise}" +# print("Rowwise and Immediate sweeps are equivalent.") + +# for i in range(5): +# test_rowwise_vs_immediate_equivalence(seed1=i) + +# run several time to check parallelism +for _ in range(4): + np.random.seed(random_seed) + test_sequential_sweep_simulation_qubo(n_vars=16) diff --git a/tests/utils/print_vec_mat.hpp b/tests/utils/print_vec_mat.hpp index 7a95242ed..1f9770cbf 100644 --- a/tests/utils/print_vec_mat.hpp +++ b/tests/utils/print_vec_mat.hpp @@ -90,7 +90,7 @@ void print_vector( os << it->second; (void) ++it; } else if( x_size > 0 ) { - os << 0; + os << '-'; } size_t next_nnz, position; next_nnz = it == end ? 
limit : it->first; @@ -100,14 +100,18 @@ void print_vector( // print sequence of zeroes for( ; position < zero_streak; ++position ) { os << ", "; - os << 0; + os << '-'; } if( position < limit ) { os << ", "; os << it->second; (void) ++position; - (void) ++it; - next_nnz = it->first; + if( it != end ){ + (void) ++it; + next_nnz = it->first; + }else{ + next_nnz = limit; + } } } os << std::endl << "==============" << std::endl << std::endl;