Algebraic-Programming
diff --git a/tests/performance/CMakeLists.txt b/tests/performance/CMakeLists.txt
Lines changed: 8 additions & 1 deletion
diff --git a/tests/performance/bench_kernels.c b/tests/performance/bench_kernels.c
Lines changed: 31 additions & 13 deletions
diff --git a/tests/performance/bench_kernels.h b/tests/performance/bench_kernels.h
Lines changed: 8 additions & 1 deletion
@@ -26,17 +26,24 @@ set( TEST_CATEGORY "performance" )
 add_library( bench_kernels OBJECT bench_kernels.c bench_kernels.h )
 add_library( bench_kernels_omp OBJECT bench_kernels.c bench_kernels.h )
 target_compile_definitions( bench_kernels_omp PRIVATE BENCH_KERNELS_OPENMP )
+target_link_libraries( bench_kernels PRIVATE test_performance_flags )
+target_link_libraries( bench_kernels_omp PRIVATE test_performance_flags OpenMP::OpenMP_C )
 
 add_grb_executables( fma fma.cpp $<TARGET_OBJECTS:bench_kernels>
 	BACKENDS reference NO_BACKEND_NAME
 	ADDITIONAL_LINK_LIBRARIES "rt"
 )
 
-add_grb_executables( fma-openmp fma.cpp $<TARGET_OBJECTS:bench_kernels_omp>
+add_grb_executables( fma-blocking fma.cpp $<TARGET_OBJECTS:bench_kernels_omp>
 	BACKENDS reference_omp NO_BACKEND_NAME
 	ADDITIONAL_LINK_LIBRARIES OpenMP::OpenMP_CXX "rt"
 )
 
+add_grb_executables( fma-nonblocking fma.cpp $<TARGET_OBJECTS:bench_kernels_omp>
+	BACKENDS nonblocking NO_BACKEND_NAME
+	ADDITIONAL_LINK_LIBRARIES OpenMP::OpenMP_CXX "rt"
+)
+
 add_grb_executables( reduce reduce.cpp $<TARGET_OBJECTS:bench_kernels>
 	BACKENDS reference NO_BACKEND_NAME
 )
 
@@ -21,6 +21,8 @@
 
 #ifdef BENCH_KERNELS_OPENMP
 
+bool bench_kernels_parallel(void) { return true; } /* BENCH_KERNELS_OPENMP build: kernels are parallel */
+
 void bench_kernels_axpy(
 	double * restrict a,
 	const double alpha, const double * restrict x,
@@ -30,9 +32,25 @@ void bench_kernels_axpy(
 	assert( a != x );
 	assert( a != y );
 	assert( x != y );
-	#pragma omp parallel for schedule(static,8)
-	for( size_t i = 0; i < n; ++i ) {
-		a[ i ] = alpha * x[ i ] + y[ i ];
+	#pragma omp parallel
+	{
+		const size_t P = omp_get_num_threads(); // number of threads in this team
+		const size_t s = omp_get_thread_num(); // ID of the calling thread
+		const size_t chunk = (n % P == 0) ? (n/P) : (n/P) + 1; // ceil( n / P )
+		size_t start = chunk * s;
+		if( start > n ) { // clamp to n (not n-1): surplus threads must get an empty range
+			start = n;
+		}
+		size_t end = start + chunk;
+		if( end > n ) { // the last non-empty range may be shorter than chunk
+			end = n;
+		}
+		assert( start <= end );
+		if( start != end ) { // ranges [start,end) are disjoint, so each a[ i ] has one writer
+			for( size_t i = start; i < end; ++i ) {
+				a[ i ] = alpha * x[ i ] + y[ i ];
+			}
+		}
 	}
 }
 
@@ -45,7 +63,8 @@ void bench_kernels_dot(
 	assert( alpha != xr );
 	assert( alpha != yr );
 	*alpha = xr[ n - 1 ] * yr[ n - 1];
-	#pragma omp parallel
+	double global_alpha = 0;
+	#pragma omp parallel reduction(+:global_alpha)
 	{
 		const size_t P = omp_get_num_threads();
 		const size_t s = omp_get_thread_num();
@@ -64,20 +83,19 @@ void bench_kernels_dot(
 			for( size_t i = start; i < end - 1; ++i ) {
 				local_alpha += xr[ i ] * yr[ i ];
 			}
-			#pragma omp critical
-			{
-				*alpha += local_alpha;
-			}
+			global_alpha += local_alpha;
 		}
 	}
+	*alpha += global_alpha;
 }
 
 void bench_kernels_reduce(
 	double * restrict const alpha, const double * restrict xr, const size_t n
 ) {
 	assert( alpha != xr );
 	*alpha = xr[ n - 1 ];
-	#pragma omp parallel
+	double global_alpha = 0.0;
+	#pragma omp parallel reduction(+:global_alpha)
 	{
 		const size_t P = omp_get_num_threads();
 		const size_t s = omp_get_thread_num();
@@ -96,16 +114,16 @@ void bench_kernels_reduce(
 			for( size_t i = start; i < end - 1; ++i ) {
 				local_alpha += xr[ i ];
 			}
-			#pragma omp critical
-			{
-				*alpha += local_alpha;
-			}
+			global_alpha += local_alpha;
 		}
 	}
+	*alpha += global_alpha;
 }
 
 #else
 
+bool bench_kernels_parallel(void) { return false; } /* non-OpenMP build: kernels are sequential */
+
 void bench_kernels_axpy(
 	double * restrict a,
 	const double alpha, const double * restrict x,
 
@@ -18,7 +18,7 @@
 
 #include <omp.h>
 #include <assert.h>
-#include <stddef.h> //for size_t
+#include <stddef.h> // for size_t
 
 
 #ifdef __cplusplus
@@ -41,10 +41,14 @@ extern "C" {
 		double * __restrict__ const, const double * __restrict__, const size_t
 	);
 
+	bool bench_kernels_parallel();
+
 }
 
 #else
 
+#include <stdbool.h> // for bool
+
 /**
  * Executes \f$ a = \alpha x + y \f$ for \a a, \a x, and \a y vectors of
  * length \a n.
@@ -89,5 +93,8 @@ void bench_kernels_reduce(
 	double * restrict const alpha, const double * restrict x, const size_t n
 );
 
+/** @returns Whether the kernels defined here are (shared-memory) parallel. */
+bool bench_kernels_parallel(void);
+
 #endif