SC-SGS
diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt
Lines changed: 1 addition & 0 deletions
diff --git a/core/include/gprat/cpu/adapter_cblas_fp32.hpp b/core/include/gprat/cpu/adapter_cblas_fp32.hpp
Lines changed: 50 additions & 46 deletions
diff --git a/core/include/gprat/cpu/adapter_cblas_fp64.hpp b/core/include/gprat/cpu/adapter_cblas_fp64.hpp
Lines changed: 57 additions & 49 deletions
@@ -11,6 +11,7 @@ add_compile_definitions(GPRAT_WITH_CUDA=$<BOOL:${GPRAT_WITH_CUDA}>)
 set(SOURCE_FILES
     src/gprat.cpp
     src/utils.cpp
+    src/performance_counters.cpp
     src/target.cpp
     src/kernels.cpp
     src/hyperparameters.cpp
 
@@ -4,14 +4,13 @@
 #pragma once
 
 #include "gprat/detail/config.hpp"
+#include "gprat/tile_data.hpp"
 
 #include <hpx/future.hpp>
 #include <vector>
 
 GPRAT_NS_BEGIN
 
-using vector_future = hpx::shared_future<std::vector<float>>;
-
 // Constants that are compatible with CBLAS
 typedef enum BLAS_TRANSPOSE { Blas_no_trans = 111, Blas_trans = 112 } BLAS_TRANSPOSE;
 
@@ -29,69 +28,71 @@ typedef enum BLAS_ALPHA { Blas_add = 1, Blas_substract = -1 } BLAS_ALPHA;
 
 /**
  * @brief FP32 In-place Cholesky decomposition of A
- * @param f_A matrix to be factorized
+ * @param A matrix to be factorized
  * @param N matrix dimension
  * @return factorized, lower triangular matrix L
  */
-vector_future potrf(vector_future f_A, const int N);
+mutable_tile_data<float> potrf(const mutable_tile_data<float> &A, int N);
 
 /**
  * @brief FP32 In-place solve L(^T) * X = A or X * L(^T) = A where L lower triangular
- * @param f_L Cholesky factor matrix
- * @param f_A right hand side matrix
+ * @param L Cholesky factor matrix
+ * @param A right hand side matrix
  * @param N first dimension
  * @param M second dimension
  * @return solution matrix X
  */
-vector_future trsm(vector_future f_L,
-                   vector_future f_A,
-                   const int N,
-                   const int M,
-                   const BLAS_TRANSPOSE transpose_L,
-                   const BLAS_SIDE side_L);
+mutable_tile_data<float>
+trsm(const const_tile_data<float> &L,
+     const mutable_tile_data<float> &A,
+     int N,
+     int M,
+     BLAS_TRANSPOSE transpose_L,
+     BLAS_SIDE side_L);
 
 /**
  * @brief FP32 Symmetric rank-k update: A = A - B * B^T
- * @param f_A Base matrix
- * @param f_B Symmetric update matrix
+ * @param A Base matrix
+ * @param B Symmetric update matrix
  * @param N matrix dimension
  * @return updated matrix A
  */
-vector_future syrk(vector_future f_A, vector_future f_B, const int N);
+mutable_tile_data<float> syrk(const mutable_tile_data<float> &A, const const_tile_data<float> &B, int N);
 
 /**
  * @brief FP32 General matrix-matrix multiplication: C = C - A(^T) * B(^T)
- * @param f_C Base matrix
- * @param f_B Right update matrix
- * @param f_A Left update matrix
+ * @param C Base matrix
+ * @param B Right update matrix
+ * @param A Left update matrix
  * @param N first matrix dimension
  * @param M second matrix dimension
  * @param K third matrix dimension
  * @param transpose_A transpose left matrix
  * @param transpose_B transpose right matrix
  * @return updated matrix C
  */
-vector_future
-gemm(vector_future f_A,
-     vector_future f_B,
-     vector_future f_C,
-     const int N,
-     const int M,
-     const int K,
-     const BLAS_TRANSPOSE transpose_A,
-     const BLAS_TRANSPOSE transpose_B);
+mutable_tile_data<float>
+gemm(const const_tile_data<float> &A,
+     const const_tile_data<float> &B,
+     const mutable_tile_data<float> &C,
+     int N,
+     int M,
+     int K,
+     BLAS_TRANSPOSE transpose_A,
+     BLAS_TRANSPOSE transpose_B);
 
 // BLAS level 2 operations
 
 /**
  * @brief FP32 In-place solve L(^T) * x = a where L lower triangular
- * @param f_L Cholesky factor matrix
- * @param f_a right hand side vector
+ * @param L Cholesky factor matrix
+ * @param a right hand side vector
  * @param N matrix dimension
  * @param transpose_L transpose Cholesky factor
  * @return solution vector x
  */
-vector_future trsv(vector_future f_L, vector_future f_a, const int N, const BLAS_TRANSPOSE transpose_L);
+mutable_tile_data<float>
+trsv(const const_tile_data<float> &L, const mutable_tile_data<float> &a, int N, BLAS_TRANSPOSE transpose_L);
 
 /**
  * @brief FP32 General matrix-vector multiplication: b = b - A(^T) * a
@@ -103,34 +104,37 @@ vector_future trsv(vector_future f_L, vector_future f_a, const int N, const BLAS
  * @param transpose_A transpose update matrix
  * @return updated vector b
  */
-vector_future gemv(vector_future f_A,
-                   vector_future f_a,
-                   vector_future f_b,
-                   const int N,
-                   const int M,
-                   const BLAS_ALPHA alpha,
-                   const BLAS_TRANSPOSE transpose_A);
+mutable_tile_data<float>
+gemv(const const_tile_data<float> &A,
+     const const_tile_data<float> &a,
+     const mutable_tile_data<float> &b,
+     int N,
+     int M,
+     BLAS_ALPHA alpha,
+     BLAS_TRANSPOSE transpose_A);
 
 /**
  * @brief FP32 Vector update with diagonal SYRK: r = r + diag(A^T * A)
- * @param f_A update matrix
- * @param f_r base vector
+ * @param A update matrix
+ * @param r base vector
  * @param N first matrix dimension
  * @param M second matrix dimension
  * @return updated vector r
  */
-vector_future dot_diag_syrk(vector_future f_A, vector_future f_r, const int N, const int M);
+mutable_tile_data<float>
+dot_diag_syrk(const const_tile_data<float> &A, const mutable_tile_data<float> &r, int N, int M);
 
 /**
  * @brief FP32 Vector update with diagonal GEMM: r = r + diag(A * B)
- * @param f_A first update matrix
- * @param f_B second update matrix
- * @param f_r base vector
+ * @param A first update matrix
+ * @param B second update matrix
+ * @param r base vector
  * @param N first matrix dimension
  * @param M second matrix dimension
  * @return updated vector r
  */
-vector_future dot_diag_gemm(vector_future f_A, vector_future f_B, vector_future f_r, const int N, const int M);
+mutable_tile_data<float> dot_diag_gemm(
+    const const_tile_data<float> &A, const const_tile_data<float> &B, const mutable_tile_data<float> &r, int N, int M);
 
 // BLAS level 1 operations
 
@@ -141,7 +145,7 @@ vector_future dot_diag_gemm(vector_future f_A, vector_future f_B, vector_future
  * @param N vector length
  * @return y - x
  */
-vector_future axpy(vector_future f_y, vector_future f_x, const int N);
+mutable_tile_data<float> axpy(const mutable_tile_data<float> &y, const const_tile_data<float> &x, int N);
 
 /**
  * @brief FP32 Dot product: a * b
@@ -150,7 +154,7 @@ vector_future axpy(vector_future f_y, vector_future f_x, const int N);
  * @param N vector length
  * @return a * b
  */
-float dot(std::vector<float> a, std::vector<float> b, const int N);
+float dot(std::span<const float> a, std::span<const float> b, int N);
 
 GPRAT_NS_END
 
 
@@ -4,14 +4,13 @@
 #pragma once
 
 #include "gprat/detail/config.hpp"
+#include "gprat/tile_data.hpp"
 
 #include <hpx/future.hpp>
 #include <vector>
 
 GPRAT_NS_BEGIN
 
-using vector_future = hpx::shared_future<std::vector<double>>;
-
 // Constants that are compatible with CBLAS
 typedef enum BLAS_TRANSPOSE { Blas_no_trans = 111, Blas_trans = 112 } BLAS_TRANSPOSE;
 
@@ -29,108 +28,117 @@ typedef enum BLAS_ALPHA { Blas_add = 1, Blas_substract = -1 } BLAS_ALPHA;
 
 /**
  * @brief FP64 In-place Cholesky decomposition of A
- * @param f_A matrix to be factorized
+ * @param A matrix to be factorized
  * @param N matrix dimension
  * @return factorized, lower triangular matrix L
  */
-vector_future potrf(vector_future f_A, const int N);
+mutable_tile_data<double> potrf(const mutable_tile_data<double> &A, int N);
 
 /**
  * @brief FP64 In-place solve L(^T) * X = A or X * L(^T) = A where L lower triangular
- * @param f_L Cholesky factor matrix
- * @param f_A right hand side matrix
+ * @param L Cholesky factor matrix
+ * @param A right hand side matrix
  * @param N first dimension
  * @param M second dimension
  * @return solution matrix X
  */
-vector_future trsm(vector_future f_L,
-                   vector_future f_A,
-                   const int N,
-                   const int M,
-                   const BLAS_TRANSPOSE transpose_L,
-                   const BLAS_SIDE side_L);
+mutable_tile_data<double>
+trsm(const const_tile_data<double> &L,
+     const mutable_tile_data<double> &A,
+     int N,
+     int M,
+     BLAS_TRANSPOSE transpose_L,
+     BLAS_SIDE side_L);
 
 /**
  * @brief FP64 Symmetric rank-k update: A = A - B * B^T
- * @param f_A Base matrix
- * @param f_B Symmetric update matrix
+ * @param A Base matrix
+ * @param B Symmetric update matrix
  * @param N matrix dimension
  * @return updated matrix A
  */
-vector_future syrk(vector_future f_A, vector_future f_B, const int N);
+mutable_tile_data<double> syrk(const mutable_tile_data<double> &A, const const_tile_data<double> &B, int N);
 
 /**
  * @brief FP64 General matrix-matrix multiplication: C = C - A(^T) * B(^T)
- * @param f_C Base matrix
- * @param f_B Right update matrix
- * @param f_A Left update matrix
+ * @param C Base matrix
+ * @param B Right update matrix
+ * @param A Left update matrix
  * @param N first matrix dimension
  * @param M second matrix dimension
  * @param K third matrix dimension
  * @param transpose_A transpose left matrix
  * @param transpose_B transpose right matrix
  * @return updated matrix C
  */
-vector_future
-gemm(vector_future f_A,
-     vector_future f_B,
-     vector_future f_C,
-     const int N,
-     const int M,
-     const int K,
-     const BLAS_TRANSPOSE transpose_A,
-     const BLAS_TRANSPOSE transpose_B);
+mutable_tile_data<double>
+gemm(const const_tile_data<double> &A,
+     const const_tile_data<double> &B,
+     const mutable_tile_data<double> &C,
+     int N,
+     int M,
+     int K,
+     BLAS_TRANSPOSE transpose_A,
+     BLAS_TRANSPOSE transpose_B);
 
 // BLAS level 2 operations
 
 /**
  * @brief FP64 In-place solve L(^T) * x = a where L lower triangular
- * @param f_L Cholesky factor matrix
- * @param f_a right hand side vector
+ * @param L Cholesky factor matrix
+ * @param a right hand side vector
  * @param N matrix dimension
  * @param transpose_L transpose Cholesky factor
  * @return solution vector x
  */
-vector_future trsv(vector_future f_L, vector_future f_a, const int N, const BLAS_TRANSPOSE transpose_L);
+mutable_tile_data<double>
+trsv(const const_tile_data<double> &L, const mutable_tile_data<double> &a, int N, BLAS_TRANSPOSE transpose_L);
 
 /**
  * @brief FP64 General matrix-vector multiplication: b = b - A(^T) * a
- * @param f_A update matrix
- * @param f_a update vector
- * @param f_b base vector
+ * @param A update matrix
+ * @param a update vector
+ * @param b base vector
  * @param N first matrix dimension
  * @param M second matrix dimension
  * @param alpha add or subtract update to base vector
  * @param transpose_A transpose update matrix
  * @return updated vector b
  */
-vector_future gemv(vector_future f_A,
-                   vector_future f_a,
-                   vector_future f_b,
-                   const int N,
-                   const int M,
-                   const BLAS_ALPHA alpha,
-                   const BLAS_TRANSPOSE transpose_A);
+mutable_tile_data<double>
+gemv(const const_tile_data<double> &A,
+     const const_tile_data<double> &a,
+     const mutable_tile_data<double> &b,
+     int N,
+     int M,
+     BLAS_ALPHA alpha,
+     BLAS_TRANSPOSE transpose_A);
 
 /**
  * @brief FP64 Vector update with diagonal SYRK: r = r + diag(A^T * A)
- * @param f_A update matrix
- * @param f_r base vector
+ * @param A update matrix
+ * @param r base vector
  * @param N first matrix dimension
  * @param M second matrix dimension
  * @return updated vector r
  */
-vector_future dot_diag_syrk(vector_future f_A, vector_future f_r, const int N, const int M);
+mutable_tile_data<double>
+dot_diag_syrk(const const_tile_data<double> &A, const mutable_tile_data<double> &r, int N, int M);
 
 /**
  * @brief FP64 Vector update with diagonal GEMM: r = r + diag(A * B)
- * @param f_A first update matrix
- * @param f_B second update matrix
- * @param f_r base vector
+ * @param A first update matrix
+ * @param B second update matrix
+ * @param r base vector
  * @param N first matrix dimension
  * @param M second matrix dimension
  * @return updated vector r
  */
-vector_future dot_diag_gemm(vector_future f_A, vector_future f_B, vector_future f_r, const int N, const int M);
+mutable_tile_data<double>
+dot_diag_gemm(const const_tile_data<double> &A,
+              const const_tile_data<double> &B,
+              const mutable_tile_data<double> &r,
+              int N,
+              int M);
 
 // BLAS level 1 operations
 
@@ -141,7 +149,7 @@ vector_future dot_diag_gemm(vector_future f_A, vector_future f_B, vector_future
  * @param N vector length
  * @return y - x
  */
-vector_future axpy(vector_future f_y, vector_future f_x, const int N);
+mutable_tile_data<double> axpy(const mutable_tile_data<double> &y, const const_tile_data<double> &x, int N);
 
 /**
  * @brief FP64 Dot product: a * b
@@ -150,7 +158,7 @@ vector_future axpy(vector_future f_y, vector_future f_x, const int N);
  * @param N vector length
  * @return a * b
  */
-double dot(std::vector<double> a, std::vector<double> b, const int N);
+double dot(std::span<const double> a, std::span<const double> b, int N);
 
 GPRAT_NS_END