Skip to content

Commit 5df0f3e

Browse files
feat(core): Add simple-to-use per-function performance counters
Powered by HPX's performance counter library. Since this library is only built if networking != none, guard against it being missing.
1 parent 08c1eed commit 5df0f3e

File tree

6 files changed

+189
-26
lines changed

6 files changed

+189
-26
lines changed

core/include/gprat/cpu/adapter_cblas_fp32.hpp

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,7 @@
66
#include "gprat/detail/config.hpp"
77
#include "gprat/tile_data.hpp"
88

9-
#include <hpx/future.hpp>
10-
#include <vector>
9+
#include <span>
1110

1211
GPRAT_NS_BEGIN
1312

@@ -96,11 +95,11 @@ trsv(const const_tile_data<float> &L, const mutable_tile_data<float> &a, int N,
9695

9796
/**
9897
* @brief FP32 General matrix-vector multiplication: b = b + alpha * A(^T) * a
99-
* @param f_A update matrix
100-
* @param f_a update vector
101-
* @param f_b base vector
98+
* @param A update matrix
99+
* @param a update vector
100+
* @param b base vector
102101
* @param N matrix dimension
103-
* @param alpha add or substract update to base vector
102+
* @param alpha add or subtract update to base vector
104103
* @param transpose_A transpose update matrix
105104
* @return updated vector b
106105
*/
@@ -140,17 +139,17 @@ mutable_tile_data<float> dot_diag_gemm(
140139

141140
/**
142141
* @brief FP32 AXPY: y - x
143-
* @param f_y left vector
144-
* @param f_x right vector
142+
* @param y left vector
143+
* @param x right vector
145144
* @param N vector length
146145
* @return y - x
147146
*/
148147
mutable_tile_data<float> axpy(const mutable_tile_data<float> &y, const const_tile_data<float> &x, int N);
149148

150149
/**
151150
* @brief FP32 Dot product: a * b
152-
* @param f_a left vector
153-
* @param f_b right vector
151+
* @param a left vector
152+
* @param b right vector
154153
* @param N vector length
155154
* @return a * b
156155
*/

core/include/gprat/cpu/adapter_cblas_fp64.hpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,7 @@
66
#include "gprat/detail/config.hpp"
77
#include "gprat/tile_data.hpp"
88

9-
#include <hpx/future.hpp>
10-
#include <vector>
9+
#include <span>
1110

1211
GPRAT_NS_BEGIN
1312

core/include/gprat/performance_counters.hpp

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,68 @@
55

66
#include "gprat/detail/config.hpp"
77

8+
#include <atomic>
89
#include <cstddef>
910
#include <cstdint>
11+
#include <hpx/modules/assertion.hpp>
12+
#include <hpx/timing/high_resolution_timer.hpp>
13+
#include <hpx/util/get_and_reset_value.hpp>
1014

1115
GPRAT_NS_BEGIN
1216

17+
/// The following is a very simple way of defining per-function metrics by using the function itself as a template
18+
/// parameter, ensuring that each function receives exactly one instantiation.
19+
template <auto F>
20+
struct function_performance_metrics
21+
{
22+
/// Number of times the function was called
23+
static std::atomic<std::uint64_t> num_calls;
24+
25+
/// Total wall-clock time elapsed inside the function, in nanoseconds
26+
static std::atomic<std::uint64_t> elapsed_ns;
27+
};
28+
29+
template <auto F>
30+
/*static*/ std::atomic<std::uint64_t> function_performance_metrics<F>::num_calls(0);
31+
template <auto F>
32+
/*static*/ std::atomic<std::uint64_t> function_performance_metrics<F>::elapsed_ns(0);
33+
34+
/// @brief Scoped helper that increments a call counter on construction and adds the elapsed wall-clock nanoseconds to a running total on destruction.
35+
struct scoped_function_timer
36+
{
37+
explicit scoped_function_timer(std::atomic<std::uint64_t> &num_calls, std::atomic<std::uint64_t> &total) :
38+
total(total)
39+
{
40+
++num_calls;
41+
}
42+
43+
~scoped_function_timer()
44+
{
45+
const auto elapsed = timer.elapsed_nanoseconds();
46+
HPX_ASSERT(elapsed >= 0);
47+
if (elapsed > 0)
48+
{
49+
total += static_cast<std::uint64_t>(elapsed);
50+
}
51+
}
52+
53+
std::atomic<std::uint64_t> &total;
54+
hpx::chrono::high_resolution_timer timer;
55+
};
56+
57+
/// @brief Time the execution of the enclosing scope from the current point to the end of that scope.
58+
/// @param local_function The function key that we're collecting performance information for. Usually the enclosing
59+
/// function.
60+
#define GPRAT_TIME_FUNCTION(local_function) \
61+
scoped_function_timer _gprat_fn_timer(function_performance_metrics<local_function>::num_calls, \
62+
function_performance_metrics<local_function>::elapsed_ns)
63+
64+
template <auto F>
65+
std::uint64_t get_and_reset_function_elapsed(bool reset)
66+
{
67+
return hpx::util::get_and_reset_value(function_performance_metrics<F>::elapsed_ns, reset);
68+
}
69+
1370
void track_tile_data_allocation(std::size_t size);
1471
void track_tile_data_deallocation(std::size_t size);
1572

core/src/cpu/adapter_cblas_fp32.cpp

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
11
#include "gprat/cpu/adapter_cblas_fp32.hpp"
22

3+
#include "gprat/performance_counters.hpp"
4+
5+
#ifdef HPX_HAVE_MODULE_PERFORMANCE_COUNTERS
6+
#include <hpx/performance_counters/manage_counter_type.hpp>
7+
#endif
8+
39
#ifdef GPRAT_ENABLE_MKL
410
// MKL CBLAS and LAPACKE
511
#include "mkl_cblas.h"
@@ -15,6 +21,7 @@ GPRAT_NS_BEGIN
1521

1622
mutable_tile_data<float> potrf(const mutable_tile_data<float> &A, const int N)
1723
{
24+
GPRAT_TIME_FUNCTION(&potrf);
1825
// POTRF: in-place Cholesky decomposition of A
1926
// use spotrf2 recursive version for better stability
2027
LAPACKE_spotrf2(LAPACK_ROW_MAJOR, 'L', N, A.data(), N);
@@ -29,8 +36,8 @@ trsm(const const_tile_data<float> &L,
2936
const int M,
3037
const BLAS_TRANSPOSE transpose_L,
3138
const BLAS_SIDE side_L)
32-
3339
{
40+
GPRAT_TIME_FUNCTION(&trsm);
3441
// TRSM constants
3542
const float alpha = 1.0;
3643
// TRSM: in-place solve L(^T) * X = A or X * L(^T) = A where L lower triangular
@@ -52,6 +59,7 @@ trsm(const const_tile_data<float> &L,
5259

5360
mutable_tile_data<float> syrk(const mutable_tile_data<float> &A, const const_tile_data<float> &B, const int N)
5461
{
62+
GPRAT_TIME_FUNCTION(&syrk);
5563
// SYRK constants
5664
const float alpha = -1.0;
5765
const float beta = 1.0;
@@ -71,6 +79,7 @@ gemm(const const_tile_data<float> &A,
7179
const BLAS_TRANSPOSE transpose_A,
7280
const BLAS_TRANSPOSE transpose_B)
7381
{
82+
GPRAT_TIME_FUNCTION(&gemm);
7483
// GEMM constants
7584
const float alpha = -1.0;
7685
const float beta = 1.0;
@@ -99,6 +108,7 @@ gemm(const const_tile_data<float> &A,
99108
mutable_tile_data<float>
100109
trsv(const const_tile_data<float> &L, const mutable_tile_data<float> &a, const int N, const BLAS_TRANSPOSE transpose_L)
101110
{
111+
GPRAT_TIME_FUNCTION(&trsv);
102112
// TRSV: In-place solve L(^T) * x = a where L lower triangular
103113
cblas_strsv(CblasRowMajor,
104114
CblasLower,
@@ -122,6 +132,7 @@ gemv(const const_tile_data<float> &A,
122132
const BLAS_ALPHA alpha,
123133
const BLAS_TRANSPOSE transpose_A)
124134
{
135+
GPRAT_TIME_FUNCTION(&gemv);
125136
// GEMV constants
126137
// const float alpha = -1.0;
127138
const float beta = 1.0;
@@ -146,6 +157,7 @@ gemv(const const_tile_data<float> &A,
146157
mutable_tile_data<float>
147158
dot_diag_syrk(const const_tile_data<float> &A, const mutable_tile_data<float> &r, const int N, const int M)
148159
{
160+
GPRAT_TIME_FUNCTION(&dot_diag_syrk);
149161
auto r_p = r.data();
150162
auto A_p = A.data();
151163
// r = r + diag(A^T * A)
@@ -164,6 +176,7 @@ dot_diag_gemm(const const_tile_data<float> &A,
164176
const int N,
165177
const int M)
166178
{
179+
GPRAT_TIME_FUNCTION(&dot_diag_gemm);
167180
auto r_p = r.data();
168181
auto A_p = A.data();
169182
auto B_p = B.data();
@@ -179,14 +192,46 @@ dot_diag_gemm(const const_tile_data<float> &A,
179192

180193
mutable_tile_data<float> axpy(const mutable_tile_data<float> &y, const const_tile_data<float> &x, const int N)
181194
{
195+
GPRAT_TIME_FUNCTION(&axpy);
182196
cblas_saxpy(N, -1.0, x.data(), 1, y.data(), 1);
183197
return y;
184198
}
185199

186200
float dot(std::span<const float> a, std::span<const float> b, const int N)
187201
{
202+
GPRAT_TIME_FUNCTION(&dot);
188203
// DOT: a * b
189204
return cblas_sdot(N, a.data(), 1, b.data(), 1);
190205
}
191206

207+
#ifdef HPX_HAVE_MODULE_PERFORMANCE_COUNTERS
208+
namespace detail
209+
{
210+
void register_fp32_performance_counters()
211+
{
212+
// XXX: you can do this with templates, but it's quite a bit more complicated
213+
#define GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR(name, fn_expr) \
214+
hpx::performance_counters::install_counter_type( \
215+
name, \
216+
get_and_reset_function_elapsed<fn_expr>, \
217+
#fn_expr, \
218+
"", \
219+
hpx::performance_counters::counter_type::monotonically_increasing)
220+
221+
GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/potrf32/time", &potrf);
222+
GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/trsm32/time", &trsm);
223+
GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/syrk32/time", &syrk);
224+
GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/gemm32/time", &gemm);
225+
GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/trsv32/time", &trsv);
226+
GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/gemv32/time", &gemv);
227+
GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/dot_diag_syrk32/time", &dot_diag_syrk);
228+
GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/dot_diag_gemm32/time", &dot_diag_gemm);
229+
GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/axpy32/time", &axpy);
230+
GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/dot32/time", &dot);
231+
232+
#undef GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR
233+
}
234+
} // namespace detail
235+
#endif
236+
192237
GPRAT_NS_END

core/src/cpu/adapter_cblas_fp64.cpp

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
11
#include "gprat/cpu/adapter_cblas_fp64.hpp"
22

3+
#include "gprat/performance_counters.hpp"
4+
5+
#ifdef HPX_HAVE_MODULE_PERFORMANCE_COUNTERS
6+
#include <hpx/performance_counters/manage_counter_type.hpp>
7+
#endif
8+
39
#ifdef GPRAT_ENABLE_MKL
410
// MKL CBLAS and LAPACKE
511
#include "mkl_cblas.h"
@@ -15,6 +21,7 @@ GPRAT_NS_BEGIN
1521

1622
mutable_tile_data<double> potrf(const mutable_tile_data<double> &A, const int N)
1723
{
24+
GPRAT_TIME_FUNCTION(&potrf);
1825
// POTRF: in-place Cholesky decomposition of A
1926
// use dpotrf2 recursive version for better stability
2027
LAPACKE_dpotrf2(LAPACK_ROW_MAJOR, 'L', N, A.data(), N);
@@ -29,8 +36,8 @@ trsm(const const_tile_data<double> &L,
2936
const int M,
3037
const BLAS_TRANSPOSE transpose_L,
3138
const BLAS_SIDE side_L)
32-
3339
{
40+
GPRAT_TIME_FUNCTION(&trsm);
3441
// TRSM constants
3542
const double alpha = 1.0;
3643
// TRSM: in-place solve L(^T) * X = A or X * L(^T) = A where L lower triangular
@@ -53,6 +60,7 @@ trsm(const const_tile_data<double> &L,
5360

5461
mutable_tile_data<double> syrk(const mutable_tile_data<double> &A, const const_tile_data<double> &B, const int N)
5562
{
63+
GPRAT_TIME_FUNCTION(&syrk);
5664
// SYRK constants
5765
const double alpha = -1.0;
5866
const double beta = 1.0;
@@ -72,6 +80,7 @@ gemm(const const_tile_data<double> &A,
7280
const BLAS_TRANSPOSE transpose_A,
7381
const BLAS_TRANSPOSE transpose_B)
7482
{
83+
GPRAT_TIME_FUNCTION(&gemm);
7584
// GEMM constants
7685
const double alpha = -1.0;
7786
const double beta = 1.0;
@@ -100,6 +109,7 @@ gemm(const const_tile_data<double> &A,
100109
mutable_tile_data<double> trsv(
101110
const const_tile_data<double> &L, const mutable_tile_data<double> &a, const int N, const BLAS_TRANSPOSE transpose_L)
102111
{
112+
GPRAT_TIME_FUNCTION(&trsv);
103113
// TRSV: In-place solve L(^T) * x = a where L lower triangular
104114
cblas_dtrsv(CblasRowMajor,
105115
CblasLower,
@@ -123,6 +133,7 @@ gemv(const const_tile_data<double> &A,
123133
const BLAS_ALPHA alpha,
124134
const BLAS_TRANSPOSE transpose_A)
125135
{
136+
GPRAT_TIME_FUNCTION(&gemv);
126137
// GEMV constants
127138
// const double alpha = -1.0;
128139
const double beta = 1.0;
@@ -147,6 +158,7 @@ gemv(const const_tile_data<double> &A,
147158
mutable_tile_data<double>
148159
dot_diag_syrk(const const_tile_data<double> &A, const mutable_tile_data<double> &r, const int N, const int M)
149160
{
161+
GPRAT_TIME_FUNCTION(&dot_diag_syrk);
150162
auto r_p = r.data();
151163
auto A_p = A.data();
152164
// r = r + diag(A^T * A)
@@ -165,6 +177,7 @@ dot_diag_gemm(const const_tile_data<double> &A,
165177
const int N,
166178
const int M)
167179
{
180+
GPRAT_TIME_FUNCTION(&dot_diag_gemm);
168181
auto r_p = r.data();
169182
auto A_p = A.data();
170183
auto B_p = B.data();
@@ -180,14 +193,46 @@ dot_diag_gemm(const const_tile_data<double> &A,
180193

181194
mutable_tile_data<double> axpy(const mutable_tile_data<double> &y, const const_tile_data<double> &x, const int N)
182195
{
196+
GPRAT_TIME_FUNCTION(&axpy);
183197
cblas_daxpy(N, -1.0, x.data(), 1, y.data(), 1);
184198
return y;
185199
}
186200

187201
double dot(std::span<const double> a, std::span<const double> b, const int N)
188202
{
203+
GPRAT_TIME_FUNCTION(&dot);
189204
// DOT: a * b
190205
return cblas_ddot(N, a.data(), 1, b.data(), 1);
191206
}
192207

208+
#ifdef HPX_HAVE_MODULE_PERFORMANCE_COUNTERS
209+
namespace detail
210+
{
211+
void register_fp64_performance_counters()
212+
{
213+
// XXX: you can do this with templates, but it's quite a bit more complicated
214+
#define GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR(name, fn_expr) \
215+
hpx::performance_counters::install_counter_type( \
216+
name, \
217+
get_and_reset_function_elapsed<fn_expr>, \
218+
#fn_expr, \
219+
"", \
220+
hpx::performance_counters::counter_type::monotonically_increasing)
221+
222+
GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/potrf64/time", &potrf);
223+
GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/trsm64/time", &trsm);
224+
GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/syrk64/time", &syrk);
225+
GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/gemm64/time", &gemm);
226+
GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/trsv64/time", &trsv);
227+
GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/gemv64/time", &gemv);
228+
GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/dot_diag_syrk64/time", &dot_diag_syrk);
229+
GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/dot_diag_gemm64/time", &dot_diag_gemm);
230+
GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/axpy64/time", &axpy);
231+
GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/dot64/time", &dot);
232+
233+
#undef GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR
234+
}
235+
} // namespace detail
236+
#endif
237+
193238
GPRAT_NS_END

0 commit comments

Comments
 (0)