Skip to content

Commit c50c12b

Browse files
committed
feat: add profiling tracepoints to CPU kernel implementations (Part 4)
Instrument key CPU kernel entry points in src/cpu/kernels/* with tracepoints to enable lightweight runtime profiling. These tracepoints leverage the ACL_PROFILE macros and form the basis for collecting execution timing and behavior metrics. This is the first step in integrating end-to-end profiling support.

Partially Resolves: COMPMID-8330
Signed-off-by: Walid Ben Romdhane <[email protected]>
Change-Id: Ic51d5f0275f7ac473fc8548df2d20d7bb85f23fa
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/14776
Tested-by: Arm Jenkins <[email protected]>
Benchmark: Arm Jenkins <[email protected]>
Reviewed-by: Andreas Flöjt <[email protected]>
Comments-Addressed: Arm Jenkins <[email protected]>
Reviewed-by: Dennis Wildmark <[email protected]>
1 parent 98713e2 commit c50c12b

File tree

49 files changed

+224
-85
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

49 files changed

+224
-85
lines changed

src/cpu/kernels/floor/neon/fp16.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2020-2021 Arm Limited.
2+
* Copyright (c) 2020-2021, 2025 Arm Limited.
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -23,6 +23,7 @@
2323
*/
2424
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
2525

26+
#include "src/common/utils/profile/acl_profile.h"
2627
#include "src/common/utils/Validate.h"
2728
#include "src/core/NEON/NEMath.h"
2829

@@ -38,6 +39,7 @@ constexpr int step = 8;
3839

3940
void fp16_neon_floor(const void *src, void *dst, int len)
4041
{
42+
ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "fp16_neon_floor");
4143
ARM_COMPUTE_ASSERT_NOT_NULLPTR(src);
4244
ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst);
4345
ARM_COMPUTE_ASSERT(len >= 0);

src/cpu/kernels/floor/neon/fp32.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2020-2021 Arm Limited.
2+
* Copyright (c) 2020-2021, 2025 Arm Limited.
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -21,6 +21,7 @@
2121
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
2222
* SOFTWARE.
2323
*/
24+
#include "src/common/utils/profile/acl_profile.h"
2425
#include "src/common/utils/Validate.h"
2526
#include "src/core/NEON/NEMath.h"
2627

@@ -36,6 +37,7 @@ constexpr int step = 4;
3637

3738
void fp32_neon_floor(const void *src, void *dst, int len)
3839
{
40+
ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "fp32_neon_floor");
3941
ARM_COMPUTE_ASSERT_NOT_NULLPTR(src);
4042
ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst);
4143
ARM_COMPUTE_ASSERT(len >= 0);

src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2021-2023 Arm Limited.
2+
* Copyright (c) 2021-2023, 2025 Arm Limited.
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -23,6 +23,7 @@
2323
*/
2424
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
2525

26+
#include "src/common/utils/profile/acl_profile.h"
2627
#include "src/cpu/kernels/fuse_batch_normalization/generic/impl.h"
2728

2829
namespace arm_compute
@@ -40,8 +41,9 @@ void fused_batch_normalization_conv_f16(const ITensor *conv_weights,
4041
float epsilon,
4142
const Window &window)
4243
{
43-
return fused_batch_normalization_conv<float16_t>(conv_weights, conv_bias, fused_weights, fused_bias, bn_mean,
44-
bn_var, bn_beta, bn_gamma, epsilon, window);
44+
ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "fused_batch_normalization_conv_f16");
45+
fused_batch_normalization_conv<float16_t>(conv_weights, conv_bias, fused_weights, fused_bias, bn_mean, bn_var,
46+
bn_beta, bn_gamma, epsilon, window);
4547
}
4648

4749
void fused_batch_normalization_dwc_nchw_f16(const ITensor *dwc_weights,
@@ -55,8 +57,10 @@ void fused_batch_normalization_dwc_nchw_f16(const ITensor *dwc_weights,
5557
float epsilon,
5658
const Window &window)
5759
{
58-
return fused_batch_normalization_dwc_nchw<float16_t>(dwc_weights, dwc_bias, fused_weights, fused_bias, bn_mean,
59-
bn_var, bn_beta, bn_gamma, epsilon, window);
60+
ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU,
61+
"fused_batch_normalization_dwc_nchw_f16");
62+
fused_batch_normalization_dwc_nchw<float16_t>(dwc_weights, dwc_bias, fused_weights, fused_bias, bn_mean, bn_var,
63+
bn_beta, bn_gamma, epsilon, window);
6064
}
6165
} // namespace cpu
6266
} // namespace arm_compute

src/cpu/kernels/fuse_batch_normalization/generic/fp32.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2021-2022 Arm Limited.
2+
* Copyright (c) 2021-2022, 2025 Arm Limited.
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -22,6 +22,7 @@
2222
* SOFTWARE.
2323
*/
2424

25+
#include "src/common/utils/profile/acl_profile.h"
2526
#include "src/cpu/kernels/fuse_batch_normalization/generic/impl.h"
2627

2728
namespace arm_compute
@@ -39,8 +40,9 @@ void fused_batch_normalization_conv_f32(const ITensor *conv_weights,
3940
float epsilon,
4041
const Window &window)
4142
{
42-
return fused_batch_normalization_conv<float32_t>(conv_weights, conv_bias, fused_weights, fused_bias, bn_mean,
43-
bn_var, bn_beta, bn_gamma, epsilon, window);
43+
ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "fused_batch_normalization_conv_f32");
44+
fused_batch_normalization_conv<float32_t>(conv_weights, conv_bias, fused_weights, fused_bias, bn_mean, bn_var,
45+
bn_beta, bn_gamma, epsilon, window);
4446
}
4547
} // namespace cpu
4648
} // namespace arm_compute

src/cpu/kernels/fuse_batch_normalization/nchw/all.cpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2018-2023 Arm Limited.
2+
* Copyright (c) 2018-2023, 2025 Arm Limited.
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -22,6 +22,7 @@
2222
* SOFTWARE.
2323
*/
2424

25+
#include "src/common/utils/profile/acl_profile.h"
2526
#include "src/cpu/kernels/fuse_batch_normalization/generic/impl.h"
2627

2728
namespace arm_compute
@@ -39,8 +40,10 @@ void fused_batch_normalization_dwc_nchw_f32(const ITensor *dwc_weights,
3940
float epsilon,
4041
const Window &window)
4142
{
42-
return fused_batch_normalization_dwc_nchw<float32_t>(dwc_weights, dwc_bias, fused_weights, fused_bias, bn_mean,
43-
bn_var, bn_beta, bn_gamma, epsilon, window);
43+
ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU,
44+
"fused_batch_normalization_dwc_nchw_f32");
45+
fused_batch_normalization_dwc_nchw<float32_t>(dwc_weights, dwc_bias, fused_weights, fused_bias, bn_mean, bn_var,
46+
bn_beta, bn_gamma, epsilon, window);
4447
}
4548

4649
} // namespace cpu

src/cpu/kernels/fuse_batch_normalization/nchw/neon/fp16.cpp

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2023 Arm Limited.
2+
* Copyright (c) 2023, 2025 Arm Limited.
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -26,6 +26,7 @@
2626
#include "arm_compute/core/ITensor.h"
2727
#include "arm_compute/core/TensorInfo.h"
2828

29+
#include "src/common/utils/profile/acl_profile.h"
2930
#include "src/core/CPP/Validate.h"
3031
#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
3132
#include "src/core/NEON/wrapper/wrapper.h"
@@ -46,6 +47,8 @@ void fp16_batch_normalization_nchw_non_fused(const Window &window,
4647
float epsilon,
4748
ActivationLayerInfo act_info)
4849
{
50+
ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU,
51+
"fp16_batch_normalization_nchw_non_fused");
4952
batch_normalization_nchw<float16_t, false, detail::dummy<float16_t, 8>>(window, input, output, mean, var, beta,
5053
gamma, epsilon, act_info);
5154
}
@@ -60,6 +63,8 @@ void fp16_batch_normalization_nchw_non_fused_relu(const Window &window,
6063
float epsilon,
6164
ActivationLayerInfo act_info)
6265
{
66+
ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU,
67+
"fp16_batch_normalization_nchw_non_fused_relu");
6368
batch_normalization_nchw<float16_t, true, detail::relu<float16_t, 8>>(window, input, output, mean, var, beta, gamma,
6469
epsilon, act_info);
6570
}
@@ -74,6 +79,8 @@ void fp16_batch_normalization_nchw_non_fused_brelu(const Window &window,
7479
float epsilon,
7580
ActivationLayerInfo act_info)
7681
{
82+
ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU,
83+
"fp16_batch_normalization_nchw_non_fused_brelu");
7784
batch_normalization_nchw<float16_t, true, detail::brelu<float16_t, 8>>(window, input, output, mean, var, beta,
7885
gamma, epsilon, act_info);
7986
}
@@ -88,6 +95,8 @@ void fp16_batch_normalization_nchw_non_fused_lubrelu(const Window &window,
8895
float epsilon,
8996
ActivationLayerInfo act_info)
9097
{
98+
ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU,
99+
"fp16_batch_normalization_nchw_non_fused_lubrelu");
91100
batch_normalization_nchw<float16_t, true, detail::lubrelu<float16_t, 8>>(window, input, output, mean, var, beta,
92101
gamma, epsilon, act_info);
93102
}

src/cpu/kernels/fuse_batch_normalization/nchw/neon/fp32.cpp

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2023 Arm Limited.
2+
* Copyright (c) 2023, 2025 Arm Limited.
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -24,6 +24,7 @@
2424
#include "arm_compute/core/ITensor.h"
2525
#include "arm_compute/core/TensorInfo.h"
2626

27+
#include "src/common/utils/profile/acl_profile.h"
2728
#include "src/core/CPP/Validate.h"
2829
#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
2930
#include "src/core/NEON/wrapper/wrapper.h"
@@ -44,6 +45,8 @@ void fp32_batch_normalization_nchw_non_fused(const Window &window,
4445
float epsilon,
4546
ActivationLayerInfo act_info)
4647
{
48+
ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU,
49+
"fp32_batch_normalization_nchw_non_fused");
4750
batch_normalization_nchw<float, false, detail::dummy<float, 4>>(window, input, output, mean, var, beta, gamma,
4851
epsilon, act_info);
4952
}
@@ -58,6 +61,8 @@ void fp32_batch_normalization_nchw_non_fused_relu(const Window &window,
5861
float epsilon,
5962
ActivationLayerInfo act_info)
6063
{
64+
ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU,
65+
"fp32_batch_normalization_nchw_non_fused_relu");
6166
batch_normalization_nchw<float, true, detail::relu<float, 4>>(window, input, output, mean, var, beta, gamma,
6267
epsilon, act_info);
6368
}
@@ -72,6 +77,8 @@ void fp32_batch_normalization_nchw_non_fused_brelu(const Window &window,
7277
float epsilon,
7378
ActivationLayerInfo act_info)
7479
{
80+
ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU,
81+
"fp32_batch_normalization_nchw_non_fused_brelu");
7582
batch_normalization_nchw<float, true, detail::brelu<float, 4>>(window, input, output, mean, var, beta, gamma,
7683
epsilon, act_info);
7784
}
@@ -86,6 +93,8 @@ void fp32_batch_normalization_nchw_non_fused_lubrelu(const Window &window,
8693
float epsilon,
8794
ActivationLayerInfo act_info)
8895
{
96+
ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU,
97+
"fp32_batch_normalization_nchw_non_fused_lubrelu");
8998
batch_normalization_nchw<float, true, detail::lubrelu<float, 4>>(window, input, output, mean, var, beta, gamma,
9099
epsilon, act_info);
91100
}

src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2021-2022 Arm Limited.
2+
* Copyright (c) 2021-2022, 2025 Arm Limited.
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -23,6 +23,7 @@
2323
*/
2424
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
2525

26+
#include "src/common/utils/profile/acl_profile.h"
2627
#include "src/cpu/kernels/fuse_batch_normalization/generic/impl.h"
2728
#include "src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.h"
2829

@@ -41,8 +42,10 @@ void fused_batch_normalization_dwc_nhwc_f16(const ITensor *dwc_weights,
4142
float epsilon,
4243
const Window &window)
4344
{
44-
return fused_batch_normalization_dwc_nhwc<float16_t>(dwc_weights, dwc_bias, fused_weights, fused_bias, bn_mean,
45-
bn_var, bn_beta, bn_gamma, epsilon, window);
45+
ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU,
46+
"fused_batch_normalization_dwc_nhwc_f16");
47+
fused_batch_normalization_dwc_nhwc<float16_t>(dwc_weights, dwc_bias, fused_weights, fused_bias, bn_mean, bn_var,
48+
bn_beta, bn_gamma, epsilon, window);
4649
}
4750

4851
} // namespace cpu

src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2021-2022 Arm Limited.
2+
* Copyright (c) 2021-2022, 2025 Arm Limited.
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -22,6 +22,7 @@
2222
* SOFTWARE.
2323
*/
2424

25+
#include "src/common/utils/profile/acl_profile.h"
2526
#include "src/cpu/kernels/fuse_batch_normalization/generic/impl.h"
2627
#include "src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.h"
2728

@@ -40,8 +41,10 @@ void fused_batch_normalization_dwc_nhwc_f32(const ITensor *dwc_weights,
4041
float epsilon,
4142
const Window &window)
4243
{
43-
return fused_batch_normalization_dwc_nhwc<float32_t>(dwc_weights, dwc_bias, fused_weights, fused_bias, bn_mean,
44-
bn_var, bn_beta, bn_gamma, epsilon, window);
44+
ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU,
45+
"fused_batch_normalization_dwc_nhwc_f32");
46+
fused_batch_normalization_dwc_nhwc<float32_t>(dwc_weights, dwc_bias, fused_weights, fused_bias, bn_mean, bn_var,
47+
bn_beta, bn_gamma, epsilon, window);
4548
}
4649

4750
} // namespace cpu

src/cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2022-2023 Arm Limited.
2+
* Copyright (c) 2022-2023, 2025 Arm Limited.
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -23,6 +23,7 @@
2323
*/
2424
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
2525

26+
#include "src/common/utils/profile/acl_profile.h"
2627
#include "src/cpu/kernels/gemm_matrix_add/generic/neon/impl.h"
2728

2829
#include <arm_neon.h>
@@ -78,7 +79,8 @@ void matrix_addition_f16(const ITensor *src, ITensor *dst, const Window &window,
7879
} // namespace
7980
void neon_fp16_gemm_matrix_add(const ITensor *src, ITensor *dst, const Window &window, float beta)
8081
{
81-
return matrix_addition_f16(src, dst, window, beta);
82+
ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "neon_fp16_gemm_matrix_add");
83+
matrix_addition_f16(src, dst, window, beta);
8284
}
8385
} // namespace cpu
8486
} // namespace arm_compute

0 commit comments

Comments
 (0)