Skip to content

Commit 98713e2

Browse files
committed
feat: add profiling tracepoints to CPU kernel implementations (Part 3)
Instrument key CPU kernel entry points in src/cpu/kernels/* with tracepoints to enable lightweight runtime profiling. These tracepoints leverage the ACL_PROFILE macros and form the basis for collecting execution timing and behavior metrics. This is the first step in integrating end-to-end profiling support.

Partially Resolves: COMPMID-8330
Signed-off-by: Walid Ben Romdhane <[email protected]>
Change-Id: Ib711ec988ce847082f5b962737dadae5c126dcbe
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/14853
Benchmark: Arm Jenkins <[email protected]>
Reviewed-by: Andreas Flöjt <[email protected]>
Tested-by: Arm Jenkins <[email protected]>
Comments-Addressed: Arm Jenkins <[email protected]>
Reviewed-by: Dennis Wildmark <[email protected]>
1 parent 7c78c40 commit 98713e2

File tree

46 files changed

+245
-97
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+245
-97
lines changed

src/cpu/kernels/cast/generic/neon/fp16.cpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2016-2024 Arm Limited.
2+
* Copyright (c) 2016-2025 Arm Limited.
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -26,6 +26,7 @@
2626
#include "arm_compute/core/CPP/CPPTypes.h"
2727
#include "arm_compute/core/TensorInfo.h"
2828

29+
#include "src/common/utils/profile/acl_profile.h"
2930
#include "src/cpu/kernels/cast/list.h"
3031
#include "src/cpu/kernels/CpuCastKernel.h"
3132
#include "support/SaturateCast.h"
@@ -39,6 +40,7 @@ namespace cpu
3940
void neon_qasymm8_signed_to_fp16_cast(
4041
const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
4142
{
43+
ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "neon_qasymm8_signed_to_fp16_cast");
4244
ARM_COMPUTE_UNUSED(info);
4345
ARM_COMPUTE_UNUSED(_policy);
4446

@@ -85,6 +87,7 @@ void neon_qasymm8_signed_to_fp16_cast(
8587
void neon_s32_to_fp16_cast(
8688
const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
8789
{
90+
ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "neon_s32_to_fp16_cast");
8891
ARM_COMPUTE_UNUSED(info);
8992
ARM_COMPUTE_UNUSED(_policy);
9093

@@ -133,6 +136,7 @@ void neon_s32_to_fp16_cast(
133136
void neon_fp32_to_fp16_cast(
134137
const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
135138
{
139+
ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "neon_fp32_to_fp16_cast");
136140
ARM_COMPUTE_UNUSED(info);
137141
ARM_COMPUTE_UNUSED(_policy);
138142

@@ -180,6 +184,7 @@ void neon_fp32_to_fp16_cast(
180184
void neon_fp16_to_other_dt_cast(
181185
const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
182186
{
187+
ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "neon_fp16_to_other_dt_cast");
183188
ARM_COMPUTE_UNUSED(info);
184189
ARM_COMPUTE_UNUSED(_policy);
185190

@@ -329,6 +334,7 @@ void neon_fp16_to_other_dt_cast(
329334
void neon_u8_to_fp16_cast(
330335
const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
331336
{
337+
ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "neon_u8_to_fp16_cast");
332338
ARM_COMPUTE_UNUSED(info);
333339
ARM_COMPUTE_UNUSED(_policy);
334340

@@ -372,7 +378,6 @@ void neon_u8_to_fp16_cast(
372378
}
373379
},
374380
src, dst);
375-
return;
376381
}
377382

378383
} // namespace cpu

src/cpu/kernels/conv3d/generic/neon/fp16.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2024 Arm Limited.
2+
* Copyright (c) 2024-2025 Arm Limited.
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -22,6 +22,7 @@
2222
* SOFTWARE.
2323
*/
2424

25+
#include "src/common/utils/profile/acl_profile.h"
2526
#include "src/cpu/kernels/conv3d/generic/neon/float_impl.h"
2627

2728
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
@@ -39,6 +40,7 @@ void directconv3d_fp16_neon_ndhwc(const ITensor *src0,
3940
const Conv3dInfo &conv_info,
4041
const Window &window)
4142
{
43+
ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "directconv3d_fp16_neon_ndhwc");
4244
directconv3d_float_neon_ndhwc<float16_t>(src0, src1, src2, dst, conv_info, window);
4345
}
4446

src/cpu/kernels/conv3d/generic/neon/fp32.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2024 Arm Limited.
2+
* Copyright (c) 2024-2025 Arm Limited.
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -22,6 +22,7 @@
2222
* SOFTWARE.
2323
*/
2424

25+
#include "src/common/utils/profile/acl_profile.h"
2526
#include "src/cpu/kernels/conv3d/generic/neon/float_impl.h"
2627

2728
namespace arm_compute
@@ -38,6 +39,7 @@ void directconv3d_fp32_neon_ndhwc(const ITensor *src0,
3839
const Conv3dInfo &conv_info,
3940
const Window &window)
4041
{
42+
ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "directconv3d_fp32_neon_ndhwc");
4143
directconv3d_float_neon_ndhwc<float>(src0, src1, src2, dst, conv_info, window);
4244
}
4345

src/cpu/kernels/conv3d/generic/neon/qasymm8.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2024 Arm Limited.
2+
* Copyright (c) 2024-2025 Arm Limited.
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -22,6 +22,7 @@
2222
* SOFTWARE.
2323
*/
2424

25+
#include "src/common/utils/profile/acl_profile.h"
2526
#include "src/cpu/kernels/conv3d/generic/neon/quantized_impl.h"
2627

2728
namespace arm_compute
@@ -38,6 +39,7 @@ void directconv3d_qu8_neon_ndhwc(const ITensor *src0,
3839
const Conv3dInfo &conv_info,
3940
const Window &window)
4041
{
42+
ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "directconv3d_qu8_neon_ndhwc");
4143
directconv3d_quantized_neon_ndhwc<uint8_t>(src0, src1, src2, dst, conv_info, window);
4244
}
4345

src/cpu/kernels/conv3d/generic/neon/qasymm8_signed.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2024 Arm Limited.
2+
* Copyright (c) 2024-2025 Arm Limited.
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -22,6 +22,7 @@
2222
* SOFTWARE.
2323
*/
2424

25+
#include "src/common/utils/profile/acl_profile.h"
2526
#include "src/cpu/kernels/conv3d/generic/neon/quantized_impl.h"
2627

2728
namespace arm_compute
@@ -38,6 +39,7 @@ void directconv3d_qs8_neon_ndhwc(const ITensor *src0,
3839
const Conv3dInfo &conv_info,
3940
const Window &window)
4041
{
42+
ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "directconv3d_qs8_neon_ndhwc");
4143
directconv3d_quantized_neon_ndhwc<int8_t>(src0, src1, src2, dst, conv_info, window);
4244
}
4345

src/cpu/kernels/crop/generic/neon/fp16.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2021 Arm Limited.
2+
* Copyright (c) 2021, 2025 Arm Limited.
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -23,6 +23,7 @@
2323
*/
2424
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
2525

26+
#include "src/common/utils/profile/acl_profile.h"
2627
#include "src/cpu/kernels/crop/generic/neon/impl.h"
2728

2829
namespace arm_compute
@@ -39,8 +40,9 @@ void fp16_in_bounds_crop_window(const ITensor *input,
3940
bool input_has_single_channel,
4041
bool is_width_flipped)
4142
{
42-
return in_bounds_crop_window<float16_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
43-
output_width_limit, input_has_single_channel, is_width_flipped);
43+
ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "fp16_in_bounds_crop_window");
44+
in_bounds_crop_window<float16_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
45+
output_width_limit, input_has_single_channel, is_width_flipped);
4446
}
4547
} // namespace cpu
4648
} // namespace arm_compute

src/cpu/kernels/crop/generic/neon/fp32.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2021 Arm Limited.
2+
* Copyright (c) 2021, 2025 Arm Limited.
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -22,6 +22,7 @@
2222
* SOFTWARE.
2323
*/
2424

25+
#include "src/common/utils/profile/acl_profile.h"
2526
#include "src/cpu/kernels/crop/generic/neon/impl.h"
2627

2728
namespace arm_compute
@@ -38,8 +39,9 @@ void fp32_in_bounds_crop_window(const ITensor *input,
3839
bool input_has_single_channel,
3940
bool is_width_flipped)
4041
{
41-
return in_bounds_crop_window<float32_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
42-
output_width_limit, input_has_single_channel, is_width_flipped);
42+
ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "fp32_in_bounds_crop_window");
43+
in_bounds_crop_window<float32_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
44+
output_width_limit, input_has_single_channel, is_width_flipped);
4345
}
4446
} // namespace cpu
4547
} // namespace arm_compute

src/cpu/kernels/crop/generic/neon/integer.cpp

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2021-2022 Arm Limited.
2+
* Copyright (c) 2021-2022, 2025 Arm Limited.
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -22,6 +22,7 @@
2222
* SOFTWARE.
2323
*/
2424

25+
#include "src/common/utils/profile/acl_profile.h"
2526
#include "src/cpu/kernels/crop/generic/neon/impl.h"
2627
#include "src/cpu/kernels/crop/list.h"
2728

@@ -39,8 +40,9 @@ void u8_in_bounds_crop_window(const ITensor *input,
3940
bool input_has_single_channel,
4041
bool is_width_flipped)
4142
{
42-
return in_bounds_crop_window<uint8_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
43-
output_width_limit, input_has_single_channel, is_width_flipped);
43+
ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "u8_in_bounds_crop_window");
44+
in_bounds_crop_window<uint8_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
45+
output_width_limit, input_has_single_channel, is_width_flipped);
4446
}
4547

4648
void u16_in_bounds_crop_window(const ITensor *input,
@@ -53,8 +55,9 @@ void u16_in_bounds_crop_window(const ITensor *input,
5355
bool input_has_single_channel,
5456
bool is_width_flipped)
5557
{
56-
return in_bounds_crop_window<uint16_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
57-
output_width_limit, input_has_single_channel, is_width_flipped);
58+
ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "u16_in_bounds_crop_window");
59+
in_bounds_crop_window<uint16_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
60+
output_width_limit, input_has_single_channel, is_width_flipped);
5861
}
5962

6063
void u32_in_bounds_crop_window(const ITensor *input,
@@ -67,8 +70,9 @@ void u32_in_bounds_crop_window(const ITensor *input,
6770
bool input_has_single_channel,
6871
bool is_width_flipped)
6972
{
70-
return in_bounds_crop_window<uint32_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
71-
output_width_limit, input_has_single_channel, is_width_flipped);
73+
ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "u32_in_bounds_crop_window");
74+
in_bounds_crop_window<uint32_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
75+
output_width_limit, input_has_single_channel, is_width_flipped);
7276
}
7377

7478
void s8_in_bounds_crop_window(const ITensor *input,
@@ -81,8 +85,9 @@ void s8_in_bounds_crop_window(const ITensor *input,
8185
bool input_has_single_channel,
8286
bool is_width_flipped)
8387
{
84-
return in_bounds_crop_window<int8_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
85-
output_width_limit, input_has_single_channel, is_width_flipped);
88+
ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "s8_in_bounds_crop_window");
89+
in_bounds_crop_window<int8_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
90+
output_width_limit, input_has_single_channel, is_width_flipped);
8691
}
8792

8893
void s16_in_bounds_crop_window(const ITensor *input,
@@ -95,8 +100,9 @@ void s16_in_bounds_crop_window(const ITensor *input,
95100
bool input_has_single_channel,
96101
bool is_width_flipped)
97102
{
98-
return in_bounds_crop_window<int16_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
99-
output_width_limit, input_has_single_channel, is_width_flipped);
103+
ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "s16_in_bounds_crop_window");
104+
in_bounds_crop_window<int16_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
105+
output_width_limit, input_has_single_channel, is_width_flipped);
100106
}
101107

102108
void s32_in_bounds_crop_window(const ITensor *input,
@@ -109,8 +115,9 @@ void s32_in_bounds_crop_window(const ITensor *input,
109115
bool input_has_single_channel,
110116
bool is_width_flipped)
111117
{
112-
return in_bounds_crop_window<int32_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
113-
output_width_limit, input_has_single_channel, is_width_flipped);
118+
ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "s32_in_bounds_crop_window");
119+
in_bounds_crop_window<int32_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
120+
output_width_limit, input_has_single_channel, is_width_flipped);
114121
}
115122
} // namespace cpu
116123
} // namespace arm_compute

src/cpu/kernels/depth_to_space/nchw/any/impl.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2023 Arm Limited.
2+
* Copyright (c) 2023, 2025 Arm Limited.
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -24,6 +24,8 @@
2424

2525
#include "arm_compute/core/Error.h"
2626

27+
#include "src/common/utils/profile/acl_profile.h"
28+
2729
#include <cstdint>
2830
#include <cstring>
2931

@@ -41,6 +43,7 @@ void depth_to_space_nchw_any( //
4143
uintptr_t element_size,
4244
uintptr_t block_size)
4345
{
46+
ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "depth_to_space_nchw_any");
4447
ARM_COMPUTE_ERROR_ON(src_strides[0] != element_size);
4548
ARM_COMPUTE_ERROR_ON(dst_strides[0] != element_size);
4649

src/cpu/kernels/depth_to_space/nhwc/any/impl.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2023 Arm Limited.
2+
* Copyright (c) 2023, 2025 Arm Limited.
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -24,6 +24,8 @@
2424

2525
#include "arm_compute/core/Error.h"
2626

27+
#include "src/common/utils/profile/acl_profile.h"
28+
2729
#include <cstdint>
2830
#include <cstring>
2931

@@ -41,6 +43,7 @@ void depth_to_space_nhwc_any( //
4143
uintptr_t element_size,
4244
uintptr_t block_size)
4345
{
46+
ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "depth_to_space_nhwc_any");
4447
ARM_COMPUTE_ERROR_ON(src_strides[0] != element_size);
4548
ARM_COMPUTE_ERROR_ON(dst_strides[0] != element_size);
4649

0 commit comments

Comments
 (0)