ARM-software
diff --git a/src/cpu/operators/CpuActivation.cpp b/src/cpu/operators/CpuActivation.cpp
Lines changed: 5 additions & 1 deletion
diff --git a/src/cpu/operators/CpuAdd.cpp b/src/cpu/operators/CpuAdd.cpp
Lines changed: 5 additions & 2 deletions
diff --git a/src/cpu/operators/CpuAddMulAdd.cpp b/src/cpu/operators/CpuAddMulAdd.cpp
Lines changed: 5 additions & 1 deletion
diff --git a/src/cpu/operators/CpuCast.cpp b/src/cpu/operators/CpuCast.cpp
Lines changed: 4 additions & 1 deletion
diff --git a/src/cpu/operators/CpuConcatenate.cpp b/src/cpu/operators/CpuConcatenate.cpp
Lines changed: 5 additions & 1 deletion
diff --git a/src/cpu/operators/CpuConv2d.cpp b/src/cpu/operators/CpuConv2d.cpp
Lines changed: 5 additions & 1 deletion
diff --git a/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp b/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp
Lines changed: 7 additions & 1 deletion
diff --git a/src/cpu/operators/CpuCopy.cpp b/src/cpu/operators/CpuCopy.cpp
Lines changed: 4 additions & 1 deletion
diff --git a/src/cpu/operators/CpuDepthwiseConv2d.cpp b/src/cpu/operators/CpuDepthwiseConv2d.cpp
Lines changed: 9 additions & 1 deletion
diff --git a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
Lines changed: 8 additions & 1 deletion
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2022, 2025 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,6 +28,7 @@
 #include "src/common/IOperator.h"
 #include "src/common/utils/LegacySupport.h"
 #include "src/common/utils/Log.h"
+#include "src/common/utils/profile/acl_profile.h"
 #include "src/cpu/CpuContext.h"
 #include "src/cpu/kernels/CpuActivationKernel.h"
 
@@ -37,6 +38,7 @@ namespace cpu
 {
 void CpuActivation::configure(const ITensorInfo *input, ITensorInfo *output, const ActivationLayerInfo &activation_info)
 {
+    ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuActivation::configure");
     ARM_COMPUTE_LOG_PARAMS(input, output, activation_info);
     auto k = std::make_unique<kernels::CpuActivationKernel>();
     k->configure(input, output, activation_info);
@@ -46,11 +48,13 @@ void CpuActivation::configure(const ITensorInfo *input, ITensorInfo *output, con
 Status
 CpuActivation::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &activation_info)
 {
+    ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuActivation::validate");
     return kernels::CpuActivationKernel::validate(input, output, activation_info);
 }
 
 void CpuActivation::run(ITensorPack &tensors)
 {
+    ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuActivation::run");
     ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
     auto split_dimension = static_cast<kernels::CpuActivationKernel *>(_kernel.get())->get_split_dimension_hint();
     NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors);
 
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2022, 2025 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,8 +26,8 @@
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
 #include "src/common/utils/Log.h"
+#include "src/common/utils/profile/acl_profile.h"
 #include "src/cpu/kernels/CpuAddKernel.h"
-
 namespace arm_compute
 {
 namespace cpu
@@ -38,6 +38,7 @@ void CpuAdd::configure(const ITensorInfo         *src0,
                        ConvertPolicy              policy,
                        const ActivationLayerInfo &act_info)
 {
+    ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuAdd::configure");
     ARM_COMPUTE_UNUSED(act_info);
     ARM_COMPUTE_LOG_PARAMS(src0, src1, dst, policy, act_info);
     auto k = std::make_unique<kernels::CpuAddKernel>();
@@ -51,12 +52,14 @@ Status CpuAdd::validate(const ITensorInfo         *src0,
                         ConvertPolicy              policy,
                         const ActivationLayerInfo &act_info)
 {
+    ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuAdd::validate");
     ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
     return kernels::CpuAddKernel::validate(src0, src1, dst, policy);
 }
 
 void CpuAdd::run(ITensorPack &tensors)
 {
+    ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuAdd::run");
     const auto split_dimension = static_cast<kernels::CpuAddKernel *>(_kernel.get())->get_split_dimension();
 
     NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors);
 
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023, 2025 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,6 +27,7 @@
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
 #include "src/common/utils/Log.h"
+#include "src/common/utils/profile/acl_profile.h"
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/cpu/kernels/CpuAddMulAddKernel.h"
 #include "src/cpu/utils/CpuAuxTensorHandler.h"
@@ -44,6 +45,7 @@ void CpuAddMulAdd::configure(const ITensorInfo         *input1,
                              ConvertPolicy              policy,
                              const ActivationLayerInfo &act_info)
 {
+    ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuAddMulAdd::configure");
     ARM_COMPUTE_LOG_PARAMS(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info);
 
     auto k = std::make_unique<kernels::CpuAddMulAddKernel>();
@@ -82,6 +84,7 @@ Status CpuAddMulAdd::validate(const ITensorInfo         *input1,
                               ConvertPolicy              policy,
                               const ActivationLayerInfo &act_info)
 {
+    ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuAddMulAdd::validate");
     const DataType data_type = input1->data_type();
     if (is_data_type_quantized(data_type))
     {
@@ -103,6 +106,7 @@ Status CpuAddMulAdd::validate(const ITensorInfo         *input1,
 
 void CpuAddMulAdd::run(ITensorPack &tensors)
 {
+    ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuAddMulAdd::run");
     const DataType data_type = tensors.get_const_tensor(TensorType::ACL_SRC_0)->info()->data_type();
 
     if (is_data_type_quantized(data_type))
 
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2025 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 #include "src/cpu/operators/CpuCast.h"
 
 #include "src/common/utils/Log.h"
+#include "src/common/utils/profile/acl_profile.h"
 #include "src/cpu/kernels/CpuCastKernel.h"
 
 namespace arm_compute
@@ -32,6 +33,7 @@ namespace cpu
 {
 void CpuCast::configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy)
 {
+    ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuCast::configure");
     ARM_COMPUTE_LOG_PARAMS(src, dst, policy);
     auto k = std::make_unique<kernels::CpuCastKernel>();
     k->configure(src, dst, policy);
@@ -40,6 +42,7 @@ void CpuCast::configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy
 
 Status CpuCast::validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
 {
+    ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuCast::validate");
     return kernels::CpuCastKernel::validate(src, dst, policy);
 }
 } // namespace cpu
 
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2025 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,6 +32,7 @@
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
 #include "src/common/utils/Log.h"
+#include "src/common/utils/profile/acl_profile.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/cpu/kernels/CpuConcatenateBatchKernel.h"
 #include "src/cpu/kernels/CpuConcatenateDepthKernel.h"
@@ -44,6 +45,7 @@ namespace cpu
 {
 void CpuConcatenate::configure(const std::vector<const ITensorInfo *> &srcs_vector, ITensorInfo *dst, size_t axis)
 {
+    ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuConcatenate::configure");
     ARM_COMPUTE_ERROR_ON(dst == nullptr);
     ARM_COMPUTE_LOG_PARAMS(srcs_vector, dst, axis);
 
@@ -100,6 +102,7 @@ void CpuConcatenate::configure(const std::vector<const ITensorInfo *> &srcs_vect
 Status
 CpuConcatenate::validate(const std::vector<const ITensorInfo *> &srcs_vector, const ITensorInfo *dst, size_t axis)
 {
+    ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuConcatenate::validate");
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst);
     ARM_COMPUTE_RETURN_ERROR_ON(srcs_vector.size() < 2);
 
@@ -146,6 +149,7 @@ CpuConcatenate::validate(const std::vector<const ITensorInfo *> &srcs_vector, co
 
 void CpuConcatenate::run(ITensorPack &tensors)
 {
+    ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuConcatenate::run");
     if (tensors.empty())
     {
         ARM_COMPUTE_ERROR("No inputs provided");
 
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021, 2023-2024 Arm Limited.
+ * Copyright (c) 2017-2021, 2023-2025 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,6 +27,7 @@
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
 #include "src/common/utils/Log.h"
+#include "src/common/utils/profile/acl_profile.h"
 #include "src/cpu/operators/CpuDirectConv2d.h"
 #include "src/cpu/operators/CpuGemm.h"
 #include "src/cpu/operators/CpuGemmConv2d.h"
@@ -54,6 +55,7 @@ void CpuConv2d::configure(ITensorInfo               *input,
                           bool                       enable_fast_math,
                           unsigned int               num_groups)
 {
+    ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuConv2d::configure");
     // Perform validate step
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_UNUSED(num_groups);
@@ -114,6 +116,7 @@ Status CpuConv2d::validate(const ITensorInfo         *input,
                            bool                       enable_fast_math,
                            unsigned int               num_groups)
 {
+    ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuConv2d::validate");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1), "Grouping (num_groups != 1) is not supported on Neon");
 
     const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups);
@@ -291,6 +294,7 @@ ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo         *i
 
 void CpuConv2d::run(ITensorPack &tensors)
 {
+    ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuConv2d::run");
     prepare(tensors);
     _function->run(tensors);
 }
 
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2025 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,7 @@
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
 #include "src/common/utils/Log.h"
+#include "src/common/utils/profile/acl_profile.h"
 #include "src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h"
 
 namespace arm_compute
@@ -37,6 +38,8 @@ void CpuConvertFullyConnectedWeights::configure(const ITensorInfo *src,
                                                 const TensorShape &original_src_shape,
                                                 DataLayout         data_layout)
 {
+    ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU,
+                            "CpuConvertFullyConnectedWeights::configure");
     ARM_COMPUTE_LOG_PARAMS(src, dst, original_src_shape, data_layout);
     auto k = std::make_unique<kernels::CpuConvertFullyConnectedWeightsKernel>();
     k->configure(src, dst, original_src_shape, data_layout);
@@ -48,11 +51,14 @@ Status CpuConvertFullyConnectedWeights::validate(const ITensorInfo *src,
                                                  const TensorShape &original_src_shape,
                                                  DataLayout         data_layout)
 {
+    ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU,
+                            "CpuConvertFullyConnectedWeights::validate");
     return kernels::CpuConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout);
 }
 
 void CpuConvertFullyConnectedWeights::run(ITensorPack &tensors)
 {
+    ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuConvertFullyConnectedWeights::run");
     NEScheduler::get().schedule_op(_kernel.get(), Window::DimZ, _kernel->window(), tensors);
 }
 } // namespace cpu
 
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2025 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 #include "src/cpu/operators/CpuCopy.h"
 
 #include "src/common/utils/Log.h"
+#include "src/common/utils/profile/acl_profile.h"
 #include "src/cpu/kernels/CpuCopyKernel.h"
 
 namespace arm_compute
@@ -32,6 +33,7 @@ namespace cpu
 {
 void CpuCopy::configure(const ITensorInfo *src, ITensorInfo *dst)
 {
+    ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuCopy::configure");
     ARM_COMPUTE_LOG_PARAMS(src, dst);
     auto k = std::make_unique<kernels::CpuCopyKernel>();
     k->configure(src, dst);
@@ -40,6 +42,7 @@ void CpuCopy::configure(const ITensorInfo *src, ITensorInfo *dst)
 
 Status CpuCopy::validate(const ITensorInfo *src, const ITensorInfo *dst)
 {
+    ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuCopy::validate");
     return kernels::CpuCopyKernel::validate(src, dst);
 }
 } // namespace cpu
 
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2024 Arm Limited.
+ * Copyright (c) 2021-2025 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,6 +30,7 @@
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
 #include "src/common/utils/Log.h"
+#include "src/common/utils/profile/acl_profile.h"
 #include "src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h"
 
 namespace arm_compute
@@ -145,6 +146,8 @@ Status CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::validate(const I
                                                                          const ITensorInfo     *dst,
                                                                          const ConvolutionInfo &info)
 {
+    ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU,
+                            "CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::validate");
     return validate_arguments_optimized(src, weights, biases, dst, info);
 }
 
@@ -346,6 +349,8 @@ Status CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::validate(const ITensorInfo
                                                                const ITensorInfo     *dst,
                                                                const ConvolutionInfo &info)
 {
+    ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU,
+                            "CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::validate");
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
     if (src->data_layout() == DataLayout::NCHW)
     {
@@ -476,6 +481,7 @@ void CpuDepthwiseConv2d::configure(ITensorInfo           *src,
                                    ITensorInfo           *dst,
                                    const ConvolutionInfo &info)
 {
+    ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuDepthwiseConv2d::configure");
     ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, info);
 
     _depth_conv_func =
@@ -499,6 +505,7 @@ Status CpuDepthwiseConv2d::validate(const ITensorInfo     *src,
                                     const ITensorInfo     *dst,
                                     const ConvolutionInfo &info)
 {
+    ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuDepthwiseConv2d::validate");
     DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(src, weights, biases, dst, info);
     switch (depth_conv_func)
     {
@@ -531,6 +538,7 @@ DepthwiseConvolutionFunction CpuDepthwiseConv2d::get_depthwiseconvolution_functi
 
 void CpuDepthwiseConv2d::run(ITensorPack &tensors)
 {
+    ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuDepthwiseConv2d::run");
     switch (_depth_conv_func)
     {
         case DepthwiseConvolutionFunction::OPTIMIZED:
 
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024 Arm Limited.
+ * Copyright (c) 2019-2025 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,6 +28,7 @@
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
 #include "src/common/utils/Log.h"
+#include "src/common/utils/profile/acl_profile.h"
 #include "src/core/CPP/Validate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/utils/AssemblyUtils.h"
@@ -59,6 +60,8 @@ void CpuDepthwiseConv2dAssemblyDispatch::configure(const ITensorInfo     *src,
                                                    ITensorInfo           *dst,
                                                    const ConvolutionInfo &info)
 {
+    ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU,
+                            "CpuDepthwiseConv2dAssemblyDispatch::configure");
     ARM_COMPUTE_LOG_PARAMS(src, weights, bias, dst, info);
     const CPUInfo     &ci          = NEScheduler::get().cpu_info();
     const unsigned int num_threads = NEScheduler::get().num_threads();
@@ -88,6 +91,8 @@ Status CpuDepthwiseConv2dAssemblyDispatch::validate(const ITensorInfo     *src,
                                                     const ITensorInfo     *dst,
                                                     const ConvolutionInfo &info)
 {
+    ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU,
+                            "CpuDepthwiseConv2dAssemblyDispatch::validate");
     return kernels::CpuDepthwiseConv2dAssemblyWrapperKernel::validate(src, weights, bias, dst, info);
 }
 
@@ -104,6 +109,8 @@ bool CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(const Activatio
 
 void CpuDepthwiseConv2dAssemblyDispatch::run(ITensorPack &tensors)
 {
+    ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU,
+                            "CpuDepthwiseConv2dAssemblyDispatch::run");
     ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
 
     prepare(tensors);
Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,5 @@`
`1`	`1`	`/*`
`2`		`- * Copyright (c) 2021-2022 Arm Limited.`
	`2`	`+ * Copyright (c) 2021-2022, 2025 Arm Limited.`
`3`	`3`	`*`
`4`	`4`	`* SPDX-License-Identifier: MIT`
`5`	`5`	`*`
`@@ -28,6 +28,7 @@`
`28`	`28`	`#include "src/common/IOperator.h"`
`29`	`29`	`#include "src/common/utils/LegacySupport.h"`
`30`	`30`	`#include "src/common/utils/Log.h"`
	`31`	`+#include "src/common/utils/profile/acl_profile.h"`
`31`	`32`	`#include "src/cpu/CpuContext.h"`
`32`	`33`	`#include "src/cpu/kernels/CpuActivationKernel.h"`
`33`	`34`
`@@ -37,6 +38,7 @@ namespace cpu`
`37`	`38`	`{`
`38`	`39`	`void CpuActivation::configure(const ITensorInfo *input, ITensorInfo *output, const ActivationLayerInfo &activation_info)`
`39`	`40`	`{`
	`41`	`+ ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuActivation::configure");`
`40`	`42`	`ARM_COMPUTE_LOG_PARAMS(input, output, activation_info);`
`41`	`43`	`auto k = std::make_unique<kernels::CpuActivationKernel>();`
`42`	`44`	`k->configure(input, output, activation_info);`
`@@ -46,11 +48,13 @@ void CpuActivation::configure(const ITensorInfo *input, ITensorInfo *output, con`
`46`	`48`	`Status`
`47`	`49`	`CpuActivation::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &activation_info)`
`48`	`50`	`{`
	`51`	`+ ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuActivation::validate");`
`49`	`52`	`return kernels::CpuActivationKernel::validate(input, output, activation_info);`
`50`	`53`	`}`
`51`	`54`
`52`	`55`	`void CpuActivation::run(ITensorPack &tensors)`
`53`	`56`	`{`
	`57`	`+ ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuActivation::run");`
`54`	`58`	`ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");`
`55`	`59`	`auto split_dimension = static_cast<kernels::CpuActivationKernel *>(_kernel.get())->get_split_dimension_hint();`
`56`	`60`	`NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors);`
Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,5 @@`
`1`	`1`	`/*`
`2`		`- * Copyright (c) 2023 Arm Limited.`
	`2`	`+ * Copyright (c) 2023, 2025 Arm Limited.`
`3`	`3`	`*`
`4`	`4`	`* SPDX-License-Identifier: MIT`
`5`	`5`	`*`
`@@ -27,6 +27,7 @@`
`27`	`27`	`#include "arm_compute/runtime/NEON/NEScheduler.h"`
`28`	`28`
`29`	`29`	`#include "src/common/utils/Log.h"`
	`30`	`+#include "src/common/utils/profile/acl_profile.h"`
`30`	`31`	`#include "src/core/helpers/MemoryHelpers.h"`
`31`	`32`	`#include "src/cpu/kernels/CpuAddMulAddKernel.h"`
`32`	`33`	`#include "src/cpu/utils/CpuAuxTensorHandler.h"`
`@@ -44,6 +45,7 @@ void CpuAddMulAdd::configure(const ITensorInfo *input1,`
`44`	`45`	`ConvertPolicy policy,`
`45`	`46`	`const ActivationLayerInfo &act_info)`
`46`	`47`	`{`
	`48`	`+ ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuAddMulAdd::configure");`
`47`	`49`	`ARM_COMPUTE_LOG_PARAMS(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info);`
`48`	`50`
`49`	`51`	`auto k = std::make_unique<kernels::CpuAddMulAddKernel>();`
`@@ -82,6 +84,7 @@ Status CpuAddMulAdd::validate(const ITensorInfo *input1,`
`82`	`84`	`ConvertPolicy policy,`
`83`	`85`	`const ActivationLayerInfo &act_info)`
`84`	`86`	`{`
	`87`	`+ ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuAddMulAdd::validate");`
`85`	`88`	`const DataType data_type = input1->data_type();`
`86`	`89`	`if (is_data_type_quantized(data_type))`
`87`	`90`	`{`
`@@ -103,6 +106,7 @@ Status CpuAddMulAdd::validate(const ITensorInfo *input1,`
`103`	`106`
`104`	`107`	`void CpuAddMulAdd::run(ITensorPack &tensors)`
`105`	`108`	`{`
	`109`	`+ ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuAddMulAdd::run");`
`106`	`110`	`const DataType data_type = tensors.get_const_tensor(TensorType::ACL_SRC_0)->info()->data_type();`
`107`	`111`
`108`	`112`	`if (is_data_type_quantized(data_type))`
Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,5 @@`
`1`	`1`	`/*`
`2`		`- * Copyright (c) 2021 Arm Limited.`
	`2`	`+ * Copyright (c) 2021, 2025 Arm Limited.`
`3`	`3`	`*`
`4`	`4`	`* SPDX-License-Identifier: MIT`
`5`	`5`	`*`
`@@ -24,6 +24,7 @@`
`24`	`24`	`#include "src/cpu/operators/CpuCast.h"`
`25`	`25`
`26`	`26`	`#include "src/common/utils/Log.h"`
	`27`	`+#include "src/common/utils/profile/acl_profile.h"`
`27`	`28`	`#include "src/cpu/kernels/CpuCastKernel.h"`
`28`	`29`
`29`	`30`	`namespace arm_compute`
`@@ -32,6 +33,7 @@ namespace cpu`
`32`	`33`	`{`
`33`	`34`	`void CpuCast::configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy)`
`34`	`35`	`{`
	`36`	`+ ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuCast::configure");`
`35`	`37`	`ARM_COMPUTE_LOG_PARAMS(src, dst, policy);`
`36`	`38`	`auto k = std::make_unique<kernels::CpuCastKernel>();`
`37`	`39`	`k->configure(src, dst, policy);`
`@@ -40,6 +42,7 @@ void CpuCast::configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy`
`40`	`42`
`41`	`43`	`Status CpuCast::validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)`
`42`	`44`	`{`
	`45`	`+ ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuCast::validate");`
`43`	`46`	`return kernels::CpuCastKernel::validate(src, dst, policy);`
`44`	`47`	`}`
`45`	`48`	`} // namespace cpu`
Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,5 @@`
`1`	`1`	`/*`
`2`		`- * Copyright (c) 2018-2021 Arm Limited.`
	`2`	`+ * Copyright (c) 2018-2021, 2025 Arm Limited.`
`3`	`3`	`*`
`4`	`4`	`* SPDX-License-Identifier: MIT`
`5`	`5`	`*`
`@@ -32,6 +32,7 @@`
`32`	`32`	`#include "arm_compute/runtime/NEON/NEScheduler.h"`
`33`	`33`
`34`	`34`	`#include "src/common/utils/Log.h"`
	`35`	`+#include "src/common/utils/profile/acl_profile.h"`
`35`	`36`	`#include "src/core/helpers/AutoConfiguration.h"`
`36`	`37`	`#include "src/cpu/kernels/CpuConcatenateBatchKernel.h"`
`37`	`38`	`#include "src/cpu/kernels/CpuConcatenateDepthKernel.h"`
`@@ -44,6 +45,7 @@ namespace cpu`
`44`	`45`	`{`
`45`	`46`	`void CpuConcatenate::configure(const std::vector<const ITensorInfo *> &srcs_vector, ITensorInfo *dst, size_t axis)`
`46`	`47`	`{`
	`48`	`+ ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuConcatenate::configure");`
`47`	`49`	`ARM_COMPUTE_ERROR_ON(dst == nullptr);`
`48`	`50`	`ARM_COMPUTE_LOG_PARAMS(srcs_vector, dst, axis);`
`49`	`51`
`@@ -100,6 +102,7 @@ void CpuConcatenate::configure(const std::vector<const ITensorInfo *> &srcs_vect`
`100`	`102`	`Status`
`101`	`103`	`CpuConcatenate::validate(const std::vector<const ITensorInfo *> &srcs_vector, const ITensorInfo *dst, size_t axis)`
`102`	`104`	`{`
	`105`	`+ ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuConcatenate::validate");`
`103`	`106`	`ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst);`
`104`	`107`	`ARM_COMPUTE_RETURN_ERROR_ON(srcs_vector.size() < 2);`
`105`	`108`
`@@ -146,6 +149,7 @@ CpuConcatenate::validate(const std::vector<const ITensorInfo *> &srcs_vector, co`
`146`	`149`
`147`	`150`	`void CpuConcatenate::run(ITensorPack &tensors)`
`148`	`151`	`{`
	`152`	`+ ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuConcatenate::run");`
`149`	`153`	`if (tensors.empty())`
`150`	`154`	`{`
`151`	`155`	`ARM_COMPUTE_ERROR("No inputs provided");`
Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,5 @@`
`1`	`1`	`/*`
`2`		`- * Copyright (c) 2017-2021, 2023-2024 Arm Limited.`
	`2`	`+ * Copyright (c) 2017-2021, 2023-2025 Arm Limited.`
`3`	`3`	`*`
`4`	`4`	`* SPDX-License-Identifier: MIT`
`5`	`5`	`*`
`@@ -27,6 +27,7 @@`
`27`	`27`	`#include "arm_compute/runtime/NEON/NEScheduler.h"`
`28`	`28`
`29`	`29`	`#include "src/common/utils/Log.h"`
	`30`	`+#include "src/common/utils/profile/acl_profile.h"`
`30`	`31`	`#include "src/cpu/operators/CpuDirectConv2d.h"`
`31`	`32`	`#include "src/cpu/operators/CpuGemm.h"`
`32`	`33`	`#include "src/cpu/operators/CpuGemmConv2d.h"`
`@@ -54,6 +55,7 @@ void CpuConv2d::configure(ITensorInfo *input,`
`54`	`55`	`bool enable_fast_math,`
`55`	`56`	`unsigned int num_groups)`
`56`	`57`	`{`
	`58`	`+ ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuConv2d::configure");`
`57`	`59`	`// Perform validate step`
`58`	`60`	`ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);`
`59`	`61`	`ARM_COMPUTE_UNUSED(num_groups);`
`@@ -114,6 +116,7 @@ Status CpuConv2d::validate(const ITensorInfo *input,`
`114`	`116`	`bool enable_fast_math,`
`115`	`117`	`unsigned int num_groups)`
`116`	`118`	`{`
	`119`	`+ ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuConv2d::validate");`
`117`	`120`	`ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1), "Grouping (num_groups != 1) is not supported on Neon");`
`118`	`121`
`119`	`122`	`const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups);`
`@@ -291,6 +294,7 @@ ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *i`
`291`	`294`
`292`	`295`	`void CpuConv2d::run(ITensorPack &tensors)`
`293`	`296`	`{`
	`297`	`+ ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuConv2d::run");`
`294`	`298`	`prepare(tensors);`
`295`	`299`	`_function->run(tensors);`
`296`	`300`	`}`
Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,5 @@`
`1`	`1`	`/*`
`2`		`- * Copyright (c) 2021-2024 Arm Limited.`
	`2`	`+ * Copyright (c) 2021-2025 Arm Limited.`
`3`	`3`	`*`
`4`	`4`	`* SPDX-License-Identifier: MIT`
`5`	`5`	`*`
`@@ -30,6 +30,7 @@`
`30`	`30`	`#include "arm_compute/runtime/NEON/NEScheduler.h"`
`31`	`31`
`32`	`32`	`#include "src/common/utils/Log.h"`
	`33`	`+#include "src/common/utils/profile/acl_profile.h"`
`33`	`34`	`#include "src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h"`
`34`	`35`
`35`	`36`	`namespace arm_compute`
`@@ -145,6 +146,8 @@ Status CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::validate(const I`
`145`	`146`	`const ITensorInfo *dst,`
`146`	`147`	`const ConvolutionInfo &info)`
`147`	`148`	`{`
	`149`	`+ ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU,`
	`150`	`+ "CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::validate");`
`148`	`151`	`return validate_arguments_optimized(src, weights, biases, dst, info);`
`149`	`152`	`}`
`150`	`153`
`@@ -346,6 +349,8 @@ Status CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::validate(const ITensorInfo`
`346`	`349`	`const ITensorInfo *dst,`
`347`	`350`	`const ConvolutionInfo &info)`
`348`	`351`	`{`
	`352`	`+ ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU,`
	`353`	`+ "CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::validate");`
`349`	`354`	`ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);`
`350`	`355`	`if (src->data_layout() == DataLayout::NCHW)`
`351`	`356`	`{`
`@@ -476,6 +481,7 @@ void CpuDepthwiseConv2d::configure(ITensorInfo *src,`
`476`	`481`	`ITensorInfo *dst,`
`477`	`482`	`const ConvolutionInfo &info)`
`478`	`483`	`{`
	`484`	`+ ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuDepthwiseConv2d::configure");`
`479`	`485`	`ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, info);`
`480`	`486`
`481`	`487`	`_depth_conv_func =`
`@@ -499,6 +505,7 @@ Status CpuDepthwiseConv2d::validate(const ITensorInfo *src,`
`499`	`505`	`const ITensorInfo *dst,`
`500`	`506`	`const ConvolutionInfo &info)`
`501`	`507`	`{`
	`508`	`+ ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuDepthwiseConv2d::validate");`
`502`	`509`	`DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(src, weights, biases, dst, info);`
`503`	`510`	`switch (depth_conv_func)`
`504`	`511`	`{`
`@@ -531,6 +538,7 @@ DepthwiseConvolutionFunction CpuDepthwiseConv2d::get_depthwiseconvolution_functi`
`531`	`538`
`532`	`539`	`void CpuDepthwiseConv2d::run(ITensorPack &tensors)`
`533`	`540`	`{`
	`541`	`+ ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuDepthwiseConv2d::run");`
`534`	`542`	`switch (_depth_conv_func)`
`535`	`543`	`{`
`536`	`544`	`case DepthwiseConvolutionFunction::OPTIMIZED:`
Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,5 @@`
`1`	`1`	`/*`
`2`		`- * Copyright (c) 2019-2024 Arm Limited.`
	`2`	`+ * Copyright (c) 2019-2025 Arm Limited.`
`3`	`3`	`*`
`4`	`4`	`* SPDX-License-Identifier: MIT`
`5`	`5`	`*`
`@@ -28,6 +28,7 @@`
`28`	`28`	`#include "arm_compute/runtime/NEON/NEScheduler.h"`
`29`	`29`
`30`	`30`	`#include "src/common/utils/Log.h"`
	`31`	`+#include "src/common/utils/profile/acl_profile.h"`
`31`	`32`	`#include "src/core/CPP/Validate.h"`
`32`	`33`	`#include "src/core/helpers/AutoConfiguration.h"`
`33`	`34`	`#include "src/core/utils/AssemblyUtils.h"`
`@@ -59,6 +60,8 @@ void CpuDepthwiseConv2dAssemblyDispatch::configure(const ITensorInfo *src,`
`59`	`60`	`ITensorInfo *dst,`
`60`	`61`	`const ConvolutionInfo &info)`
`61`	`62`	`{`
	`63`	`+ ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU,`
	`64`	`+ "CpuDepthwiseConv2dAssemblyDispatch::configure");`
`62`	`65`	`ARM_COMPUTE_LOG_PARAMS(src, weights, bias, dst, info);`
`63`	`66`	`const CPUInfo &ci = NEScheduler::get().cpu_info();`
`64`	`67`	`const unsigned int num_threads = NEScheduler::get().num_threads();`
`@@ -88,6 +91,8 @@ Status CpuDepthwiseConv2dAssemblyDispatch::validate(const ITensorInfo *src,`
`88`	`91`	`const ITensorInfo *dst,`
`89`	`92`	`const ConvolutionInfo &info)`
`90`	`93`	`{`
	`94`	`+ ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU,`
	`95`	`+ "CpuDepthwiseConv2dAssemblyDispatch::validate");`
`91`	`96`	`return kernels::CpuDepthwiseConv2dAssemblyWrapperKernel::validate(src, weights, bias, dst, info);`
`92`	`97`	`}`
`93`	`98`
`@@ -104,6 +109,8 @@ bool CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(const Activatio`
`104`	`109`
`105`	`110`	`void CpuDepthwiseConv2dAssemblyDispatch::run(ITensorPack &tensors)`
`106`	`111`	`{`
	`112`	`+ ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU,`
	`113`	`+ "CpuDepthwiseConv2dAssemblyDispatch::run");`
`107`	`114`	`ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");`
`108`	`115`
`109`	`116`	`prepare(tensors);`