Skip to content

Commit 197b6ea

Browse files
committed
revert: "fix: Fill the padding area with zeros in CpuIm2ColKernel"
This reverts commit 179518a. Reason for revert: Failing tests. Change-Id: I49eb0afb8fa6b5d1d7c720900209ff21e5e6b27a Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/14423 Reviewed-by: Gunes Bayir <[email protected]> Tested-by: Gunes Bayir <[email protected]>
1 parent 179518a commit 197b6ea

File tree

8 files changed

+168
-439
lines changed

8 files changed

+168
-439
lines changed

arm_compute/core/utils/misc/ShapeCalculator.h

Lines changed: 11 additions & 11 deletions
Original file line number · Diff line number · Diff line change
@@ -575,14 +575,14 @@ inline TensorShape compute_deconvolution_output_shape(const std::pair<unsigned i
575575

576576
/** Calculate the im2col output shape of a tensor
577577
*
578-
* @param[in] input Input tensor info
579-
* @param[in] kernel_dims The kernel dimensions (width and height).
580-
* @param[in] conv_info Contains padding and stride information
581-
* @param[in] has_bias In case biases are provided expands the matrix with 1
582-
* @param[in] dilation Dilation, in elements, across x and y
583-
* @param[in] batch_size_on_z True if batch size is on z axis
584-
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution
585-
* @param[in] channel_pad_right (Optional) Amount of padding applied to the channel dimension
578+
* @param[in] input Input tensor info
579+
* @param[in] kernel_dims The kernel dimensions (width and height).
580+
* @param[in] conv_info Contains padding and stride information
581+
* @param[in] has_bias In case biases are provided expands the matrix with 1
582+
* @param[in] dilation Dilation, in elements, across x and y
583+
* @param[in] batch_size_on_z True if batch size is on z axis
584+
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution
585+
* @param[in] input_pad_right (Optional) When fast-math is selected, per element padding for the im2col matrix may be necessary
586586
*
587587
* @return the calculated shape
588588
*/
@@ -592,8 +592,8 @@ inline TensorShape compute_im2col_conv_shape(const ITensorInfo *input,
592592
bool has_bias,
593593
const Size2D &dilation,
594594
bool batch_size_on_z,
595-
unsigned int num_groups = 1,
596-
unsigned int channel_pad_right = 0)
595+
unsigned int num_groups = 1,
596+
unsigned int input_pad_right = 0)
597597
{
598598
// The output shape will be the 3D shape [ out_channels * kernel_area, num_elems_per_out_channel, batches ] if batch_size_on_z == true
599599
// or the 4D shape [ out_channels * kernel_area / num_groups, num_elems_per_out_channel, num_groups, batches ] if batch_size_on_z == false
@@ -611,7 +611,7 @@ inline TensorShape compute_im2col_conv_shape(const ITensorInfo *input,
611611

612612
std::pair<unsigned int, unsigned int> out_dims = scaled_dimensions(
613613
output_shape[width_idx], output_shape[height_idx], kernel_dims.width, kernel_dims.height, conv_info, dilation);
614-
output_shape.set(0, ((output_shape[channel_idx] + channel_pad_right) / num_groups * kernel_dims.area() +
614+
output_shape.set(0, ((output_shape[channel_idx] + input_pad_right) / num_groups * kernel_dims.area() +
615615
(has_bias ? 1 : 0))); // NOLINT
616616
output_shape.set(1, (out_dims.first * out_dims.second));
617617
if (batch_size_on_z && output_shape.num_dimensions() >= 3)

src/cpu/kernels/CpuIm2ColKernel.cpp

Lines changed: 48 additions & 49 deletions
Large diffs are not rendered by default.

src/cpu/kernels/CpuIm2ColKernel.h

Lines changed: 20 additions & 20 deletions
Original file line number · Diff line number · Diff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2017-2023, 2025 Arm Limited.
2+
* Copyright (c) 2017-2023 Arm Limited.
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -67,26 +67,26 @@ class CpuIm2ColKernel : public ICpuKernel<CpuIm2ColKernel>
6767
ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuIm2ColKernel);
6868
/** Set the input and output of the kernel.
6969
*
70-
* @param[in] src The input tensor info to convert. 3 lower dimensions represent a single input [width, height, IFM],
71-
* while every optional dimension from 4 and above represent a batch of inputs.
72-
* Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32
73-
* Note: QASYMM8/QASYMM8_SIGNED works only for has_bias = false
74-
* @param[out] dst The output tensor info. Data types supported: Same as @p input
75-
* @param[in] kernel_dims The kernel dimensions (width and height).
76-
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
77-
* @param[in] has_bias In case biases are provided expands the matrix with 1.
78-
* @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
79-
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
80-
* @param[in] channel_pad_right (Optional) Amount of padding applied to the channel dimension.
70+
* @param[in] src The input tensor info to convert. 3 lower dimensions represent a single input [width, height, IFM],
71+
* while every optional dimension from 4 and above represent a batch of inputs.
72+
* Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32
73+
* Note: QASYMM8/QASYMM8_SIGNED works only for has_bias = false
74+
* @param[out] dst The output tensor info. Data types supported: Same as @p input
75+
* @param[in] kernel_dims The kernel dimensions (width and height).
76+
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
77+
* @param[in] has_bias In case biases are provided expands the matrix with 1.
78+
* @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
79+
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
80+
* @param[in] input_pad_right (Optional) When fast-math is selected, per element padding for the im2col matrix may be necessary
8181
*/
8282
void configure(const ITensorInfo *src,
8383
ITensorInfo *dst,
8484
const Size2D &kernel_dims,
8585
const PadStrideInfo &conv_info,
8686
bool has_bias,
87-
const Size2D &dilation = Size2D(1U, 1U),
88-
unsigned int num_groups = 1,
89-
unsigned int channel_pad_right = 0);
87+
const Size2D &dilation = Size2D(1U, 1U),
88+
unsigned int num_groups = 1,
89+
unsigned int input_pad_right = 0);
9090
/** Static function to check if given info will lead to a valid configuration
9191
*
9292
* Similar to CpuIm2ColKernel::configure()
@@ -98,9 +98,9 @@ class CpuIm2ColKernel : public ICpuKernel<CpuIm2ColKernel>
9898
const Size2D &kernel_dims,
9999
const PadStrideInfo &conv_info,
100100
bool has_bias,
101-
const Size2D &dilation = Size2D(1U, 1U),
102-
unsigned int num_groups = 1,
103-
unsigned int channel_pad_right = 0);
101+
const Size2D &dilation = Size2D(1U, 1U),
102+
unsigned int num_groups = 1,
103+
unsigned int input_pad_right = 0);
104104

105105
// Inherited methods overridden:
106106
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
@@ -127,15 +127,15 @@ class CpuIm2ColKernel : public ICpuKernel<CpuIm2ColKernel>
127127
std::pair<unsigned int, unsigned int> convolved_dims,
128128
const Size2D &kernel_dims,
129129
const Size2D &dilation,
130-
uint32_t channel_pad_right,
130+
uint32_t input_pad_right,
131131
bool has_bias);
132132

133133
Im2ColFunctionPtr _func{nullptr};
134134
std::pair<unsigned int, unsigned int> _convolved_dims{};
135135
PadStrideInfo _conv_info{};
136136
unsigned int _kernel_width{0};
137137
unsigned int _kernel_height{0};
138-
unsigned int _channel_pad_right{0};
138+
unsigned int _input_pad_right{0};
139139
bool _has_bias{false};
140140
Size2D _dilation{1U, 1U};
141141
DataLayout _data_layout{DataLayout::UNKNOWN};

src/cpu/kernels/directconv2d/impl.h

Lines changed: 15 additions & 19 deletions
Original file line number · Diff line number · Diff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2022-2023, 2025 Arm Limited.
2+
* Copyright (c) 2022-2023 Arm Limited.
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -61,13 +61,13 @@ void linearize_volume_nchw(const uint8_t *const in_ptr,
6161
int dilation_x,
6262
int dilation_y)
6363
{
64-
const int kernel_area = kernel_width * kernel_height;
65-
const int x_e = top_left_x + kernel_width * dilation_x;
66-
const int y_e = top_left_y + kernel_height * dilation_y;
64+
const int kernel_size2 = kernel_width * kernel_height;
65+
const int x_e = top_left_x + kernel_width * dilation_x;
66+
const int y_e = top_left_y + kernel_height * dilation_y;
6767

6868
// Linearize volume
6969
int d = 0;
70-
// This for loop linearizes a volume with 3 slices. This allows:
70+
// This for loop linearize a volume with 3 slices. This allows:
7171
// 1) to reduce the iterations of the outer for loop "d"
7272
// 2) to have an optimized im2col for the first convolution layer where usually we have 3 IFMs
7373
for (; d <= (kernel_depth - 3); d += 3)
@@ -79,9 +79,9 @@ void linearize_volume_nchw(const uint8_t *const in_ptr,
7979
// All the values will be the offset (will be zeros when not quantized)
8080
for (int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
8181
{
82-
*(out_ptr + 0 * kernel_area) = pad_value;
83-
*(out_ptr + 1 * kernel_area) = pad_value;
84-
*(out_ptr + 2 * kernel_area) = pad_value;
82+
*(out_ptr + 0 * kernel_size2) = pad_value;
83+
*(out_ptr + 1 * kernel_size2) = pad_value;
84+
*(out_ptr + 2 * kernel_size2) = pad_value;
8585
}
8686
}
8787
else
@@ -90,23 +90,23 @@ void linearize_volume_nchw(const uint8_t *const in_ptr,
9090
{
9191
if ((x < 0 || x >= input_w) && has_pads)
9292
{
93-
*(out_ptr + 0 * kernel_area) = pad_value;
94-
*(out_ptr + 1 * kernel_area) = pad_value;
95-
*(out_ptr + 2 * kernel_area) = pad_value;
93+
*(out_ptr + 0 * kernel_size2) = pad_value;
94+
*(out_ptr + 1 * kernel_size2) = pad_value;
95+
*(out_ptr + 2 * kernel_size2) = pad_value;
9696
}
9797
else
9898
{
99-
*(out_ptr + 0 * kernel_area) = *(reinterpret_cast<const T *>(
99+
*(out_ptr + 0 * kernel_size2) = *(reinterpret_cast<const T *>(
100100
in_ptr + ((d + 0) * input_stride_z + y * input_stride_y + x * input_stride_x)));
101-
*(out_ptr + 1 * kernel_area) = *(reinterpret_cast<const T *>(
101+
*(out_ptr + 1 * kernel_size2) = *(reinterpret_cast<const T *>(
102102
in_ptr + ((d + 1) * input_stride_z + y * input_stride_y + x * input_stride_x)));
103-
*(out_ptr + 2 * kernel_area) = *(reinterpret_cast<const T *>(
103+
*(out_ptr + 2 * kernel_size2) = *(reinterpret_cast<const T *>(
104104
in_ptr + ((d + 2) * input_stride_z + y * input_stride_y + x * input_stride_x)));
105105
}
106106
}
107107
}
108108
}
109-
out_ptr += 2 * kernel_area;
109+
out_ptr += 2 * kernel_size2;
110110
}
111111

112112
// Left over
@@ -252,7 +252,6 @@ void linearize_volume_nhwc(const uint8_t *const in_ptr,
252252
for (int e = 0; e < kernel_width; e++)
253253
{
254254
memcpy(out_ptr, reinterpret_cast<const T *>(offset_ptr + e * channel_chunk_size), channel_chunk_size);
255-
memset(static_cast<void *>(out_ptr + input_c), pad_value, pad_right * element_size);
256255
out_ptr += input_c + pad_right;
257256
}
258257
}
@@ -279,7 +278,6 @@ void linearize_volume_nhwc(const uint8_t *const in_ptr,
279278
{
280279
memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)),
281280
channel_chunk_size);
282-
memset(static_cast<void *>(out_ptr + input_c), pad_value, pad_right * element_size);
283281
out_ptr += input_c + pad_right;
284282
}
285283
}
@@ -291,7 +289,6 @@ void linearize_volume_nhwc(const uint8_t *const in_ptr,
291289
{
292290
memcpy(out_ptr, reinterpret_cast<const T *>(offset_ptr + e * channel_chunk_size),
293291
channel_chunk_size);
294-
memset(static_cast<void *>(out_ptr + input_c), pad_value, pad_right * element_size);
295292
out_ptr += input_c + pad_right;
296293
}
297294
}
@@ -362,7 +359,6 @@ void run_im2col(const ITensor *src,
362359
// Linearize volume
363360
if (is_nchw)
364361
{
365-
ARM_COMPUTE_ERROR_ON(input_pad_right > 0);
366362
linearize_volume_nchw<T, has_pads>(
367363
input_ptr, output_ptr, has_bias, start_w, start_h, kernel_width, kernel_height, input_c, input_w,
368364
input_h, input_stride_x, input_stride_y, input_stride_z, pad_value, dilation.x(), dilation.y());

0 commit comments

Comments (0)