Skip to content

Commit fcd1b0b

Browse files
committed
fix: Fill the padding area with zeros in CpuIm2ColKernel
This patch fixes an issue in the CpuIm2ColKernel introduced with the following patch: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9508 with change id equal to: I350c7a1b2dcae63f8d94f5b6f1f86e948eab1f09 It also adds relevant tests and additional error messages when the channel padding feature is not supported, together with some style changes in the places the code is touched. Resolves: COMPMID-8215 Change-Id: Ifcfe3ccaf581361913cc1a58969bf9d0c6d75163 Signed-off-by: Gunes Bayir <[email protected]> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/14441 Comments-Addressed: Arm Jenkins <[email protected]> Benchmark: Arm Jenkins <[email protected]> Tested-by: Arm Jenkins <[email protected]> Reviewed-by: Syed Wajahat Abbas Naqvi <[email protected]> Reviewed-by: Yevgen Pronenko <[email protected]>
1 parent 197b6ea commit fcd1b0b

File tree

8 files changed

+441
-169
lines changed

8 files changed

+441
-169
lines changed

arm_compute/core/utils/misc/ShapeCalculator.h

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -575,14 +575,14 @@ inline TensorShape compute_deconvolution_output_shape(const std::pair<unsigned i
575575

576576
/** Calculate the im2col output shape of a tensor
577577
*
578-
* @param[in] input Input tensor info
579-
* @param[in] kernel_dims The kernel dimensions (width and height).
580-
* @param[in] conv_info Contains padding and stride information
581-
* @param[in] has_bias In case biases are provided expands the matrix with 1
582-
* @param[in] dilation Dilation, in elements, across x and y
583-
* @param[in] batch_size_on_z True if batch size is on z axis
584-
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution
585-
* @param[in] input_pad_right (Optional) When fast-math is selected, per element padding for the im2col matrix may be necessary
578+
* @param[in] input Input tensor info
579+
* @param[in] kernel_dims The kernel dimensions (width and height).
580+
* @param[in] conv_info Contains padding and stride information
581+
* @param[in] has_bias In case biases are provided expands the matrix with 1
582+
* @param[in] dilation Dilation, in elements, across x and y
583+
* @param[in] batch_size_on_z True if batch size is on z axis
584+
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution
585+
* @param[in] channel_pad_right (Optional) Amount of padding applied to the channel dimension
586586
*
587587
* @return the calculated shape
588588
*/
@@ -592,8 +592,8 @@ inline TensorShape compute_im2col_conv_shape(const ITensorInfo *input,
592592
bool has_bias,
593593
const Size2D &dilation,
594594
bool batch_size_on_z,
595-
unsigned int num_groups = 1,
596-
unsigned int input_pad_right = 0)
595+
unsigned int num_groups = 1,
596+
unsigned int channel_pad_right = 0)
597597
{
598598
// The output shape will be the 3D shape [ out_channels * kernel_area, num_elems_per_out_channel, batches ] if batch_size_on_z == true
599599
// or the 4D shape [ out_channels * kernel_area / num_groups, num_elems_per_out_channel, num_groups, batches ] if batch_size_on_z == false
@@ -611,7 +611,7 @@ inline TensorShape compute_im2col_conv_shape(const ITensorInfo *input,
611611

612612
std::pair<unsigned int, unsigned int> out_dims = scaled_dimensions(
613613
output_shape[width_idx], output_shape[height_idx], kernel_dims.width, kernel_dims.height, conv_info, dilation);
614-
output_shape.set(0, ((output_shape[channel_idx] + input_pad_right) / num_groups * kernel_dims.area() +
614+
output_shape.set(0, ((output_shape[channel_idx] + channel_pad_right) / num_groups * kernel_dims.area() +
615615
(has_bias ? 1 : 0))); // NOLINT
616616
output_shape.set(1, (out_dims.first * out_dims.second));
617617
if (batch_size_on_z && output_shape.num_dimensions() >= 3)

src/cpu/kernels/CpuIm2ColKernel.cpp

Lines changed: 49 additions & 48 deletions
Large diffs are not rendered by default.

src/cpu/kernels/CpuIm2ColKernel.h

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2017-2023 Arm Limited.
2+
* Copyright (c) 2017-2023, 2025 Arm Limited.
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -67,26 +67,26 @@ class CpuIm2ColKernel : public ICpuKernel<CpuIm2ColKernel>
6767
ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuIm2ColKernel);
6868
/** Set the input and output of the kernel.
6969
*
70-
* @param[in] src The input tensor info to convert. 3 lower dimensions represent a single input [width, height, IFM],
71-
* while every optional dimension from 4 and above represent a batch of inputs.
72-
* Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32
73-
* Note: QASYMM8/QASYMM8_SIGNED works only for has_bias = false
74-
* @param[out] dst The output tensor info. Data types supported: Same as @p input
75-
* @param[in] kernel_dims The kernel dimensions (width and height).
76-
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
77-
* @param[in] has_bias In case biases are provided expands the matrix with 1.
78-
* @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
79-
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
80-
* @param[in] input_pad_right (Optional) When fast-math is selected, per element padding for the im2col matrix may be necessary
70+
* @param[in] src The input tensor info to convert. 3 lower dimensions represent a single input [width, height, IFM],
71+
* while every optional dimension from 4 and above represent a batch of inputs.
72+
* Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32
73+
* Note: QASYMM8/QASYMM8_SIGNED works only for has_bias = false
74+
* @param[out] dst The output tensor info. Data types supported: Same as @p input
75+
* @param[in] kernel_dims The kernel dimensions (width and height).
76+
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
77+
* @param[in] has_bias In case biases are provided expands the matrix with 1.
78+
* @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
79+
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
80+
* @param[in] channel_pad_right (Optional) Amount of padding applied to the channel dimension.
8181
*/
8282
void configure(const ITensorInfo *src,
8383
ITensorInfo *dst,
8484
const Size2D &kernel_dims,
8585
const PadStrideInfo &conv_info,
8686
bool has_bias,
87-
const Size2D &dilation = Size2D(1U, 1U),
88-
unsigned int num_groups = 1,
89-
unsigned int input_pad_right = 0);
87+
const Size2D &dilation = Size2D(1U, 1U),
88+
unsigned int num_groups = 1,
89+
unsigned int channel_pad_right = 0);
9090
/** Static function to check if given info will lead to a valid configuration
9191
*
9292
* Similar to CpuIm2ColKernel::configure()
@@ -98,9 +98,9 @@ class CpuIm2ColKernel : public ICpuKernel<CpuIm2ColKernel>
9898
const Size2D &kernel_dims,
9999
const PadStrideInfo &conv_info,
100100
bool has_bias,
101-
const Size2D &dilation = Size2D(1U, 1U),
102-
unsigned int num_groups = 1,
103-
unsigned int input_pad_right = 0);
101+
const Size2D &dilation = Size2D(1U, 1U),
102+
unsigned int num_groups = 1,
103+
unsigned int channel_pad_right = 0);
104104

105105
// Inherited methods overridden:
106106
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
@@ -127,15 +127,15 @@ class CpuIm2ColKernel : public ICpuKernel<CpuIm2ColKernel>
127127
std::pair<unsigned int, unsigned int> convolved_dims,
128128
const Size2D &kernel_dims,
129129
const Size2D &dilation,
130-
uint32_t input_pad_right,
130+
uint32_t channel_pad_right,
131131
bool has_bias);
132132

133133
Im2ColFunctionPtr _func{nullptr};
134134
std::pair<unsigned int, unsigned int> _convolved_dims{};
135135
PadStrideInfo _conv_info{};
136136
unsigned int _kernel_width{0};
137137
unsigned int _kernel_height{0};
138-
unsigned int _input_pad_right{0};
138+
unsigned int _channel_pad_right{0};
139139
bool _has_bias{false};
140140
Size2D _dilation{1U, 1U};
141141
DataLayout _data_layout{DataLayout::UNKNOWN};

src/cpu/kernels/directconv2d/impl.h

Lines changed: 19 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2022-2023 Arm Limited.
2+
* Copyright (c) 2022-2023, 2025 Arm Limited.
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -61,13 +61,13 @@ void linearize_volume_nchw(const uint8_t *const in_ptr,
6161
int dilation_x,
6262
int dilation_y)
6363
{
64-
const int kernel_size2 = kernel_width * kernel_height;
65-
const int x_e = top_left_x + kernel_width * dilation_x;
66-
const int y_e = top_left_y + kernel_height * dilation_y;
64+
const int kernel_area = kernel_width * kernel_height;
65+
const int x_e = top_left_x + kernel_width * dilation_x;
66+
const int y_e = top_left_y + kernel_height * dilation_y;
6767

6868
// Linearize volume
6969
int d = 0;
70-
// This for loop linearize a volume with 3 slices. This allows:
70+
// This for loop linearizes a volume with 3 slices. This allows:
7171
// 1) to reduce the iterations of the outer for loop "d"
7272
// 2) to have an optimized im2col for the first convolution layer where usually we have 3 IFMs
7373
for (; d <= (kernel_depth - 3); d += 3)
@@ -79,9 +79,9 @@ void linearize_volume_nchw(const uint8_t *const in_ptr,
7979
// All the values will be the offset (will be zeros when not quantized)
8080
for (int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
8181
{
82-
*(out_ptr + 0 * kernel_size2) = pad_value;
83-
*(out_ptr + 1 * kernel_size2) = pad_value;
84-
*(out_ptr + 2 * kernel_size2) = pad_value;
82+
*(out_ptr + 0 * kernel_area) = pad_value;
83+
*(out_ptr + 1 * kernel_area) = pad_value;
84+
*(out_ptr + 2 * kernel_area) = pad_value;
8585
}
8686
}
8787
else
@@ -90,23 +90,23 @@ void linearize_volume_nchw(const uint8_t *const in_ptr,
9090
{
9191
if ((x < 0 || x >= input_w) && has_pads)
9292
{
93-
*(out_ptr + 0 * kernel_size2) = pad_value;
94-
*(out_ptr + 1 * kernel_size2) = pad_value;
95-
*(out_ptr + 2 * kernel_size2) = pad_value;
93+
*(out_ptr + 0 * kernel_area) = pad_value;
94+
*(out_ptr + 1 * kernel_area) = pad_value;
95+
*(out_ptr + 2 * kernel_area) = pad_value;
9696
}
9797
else
9898
{
99-
*(out_ptr + 0 * kernel_size2) = *(reinterpret_cast<const T *>(
99+
*(out_ptr + 0 * kernel_area) = *(reinterpret_cast<const T *>(
100100
in_ptr + ((d + 0) * input_stride_z + y * input_stride_y + x * input_stride_x)));
101-
*(out_ptr + 1 * kernel_size2) = *(reinterpret_cast<const T *>(
101+
*(out_ptr + 1 * kernel_area) = *(reinterpret_cast<const T *>(
102102
in_ptr + ((d + 1) * input_stride_z + y * input_stride_y + x * input_stride_x)));
103-
*(out_ptr + 2 * kernel_size2) = *(reinterpret_cast<const T *>(
103+
*(out_ptr + 2 * kernel_area) = *(reinterpret_cast<const T *>(
104104
in_ptr + ((d + 2) * input_stride_z + y * input_stride_y + x * input_stride_x)));
105105
}
106106
}
107107
}
108108
}
109-
out_ptr += 2 * kernel_size2;
109+
out_ptr += 2 * kernel_area;
110110
}
111111

112112
// Left over
@@ -252,6 +252,7 @@ void linearize_volume_nhwc(const uint8_t *const in_ptr,
252252
for (int e = 0; e < kernel_width; e++)
253253
{
254254
memcpy(out_ptr, reinterpret_cast<const T *>(offset_ptr + e * channel_chunk_size), channel_chunk_size);
255+
memset(static_cast<void *>(out_ptr + input_c), pad_value, pad_right * element_size);
255256
out_ptr += input_c + pad_right;
256257
}
257258
}
@@ -278,6 +279,7 @@ void linearize_volume_nhwc(const uint8_t *const in_ptr,
278279
{
279280
memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)),
280281
channel_chunk_size);
282+
memset(static_cast<void *>(out_ptr + input_c), pad_value, pad_right * element_size);
281283
out_ptr += input_c + pad_right;
282284
}
283285
}
@@ -289,6 +291,7 @@ void linearize_volume_nhwc(const uint8_t *const in_ptr,
289291
{
290292
memcpy(out_ptr, reinterpret_cast<const T *>(offset_ptr + e * channel_chunk_size),
291293
channel_chunk_size);
294+
memset(static_cast<void *>(out_ptr + input_c), pad_value, pad_right * element_size);
292295
out_ptr += input_c + pad_right;
293296
}
294297
}
@@ -359,6 +362,7 @@ void run_im2col(const ITensor *src,
359362
// Linearize volume
360363
if (is_nchw)
361364
{
365+
ARM_COMPUTE_ERROR_ON(input_pad_right > 0);
362366
linearize_volume_nchw<T, has_pads>(
363367
input_ptr, output_ptr, has_bias, start_w, start_h, kernel_width, kernel_height, input_c, input_w,
364368
input_h, input_stride_x, input_stride_y, input_stride_z, pad_value, dilation.x(), dilation.y());

0 commit comments

Comments (0)