2222namespace onert ::backend::cpu::ops
2323{
2424
25- void DepthwiseConvolutionLayer::prepareF32 ()
26- {
27- if (_dilationWidth != 1 || _dilationHeight != 1 || _strideWidth != _strideHeight)
28- return ;
29-
30- // DepthwiseConvOp cpu kernel needs additional memory to perform with multi-
31- // threads. So, we allocate it here and pass it to the kernel.
32- const int64_t k_packet_size = nnfw::cker::eigen_support::kPacketSize <float >();
33-
34- const auto out_shape = getShape (_output);
35- const auto filter_shape = getShape (_kernel);
36- const int batch = out_shape.Dims (0 );
37- const int out_depth = out_shape.Dims (3 );
38- const int filter_rows = filter_shape.Dims (1 );
39- const int filter_cols = filter_shape.Dims (2 );
40-
41- const int filter_spatial_size = filter_rows * filter_cols;
42- const int padded_filter_inner_dim_size =
43- ((out_depth + k_packet_size - 1 ) / k_packet_size) * k_packet_size;
44-
45- _use_padded_filter = (out_depth % k_packet_size) == 0 ? false : true ;
46-
47- // prepare padded_filter buffer for cker
48- auto padded_filter_info = ir::OperandInfo (_kernel->get_info ());
49- padded_filter_info.shape ({batch, filter_spatial_size, padded_filter_inner_dim_size});
50- _padded_filter = std::make_unique<Tensor>(padded_filter_info, nullptr );
51- _padded_filter->setBuffer (std::make_shared<basic::Allocator>(_padded_filter->total_size ()));
52-
53- // prepare out_bprop and in_bprop buffer for cker
54- const int thread_count = nnfw::cker::eigen_support::getThreadCount () + 1 ;
55-
56- auto filter_buffers_info = ir::OperandInfo (_kernel->get_info ());
57- filter_buffers_info.shape ({thread_count, filter_spatial_size, padded_filter_inner_dim_size});
58- _filter_buffers = std::make_unique<Tensor>(filter_buffers_info, nullptr );
59- _filter_buffers->setBuffer (std::make_shared<basic::Allocator>(_filter_buffers->total_size ()));
60- }
61-
// NOTE(review): this span is a rendered unified diff, not compilable C++ —
// old/new line numbers and '-'/'+' markers are fused into each line and a
// '@@' hunk header elides the middle of the function body (the op_params
// setup is not visible here). Restore the plain source from version control.
//
// What the visible hunk shows: convFloat32() previously dispatched between
// the multithreaded cker::DepthwiseConvOp (taken when there is no dilation
// and W/H strides are equal, using the _padded_filter/_filter_buffers
// scratch tensors) and the generic cker::DepthwiseConv. The '+' side drops
// that dispatch and always calls cker::DepthwiseConv with the ruy context,
// keeping the DepthwiseConvOp call shape only as a TODO comment.
6225void DepthwiseConvolutionLayer::convFloat32 ()
6326{
6427 float output_activation_min = 0 , output_activation_max = 0 ;
@@ -75,23 +38,24 @@ void DepthwiseConvolutionLayer::convFloat32()
7538 op_params.float_activation_min = output_activation_min;
7639 op_params.float_activation_max = output_activation_max;
7740
78- // Since DepthwiseConvOp does not support dilation and different W/H stride yet,
79- // it uses the existing kernel in this case.
80- if (_dilationWidth == 1 && _dilationHeight == 1 && _strideWidth == _strideHeight)
81- {
82- nnfw::cker::DepthwiseConvOp (op_params, getShape (_input), getBuffer<float >(_input),
83- getShape (_kernel), getBuffer<float >(_kernel), getShape (_bias),
84- getBuffer<float >(_bias), getBuffer<float >(_padded_filter.get ()),
85- _use_padded_filter, getBuffer<float >(_filter_buffers.get ()),
86- getShape (_output), getBuffer<float >(_output));
87- }
88- else
89- {
90- nnfw::cker::DepthwiseConv<float , float >(
91- op_params, getShape (_input), getBuffer<float >(_input), getShape (_kernel),
92- getBuffer<float >(_kernel), getShape (_bias), getBuffer<float >(_bias), getShape (_output),
93- getBuffer<float >(_output), _external_context->ruy_context ());
94- }
41+ // TODO: Use the following call if TensorBuilder manages padded_filter_data
42+ // and filter_buffers_data:
43+ //
44+ // void DepthwiseConvOp(
45+ // const DepthwiseConvParams &params,
46+ // const Shape &input_shape, const float *input_data,
47+ // const Shape &filter_shape, const float *filter_data,
48+ // const Shape &bias_shape, const float *bias_data,
49+ // float *padded_filter_data, bool pad_filter,
50+ // float *filter_buffers_data,
51+ // const Shape &output_shape, float *output_data
52+ // );
53+ //
54+ // See https://github.com/Samsung/ONE/pull/13669 for an example of using DepthwiseConvOp
55+ nnfw::cker::DepthwiseConv<float , float >(
56+ op_params, getShape (_input), getBuffer<float >(_input), getShape (_kernel),
57+ getBuffer<float >(_kernel), getShape (_bias), getBuffer<float >(_bias), getShape (_output),
58+ getBuffer<float >(_output), _external_context->ruy_context ());
9559}
9660
9761void DepthwiseConvolutionLayer::convQ8uPerTensor ()
@@ -309,10 +273,6 @@ void DepthwiseConvolutionLayer::configure(
309273 prepareQ8iHybridPerChannel ();
310274 _prepared = true ;
311275 }
312- else if (_input->data_type () == OperandType::FLOAT32)
313- {
314- prepareF32 ();
315- }
316276 else if (_input->data_type () == OperandType::QUANT_INT8_ASYMM)
317277 {
318278 if (_kernel->is_constant () && !_input->is_dynamic () && !_output->is_dynamic ())
0 commit comments