@@ -14,10 +14,6 @@ class SoftmaxOpBuilder : public BaseOpBuilder {
1414 SoftmaxOpBuilder () : BaseOpBuilder(" SoftmaxOpBuilder" ) {}
1515 ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE (SoftmaxOpBuilder);
1616
17- Status IsOpSupported (QnnModelWrapper& qnn_model_wrapper,
18- const NodeUnit& node_unit,
19- const logging::Logger& logger) const override final ORT_MUST_USE_RESULT;
20-
2117 protected:
2218 Status ProcessInputs (QnnModelWrapper& qnn_model_wrapper,
2319 const NodeUnit& node_unit,
@@ -37,34 +33,25 @@ constexpr int32_t GetDefaultAxisAttribute(int opset_version) {
3733 return opset_version < 13 ? 1 : -1 ;
3834}
3935
40- Status SoftmaxOpBuilder::IsOpSupported (QnnModelWrapper& qnn_model_wrapper,
41- const NodeUnit& node_unit,
42- const logging::Logger& logger) const {
43- ORT_UNUSED_PARAMETER (logger);
44- const int opset_version = node_unit.SinceVersion ();
// Returns `input_shape` with every dimension from `axis` onward collapsed (multiplied)
// into a single trailing dimension.
//
// Example: shape=(3, 4, 5), axis=1  ->  (3, 20)
//
// Special case: when axis == 0 the result gains a leading batch dimension of size 1,
// e.g. shape=(3, 4, 5), axis=0  ->  (1, 60). This matches ONNX (Log)Softmax (opset < 13)
// "flatten from axis" semantics, which always produce a rank-2 view.
//
// \param input_shape  The original tensor shape. Not modified.
// \param axis         Flattening start axis; must satisfy 0 <= axis < rank (asserted).
// \return             The flattened shape.
std::vector<uint32_t> FlattenShapeFromAxis(const std::vector<uint32_t>& input_shape, int32_t axis) {
  assert(axis >= 0 && static_cast<size_t>(axis) < input_shape.size());

  // Keep the leading dimensions [0, axis) untouched.
  std::vector<uint32_t> output_shape(input_shape.begin(), input_shape.begin() + axis);

  if (axis == 0) {
    output_shape.push_back(1);  // Insert the implicit batch of size 1.
  }

  // Collapse dimensions [axis, rank) into one. Use an unsigned init value so the
  // accumulation is carried out in uint32_t rather than int (avoids signed overflow
  // and an implicit narrowing conversion for large shapes).
  output_shape.push_back(
      std::accumulate(input_shape.begin() + axis, input_shape.end(), uint32_t{1}, std::multiplies<uint32_t>()));

  return output_shape;
}
6653
67- static std::vector<uint32_t > GetTransposePermToUseLastAxis (uint32_t input_rank, uint32_t axis) {
54+ std::vector<uint32_t > GetTransposePermToUseLastAxis (uint32_t input_rank, uint32_t axis) {
6855 assert (axis < input_rank);
6956 std::vector<uint32_t > transpose_perm;
7057 transpose_perm.reserve (input_rank);
@@ -87,58 +74,86 @@ Status SoftmaxOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
8774 bool do_op_validation) const {
8875 const bool is_npu_backend = IsNpuBackend (qnn_model_wrapper.GetQnnBackendType ());
8976 const auto & inputs = node_unit.Inputs ();
77+ const std::string& input_name = inputs[0 ].node_arg .Name ();
9078 assert (inputs.size () == 1 );
9179
92- int32_t axis = GetDefaultAxisAttribute (node_unit.SinceVersion ());
80+ const int opset_version = node_unit.SinceVersion ();
81+ int32_t axis = GetDefaultAxisAttribute (opset_version);
9382 Qnn_Scalar_t axis_qnn_scalar = QNN_SCALAR_INIT;
9483 ORT_RETURN_IF_ERROR (ProcessAxisAttribute (qnn_model_wrapper, node_unit, axis_qnn_scalar, axis));
9584
9685 TensorInfo input_info = {};
9786 ORT_RETURN_IF_ERROR (qnn_model_wrapper.GetTensorInfo (inputs[0 ], input_info));
98- const size_t input_rank = input_info.shape .size ();
99-
100- // If the axis attribute refers to the last dimension, then process the input as normal.
101- if (!is_npu_backend || axis == static_cast <int32_t >(input_rank) - 1 ) {
102- return ProcessInput (qnn_model_wrapper, inputs[0 ], logger, input_names);
103- }
104-
105- //
106- // The axis does **not** refer to the last input dimension. Must wrap transposes around the operator to be able to use
107- // QNN's Softmax operator, which always uses an axis value that refers to the last dimension.
108- //
109-
110- std::vector<uint32_t > transpose_perm = GetTransposePermToUseLastAxis (static_cast <uint32_t >(input_rank),
111- static_cast <uint32_t >(axis));
87+ size_t input_rank = input_info.shape .size ();
88+ ORT_RETURN_IF (input_info.is_initializer , " QNN EP does not support (Log)Softmax with an initializer input, " ,
89+ " which should be optimized away by the ORT optimizer" );
11290
113- const std::string& input_name = inputs[0 ].node_arg .Name ();
114- std::string op_input_name = input_info.is_initializer ? input_name : input_name + " _ort_qnn_ep_transpose" ;
115- input_names.push_back (op_input_name);
91+ /*
92+ For Onnx Softmax with opset < 13, its behavior is to flatten the input starting from the axis, and perform
93+ softmax operation along the axis dimension, then reshape back to the original input shape.
94+ QNN EP is able to support arbitrary axis attribute by wrapping reshapes around the operator.
11695
117- std::vector< uint32_t > op_input_shape = input_info. shape ;
118- op_input_shape[input_rank - 1 ] = input_info. shape [ axis];
119- op_input_shape[axis] = input_info. shape [input_rank - 1 ];
96+ Here provides an example:
97+ Given an input with shape=(3, 4, 5) and axis=1. Its behavior is to reshape the input to (3, 20), perform softmax,
98+ and then reshape back to (3, 4, 5).
12099
121- ORT_RETURN_IF (input_info.is_initializer , " QNN EP does not support (Log)Softmax with an initializer input, " ,
122- " which should be optimized away by the ORT optimizer" );
100+ When axis equals 0, the reshape output shape includes an additional batch of size 1 as the first dimension.
101+ Here provides an example:
102+ Given an input with shape=(3, 4, 5) and axis=0. Its behavior is to reshape the input to (1, 60), perform softmax,
103+ and then reshape back to (3, 4, 5).
104+ */
105+ if (opset_version < 13 ) {
106+ std::string reshape_output_name = input_name + " _ort_qnn_ep_reshape" ;
107+ std::vector<uint32_t > reshape_output_shape = FlattenShapeFromAxis (input_info.shape , axis);
123108
124- // Input is dynamic, so add transpose node before input.
125- const bool is_graph_input = qnn_model_wrapper.IsGraphInput (input_name);
109+ // Input is dynamic, so add reshape node before input.
110+ const bool is_graph_input = qnn_model_wrapper.IsGraphInput (input_name);
126111
127- ORT_RETURN_IF_ERROR (qnn_model_wrapper.AddTransposeNode (node_unit.Index (),
128- input_name,
129- op_input_name,
112+ ORT_RETURN_IF_ERROR (qnn_model_wrapper.AddReshapeNode (input_name,
113+ reshape_output_name,
130114 input_info.shape ,
131- transpose_perm,
132- op_input_shape,
115+ reshape_output_shape,
133116 input_info.qnn_data_type ,
134117 input_info.quant_param ,
135118 do_op_validation,
136- is_graph_input));
137-
138- Qnn_TensorType_t tensor_type = qnn_model_wrapper.GetTensorType (op_input_name);
139- QnnTensorWrapper input_tensorwrapper (op_input_name, tensor_type, input_info.qnn_data_type ,
140- std::move (input_info.quant_param ), std::move (op_input_shape), {});
141- ORT_RETURN_IF_NOT (qnn_model_wrapper.AddTensorWrapper (std::move (input_tensorwrapper)), " Failed to add tensor." );
119+ is_graph_input,
120+ false ));
121+ input_names.push_back (reshape_output_name);
122+ }
123+ /*
124+ For Onnx Softmax with opset >= 13, the QNN HTP backend only supports the axis attribute that refers to the last
125+ input dimension.
126+ QNN EP is able to support arbitrary axis attribute by wrapping transposes around the operator.
127+ */
128+ else if (is_npu_backend && axis != static_cast <int32_t >(input_rank) - 1 ) {
129+ std::string transpose_output_name = input_name + " _ort_qnn_ep_transpose" ;
130+ std::vector<uint32_t > transpose_perm = GetTransposePermToUseLastAxis (static_cast <uint32_t >(input_rank),
131+ static_cast <uint32_t >(axis));
132+
133+ std::vector<uint32_t > transpose_output_shape = input_info.shape ;
134+ transpose_output_shape[input_rank - 1 ] = input_info.shape [axis];
135+ transpose_output_shape[axis] = input_info.shape [input_rank - 1 ];
136+
137+ // Input is dynamic, so add transpose node before input.
138+ const bool is_graph_input = qnn_model_wrapper.IsGraphInput (input_name);
139+
140+ ORT_RETURN_IF_ERROR (qnn_model_wrapper.AddTransposeNode (node_unit.Index (),
141+ input_name,
142+ transpose_output_name,
143+ input_info.shape ,
144+ transpose_perm,
145+ transpose_output_shape,
146+ input_info.qnn_data_type ,
147+ input_info.quant_param ,
148+ do_op_validation,
149+ is_graph_input,
150+ false ));
151+ input_names.push_back (transpose_output_name);
152+ }
153+ // Process the input as normal.
154+ else {
155+ return ProcessInput (qnn_model_wrapper, inputs[0 ], logger, input_names);
156+ }
142157
143158 return Status::OK ();
144159}
@@ -151,76 +166,107 @@ Status SoftmaxOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_
151166 const bool is_npu_backend = IsNpuBackend (qnn_model_wrapper.GetQnnBackendType ());
152167 const std::string& op_type = node_unit.OpType ();
153168 const auto & outputs = node_unit.Outputs ();
169+ const std::string& orig_output_name = outputs[0 ].node_arg .Name ();
154170 assert (outputs.size () == 1 );
155171
156- int32_t axis = GetDefaultAxisAttribute (node_unit.SinceVersion ());
172+ const int opset_version = node_unit.SinceVersion ();
173+ int32_t axis = GetDefaultAxisAttribute (opset_version);
157174 Qnn_Scalar_t axis_qnn_scalar = QNN_SCALAR_INIT;
158175 ORT_RETURN_IF_ERROR (ProcessAxisAttribute (qnn_model_wrapper, node_unit, axis_qnn_scalar, axis));
159176
160177 TensorInfo output_info = {};
161178 ORT_RETURN_IF_ERROR (qnn_model_wrapper.GetTensorInfo (outputs[0 ], output_info));
162- const size_t output_rank = output_info.shape .size ();
163- const bool axis_is_last_dim = static_cast <size_t >(axis) == output_rank - 1 ;
179+ size_t output_rank = output_info.shape .size ();
164180
165- // If axis refers to the last dimension, process outputs as usual.
166- if (!is_npu_backend || axis_is_last_dim) {
167- QnnParamWrapper axis_param (node_unit.Index (), node_unit.Name (), QNN_OP_SOFTMAX_PARAM_AXIS, axis_qnn_scalar);
181+ if (opset_version < 13 ) {
182+ std::string reshape_input_name = orig_output_name + " _ort_qnn_ep_reshape" ;
168183
184+ std::vector<uint32_t > reshape_input_shape = FlattenShapeFromAxis (output_info.shape , axis);
185+ if (axis == 0 ) {
186+ // Override axis due to the inserted batch=1 to the first dimension
187+ axis_qnn_scalar.uint32Value = 1 ;
188+ }
189+
190+ QnnParamWrapper axis_param (node_unit.Index (), node_unit.Name (), QNN_OP_SOFTMAX_PARAM_AXIS, axis_qnn_scalar);
169191 std::vector<std::string> param_tensor_names;
170192 param_tensor_names.push_back (axis_param.GetParamTensorName ());
171193 qnn_model_wrapper.AddParamWrapper (std::move (axis_param));
172194
173- return ProcessOutputs (qnn_model_wrapper, node_unit,
174- std::move (input_names),
175- std::move (param_tensor_names),
176- logger, do_op_validation, GetQnnOpType (op_type));
177- }
178-
179- //
180- // The axis **does** not refer to the last dimension. Must wrap the operator with Transposes to be able to use
181- // QNN's Softmax operator, which only supports an axis that refers to the last dimension.
182- //
183-
184- axis_qnn_scalar.uint32Value = static_cast <uint32_t >(output_rank - 1 ); // NOTE: override axis.
185- QnnParamWrapper axis_param (node_unit.Index (), node_unit.Name (), QNN_OP_SOFTMAX_PARAM_AXIS, axis_qnn_scalar);
186-
187- std::vector<std::string> param_tensor_names;
188- param_tensor_names.push_back (axis_param.GetParamTensorName ());
189- qnn_model_wrapper.AddParamWrapper (std::move (axis_param));
190-
191- const std::string& orig_output_name = outputs[0 ].node_arg .Name ();
192- std::string op_output_name = orig_output_name + " _ort_qnn_ep_transpose" ;
193-
194- std::vector<uint32_t > op_output_shape = output_info.shape ;
195- op_output_shape[output_rank - 1 ] = output_info.shape [axis];
196- op_output_shape[axis] = output_info.shape [output_rank - 1 ];
197-
198- QnnTensorWrapper output_tensorwrapper (op_output_name, QNN_TENSOR_TYPE_NATIVE, output_info.qnn_data_type ,
199- output_info.quant_param .Copy (), std::vector<uint32_t >(op_output_shape));
200- ORT_RETURN_IF_NOT (qnn_model_wrapper.AddTensorWrapper (std::move (output_tensorwrapper)), " Failed to add tensor." );
201- ORT_RETURN_IF_NOT (qnn_model_wrapper.CreateQnnNode (utils::GetNodeName (node_unit),
202- QNN_OP_PACKAGE_NAME_QTI_AISW,
203- GetQnnOpType (node_unit.OpType ()),
204- std::move (input_names),
205- {op_output_name},
206- std::move (param_tensor_names)),
207- " Failed to add node." );
208-
209- const bool is_graph_output = qnn_model_wrapper.IsGraphOutput (orig_output_name);
210- std::vector<uint32_t > transpose_perm = GetTransposePermToUseLastAxis (static_cast <uint32_t >(output_rank),
211- static_cast <uint32_t >(axis));
212-
213- ORT_RETURN_IF_ERROR (qnn_model_wrapper.AddTransposeNode (node_unit.Index (),
214- op_output_name,
195+ QnnTensorWrapper output_tensorwrapper (reshape_input_name, QNN_TENSOR_TYPE_NATIVE, output_info.qnn_data_type ,
196+ output_info.quant_param .Copy (), std::vector<uint32_t >(reshape_input_shape));
197+ ORT_RETURN_IF_NOT (qnn_model_wrapper.AddTensorWrapper (std::move (output_tensorwrapper)), " Failed to add tensor." );
198+ ORT_RETURN_IF_NOT (qnn_model_wrapper.CreateQnnNode (utils::GetNodeName (node_unit),
199+ QNN_OP_PACKAGE_NAME_QTI_AISW,
200+ GetQnnOpType (node_unit.OpType ()),
201+ std::move (input_names),
202+ {reshape_input_name},
203+ std::move (param_tensor_names)),
204+ " Failed to add node." );
205+
206+ const bool is_graph_output = qnn_model_wrapper.IsGraphOutput (orig_output_name);
207+ ORT_RETURN_IF_ERROR (qnn_model_wrapper.AddReshapeNode (reshape_input_name,
215208 orig_output_name,
216- op_output_shape,
217- transpose_perm,
209+ reshape_input_shape,
218210 output_info.shape ,
219211 output_info.qnn_data_type ,
220212 output_info.quant_param ,
221213 do_op_validation,
222214 false ,
223215 is_graph_output));
216+ }
217+ else if (is_npu_backend && axis != static_cast <int32_t >(output_rank) - 1 ) {
218+ std::string transpose_input_name = orig_output_name + " _ort_qnn_ep_transpose" ;
219+
220+ std::vector<uint32_t > transpose_input_shape = output_info.shape ;
221+ transpose_input_shape[output_rank - 1 ] = output_info.shape [axis];
222+ transpose_input_shape[axis] = output_info.shape [output_rank - 1 ];
223+
224+ // Override axis due to the actual shape after the inserted transpose node
225+ axis_qnn_scalar.uint32Value = static_cast <uint32_t >(output_rank) - 1 ;
226+
227+ QnnParamWrapper axis_param (node_unit.Index (), node_unit.Name (), QNN_OP_SOFTMAX_PARAM_AXIS, axis_qnn_scalar);
228+ std::vector<std::string> param_tensor_names;
229+ param_tensor_names.push_back (axis_param.GetParamTensorName ());
230+ qnn_model_wrapper.AddParamWrapper (std::move (axis_param));
231+
232+ QnnTensorWrapper output_tensorwrapper (transpose_input_name, QNN_TENSOR_TYPE_NATIVE, output_info.qnn_data_type ,
233+ output_info.quant_param .Copy (), std::vector<uint32_t >(transpose_input_shape));
234+ ORT_RETURN_IF_NOT (qnn_model_wrapper.AddTensorWrapper (std::move (output_tensorwrapper)), " Failed to add tensor." );
235+ ORT_RETURN_IF_NOT (qnn_model_wrapper.CreateQnnNode (utils::GetNodeName (node_unit),
236+ QNN_OP_PACKAGE_NAME_QTI_AISW,
237+ GetQnnOpType (node_unit.OpType ()),
238+ std::move (input_names),
239+ {transpose_input_name},
240+ std::move (param_tensor_names)),
241+ " Failed to add node." );
242+
243+ const bool is_graph_output = qnn_model_wrapper.IsGraphOutput (orig_output_name);
244+ std::vector<uint32_t > transpose_perm = GetTransposePermToUseLastAxis (static_cast <uint32_t >(output_rank),
245+ static_cast <uint32_t >(axis));
246+
247+ ORT_RETURN_IF_ERROR (qnn_model_wrapper.AddTransposeNode (node_unit.Index (),
248+ transpose_input_name,
249+ orig_output_name,
250+ transpose_input_shape,
251+ transpose_perm,
252+ output_info.shape ,
253+ output_info.qnn_data_type ,
254+ output_info.quant_param ,
255+ do_op_validation,
256+ false ,
257+ is_graph_output));
258+ }
259+ else {
260+ QnnParamWrapper axis_param (node_unit.Index (), node_unit.Name (), QNN_OP_SOFTMAX_PARAM_AXIS, axis_qnn_scalar);
261+ std::vector<std::string> param_tensor_names;
262+ param_tensor_names.push_back (axis_param.GetParamTensorName ());
263+ qnn_model_wrapper.AddParamWrapper (std::move (axis_param));
264+
265+ return ProcessOutputs (qnn_model_wrapper, node_unit,
266+ std::move (input_names),
267+ std::move (param_tensor_names),
268+ logger, do_op_validation, GetQnnOpType (op_type));
269+ }
224270
225271 return Status::OK ();
226272}
0 commit comments