Commit d8b7d5c
[QNN-EP] Add BQ and LPBQ support in QNN EP

Description:
- Add a new attribute, block_size, to QuantParam.
- Add a function to QnnModelWrapper that checks whether a tensor in the ONNX graph is blockwise quantized.
- Add definition and translation of BQ (Blockwise Quantization) and LPBQ (Low Power Blockwise Quantization) when initializing QnnQuantParamsWrapper.
- Add checks for blockwise quantization in several op builders.

Motivation and Context:
- QNN-EP currently does not support blockwise quantization encodings. Since the finer granularity provided by block quantization generally leads to better quantization accuracy, this change adds support for BQ and LPBQ in QNN-EP (see the shape sketch below).
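
For context, blockwise quantization sits between per-channel and per-element granularity: each run of block_size elements along the quantization axis gets its own scale. A minimal sketch of the resulting scale-tensor shape, following the ONNX opset-21 blocked Q/DQ convention (the helper name is illustrative, not part of this commit):

#include <cstdint>
#include <vector>

// Expected scale shape for a blockwise-quantized tensor: same rank as the
// data, with the dimension on `axis` divided (rounding up) by `block_size`.
std::vector<int64_t> BlockwiseScaleShape(std::vector<int64_t> dims,
                                         int64_t axis, int64_t block_size) {
  dims[axis] = (dims[axis] + block_size - 1) / block_size;  // ceil division
  return dims;
}

// E.g., a [64, 32] weight with axis = 0 and block_size = 16 carries a
// [4, 32] scale tensor (128 scales), versus 1 scale per-tensor and
// 64 scales per-channel.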
1 parent fe7634e · commit d8b7d5c

12 files changed (+398 −19 lines)

onnxruntime/core/framework/node_unit.cc

Lines changed: 23 additions & 3 deletions

@@ -118,8 +118,14 @@ std::vector<NodeUnitIODef> GetQDQIODefs(const Node& target_node, const QDQ::Node
     axis = entry->second.i();
   }
 
+  // Get the Q or DQ block_size attribute if available.
+  std::optional<int64_t> block_size;
+  if (auto entry = node_attrs.find("block_size"); entry != node_attrs.end()) {
+    block_size = entry->second.i();
+  }
+
   // quantization scale and zp are always the input[1, 2]
-  NodeUnitIODef::QuantParam quant_param{*node_inputs[1], node_inputs.size() == 3 ? node_inputs[2] : nullptr, axis};
+  NodeUnitIODef::QuantParam quant_param{*node_inputs[1], node_inputs.size() == 3 ? node_inputs[2] : nullptr, axis, block_size};
 
   if (is_input) {
     // DQ is input to the target node, use the DstArgIndex
@@ -373,10 +379,17 @@ void NodeUnit::InitForSingleNode() {
       axis = entry->second.i();
     }
 
+    // Get the DQ block_size attribute if available.
+    std::optional<int64_t> block_size;
+    if (auto entry = node_attrs.find("block_size"); entry != node_attrs.end()) {
+      block_size = entry->second.i();
+    }
+
    inputs_.push_back(NodeUnitIODef{*input_defs[0],
                                    NodeUnitIODef::QuantParam{*input_defs[1],
                                                              input_defs.size() == 3 ? input_defs[2] : nullptr,
-                                                             axis}});
+                                                             axis,
+                                                             block_size}});
    outputs_.push_back(NodeUnitIODef{*output_defs[0], std::nullopt});
 
  } else if (qlinear_type == QLinearOpType::QuantizeLinear) {
@@ -390,11 +403,18 @@ void NodeUnit::InitForSingleNode() {
      axis = entry->second.i();
    }
 
+    // Get the Q block_size attribute if available.
+    std::optional<int64_t> block_size;
+    if (auto entry = node_attrs.find("block_size"); entry != node_attrs.end()) {
+      block_size = entry->second.i();
+    }
+
    inputs_.push_back(NodeUnitIODef{*input_defs[0], std::nullopt});
    outputs_.push_back(NodeUnitIODef{*output_defs[0],
                                     NodeUnitIODef::QuantParam{*input_defs[1],
                                                               input_defs.size() == 3 ? input_defs[2] : nullptr,
-                                                              axis}});
+                                                              axis,
+                                                              block_size}});
  } else if (IsVariadicQLinearOp(qlinear_type)) {
    size_t input_num = (input_defs.size() - 2) / 3;
    for (size_t i = 0; i < input_num; i++) {

onnxruntime/core/framework/node_unit.h

Lines changed: 1 addition & 0 deletions

@@ -49,6 +49,7 @@ struct NodeUnitIODef {
     const NodeArg& scale;
     const NodeArg* zero_point{nullptr};
     std::optional<int64_t> axis{std::nullopt};
+    std::optional<int64_t> block_size{std::nullopt};
   };
 
   const NodeArg& node_arg;
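
With the added field, NodeUnitIODef::QuantParam reads as follows (an abridged view assembled from the context lines above):

struct QuantParam {
  const NodeArg& scale;
  const NodeArg* zero_point{nullptr};
  std::optional<int64_t> axis{std::nullopt};        // quantization axis (per-channel/blockwise)
  std::optional<int64_t> block_size{std::nullopt};  // new: block size along `axis`
};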

onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc

Lines changed: 26 additions & 4 deletions

@@ -111,7 +111,7 @@ Status ConvOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper,
     }
   }
 
-  // Validate that weight is signed type for per-channel quantization (required by QNN docs).
+  // Validate that weight is signed type for per-channel and blockwise quantization (required by QNN docs).
   if (is_npu_backend) {
     const auto& input_1 = inputs[1];  // weight
     bool is_per_axis_quant = false;
@@ -134,6 +134,21 @@ Status ConvOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper,
       ORT_RETURN_IF_NOT(quant_axis == 0, "Conv's input[1] must be use axis == 0 for per-channel quantization");
     }
   }
+
+  bool is_block_quant = false;
+  int64_t quant_block_axis = 0;
+  int64_t quant_block_size = 0;
+  ORT_RETURN_IF_ERROR(qnn_model_wrapper.IsBlockwiseQuantized(input_1, is_block_quant, quant_block_axis, quant_block_size));
+
+  if (is_block_quant) {
+    int32_t elem_data_type = 0;
+    ORT_RETURN_IF_ERROR(utils::GetOnnxTensorElemDataType(input_1.node_arg, elem_data_type));
+
+    const bool is_signed_type = (elem_data_type == ONNX_NAMESPACE::TensorProto_DataType_INT4) ||
+                                (elem_data_type == ONNX_NAMESPACE::TensorProto_DataType_INT8) ||
+                                (elem_data_type == ONNX_NAMESPACE::TensorProto_DataType_INT16);
+    ORT_RETURN_IF_NOT(is_signed_type, "Conv weights must be of a signed quantized type if quantized blockwise");
+  }
 }
 
 return Status::OK();
@@ -237,10 +252,17 @@ Status ConvOpBuilder::ProcessConv2D3DInputs(QnnModelWrapper& qnn_model_wrapper,
        std::vector<size_t> perm_inv(perm.size());
        ORT_RETURN_IF_ERROR(utils::InvertPerm<size_t>(perm, perm_inv));
        ORT_RETURN_IF_ERROR(input_info.quant_param.HandleTranspose<size_t>(perm_inv));
+      } else if (input_info.quant_param.IsLPBQ()) {  // Transpose quantization parameter's axis if this is using LPBQ quantization.
+        // Only Conv2d supports LPBQ.
+        ORT_RETURN_IF((conv_type != OnnxConvType::kConv) || is_3d, "Apply LPBQ only on Conv2d");
+        const std::vector<size_t> perm = nchw2hwcn_perm;
+        std::vector<size_t> perm_inv(perm.size());
+        ORT_RETURN_IF_ERROR(utils::InvertPerm<size_t>(perm, perm_inv));
+        ORT_RETURN_IF_ERROR(input_info.quant_param.HandleTranspose<size_t>(perm_inv));
      }
    } else {
      // Add transpose node above weight input.
-     ORT_RETURN_IF(input_info.quant_param.IsPerChannel(),
+     ORT_RETURN_IF(input_info.quant_param.IsPerChannel() || input_info.quant_param.IsBlockwise(),
                    "Non-constant Conv inputs only support per-tensor quantization");
      bool is_graph_input = qnn_model_wrapper.IsGraphInput(input1_name);
      LOGS(logger, VERBOSE) << "Add HWCN Transpose node after input: " << input1_name;
@@ -350,7 +372,7 @@ Status ConvOpBuilder::ProcessConv1DInputs(QnnModelWrapper& qnn_model_wrapper,
   };
 
   if (!input0_info.is_initializer) {
-    ORT_RETURN_IF(input0_info.quant_param.IsPerChannel(),
+    ORT_RETURN_IF(input0_info.quant_param.IsPerChannel() || input0_info.quant_param.IsBlockwise(),
                   "Non-constant Conv inputs only support per-tensor quantization");
 
    // Add Reshape node to transform 1D input to 2D (i.e., set height to 1).
@@ -465,7 +487,7 @@
     }
   } else {
     // Dynamic weight: Add nodes to reshape to 2D, and then transpose.
-    ORT_RETURN_IF(input_info.quant_param.IsPerChannel(),
+    ORT_RETURN_IF(input_info.quant_param.IsPerChannel() || input_info.quant_param.IsBlockwise(),
                   "Non-constant Conv inputs only support per-tensor quantization");
 
     bool is_graph_input = qnn_model_wrapper.IsGraphInput(input1_name);
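
The LPBQ branch above reuses the per-channel transpose handling: when a constant weight is rearranged (e.g., NCHW to HWCN), the axis stored in the quantization parameters must be remapped through the inverse permutation, which is what HandleTranspose is given perm_inv for. A minimal sketch of that remapping (illustrative helper, not the commit's implementation):

#include <cstdint>
#include <vector>

// After transposing a tensor so that out_shape[i] == in_shape[perm[i]],
// the old axis `a` lands at the output position i where perm[i] == a,
// i.e., at inverse_perm[a].
int64_t RemapQuantAxis(int64_t axis, const std::vector<size_t>& perm) {
  for (size_t i = 0; i < perm.size(); ++i) {
    if (static_cast<int64_t>(perm[i]) == axis) {
      return static_cast<int64_t>(i);
    }
  }
  return -1;  // axis not present: invalid permutation
}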

onnxruntime/core/providers/qnn/builder/opbuilder/gemm_op_builder.cc

Lines changed: 2 additions & 1 deletion

@@ -118,7 +118,8 @@ Status GemmOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
 
   std::string input_tensor_name = input_name;
   if (1 == input_trans_flag.at(input_i) && !is_constant_input) {
-    ORT_RETURN_IF(quantize_param.IsPerChannel(), "Non-constant Gemm inputs only support per-tensor quantization");
+    ORT_RETURN_IF(quantize_param.IsPerChannel() || quantize_param.IsBlockwise(),
+                  "Non-constant Gemm inputs only support per-tensor quantization");
 
     // Add Transpose node
     std::vector<uint32_t> old_input_shape(input_shape);

onnxruntime/core/providers/qnn/builder/opbuilder/instance_norm_op_builder.cc

Lines changed: 1 addition & 1 deletion

@@ -114,7 +114,7 @@ Status InstanceNormOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
   };
 
   if (!input0_info.is_initializer) {
-    ORT_RETURN_IF(input0_info.quant_param.IsPerChannel(),
+    ORT_RETURN_IF(input0_info.quant_param.IsPerChannel() || input0_info.quant_param.IsBlockwise(),
                   "Non-constant InstanceNormalization inputs only support per-tensor quantization");
 
     // Add Reshape node to transform 1D input to 2D (i.e., set height to 1).

onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc

Lines changed: 12 additions & 0 deletions

@@ -69,6 +69,12 @@ Status SimpleOpBuilder::ExplicitOpCheck(QnnModelWrapper& qnn_model_wrapper,
     ORT_RETURN_IF_ERROR(qnn_model_wrapper.IsPerChannelQuantized(node_unit.Inputs()[0], is_per_chan_quant, quant_axis));
     ORT_RETURN_IF(is_per_chan_quant, "QNN EP does not support a standalone DQ op with per-channel quantization");
 
+    bool is_block_quant = false;
+    int64_t quant_block_axis = 0;
+    int64_t quant_block_size = 0;
+    ORT_RETURN_IF_ERROR(qnn_model_wrapper.IsBlockwiseQuantized(node_unit.Inputs()[0], is_block_quant, quant_block_axis, quant_block_size));
+    ORT_RETURN_IF(is_block_quant, "QNN EP does not support a standalone DQ op with blockwise quantization");
+
     if (qnn_model_wrapper.GetModelSettings().offload_graph_io_quantization) {
       ORT_RETURN_IF(qnn_model_wrapper.IsGraphOutput(node_unit.Outputs()[0].node_arg.Name()),
                     "QNN EP is configured to not take DQ nodes that generate a graph output.");
@@ -81,6 +87,12 @@ Status SimpleOpBuilder::ExplicitOpCheck(QnnModelWrapper& qnn_model_wrapper,
     ORT_RETURN_IF_ERROR(qnn_model_wrapper.IsPerChannelQuantized(node_unit.Outputs()[0], is_per_chan_quant, quant_axis));
     ORT_RETURN_IF(is_per_chan_quant, "QNN EP does not support a standalone Q op with per-channel quantization");
 
+    bool is_block_quant = false;
+    int64_t quant_block_axis = 0;
+    int64_t quant_block_size = 0;
+    ORT_RETURN_IF_ERROR(qnn_model_wrapper.IsBlockwiseQuantized(node_unit.Outputs()[0], is_block_quant, quant_block_axis, quant_block_size));
+    ORT_RETURN_IF(is_block_quant, "QNN EP does not support a standalone Q op with blockwise quantization");
+
     if (qnn_model_wrapper.GetModelSettings().offload_graph_io_quantization) {
       ORT_RETURN_IF(qnn_model_wrapper.IsGraphInput(node_unit.Inputs()[0].node_arg.Name()),
                     "QNN EP is configured to not take Q nodes that consume a graph input.");

onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc

Lines changed: 51 additions & 5 deletions

@@ -491,6 +491,51 @@ Status QnnModelWrapper::IsPerChannelQuantized(const onnxruntime::NodeUnitIODef&
   return Status::OK();
 }
 
+// Checks if a tensor in the ONNX graph is blockwise quantized.
+Status QnnModelWrapper::IsBlockwiseQuantized(const onnxruntime::NodeUnitIODef& io_def,
+                                             /*out*/ bool& is_blockwise,
+                                             /*out*/ int64_t& axis,
+                                             /*out*/ int64_t& block_size) const {
+  if (!io_def.quant_param) {
+    is_blockwise = false;
+    return Status::OK();
+  }
+
+  const std::string& scale_name = io_def.quant_param->scale.Name();
+  const auto& graph_initializers = GetInitializerTensors();
+  auto iter = graph_initializers.find(scale_name);
+  ORT_RETURN_IF(iter == graph_initializers.end(), "Unable to find initializer for scale(s): ",
+                scale_name.c_str());
+  gsl::not_null<const onnx::TensorProto*> scale_tensor_proto = iter->second;
+  TensorShape scale_shape(qnn::utils::GetInitializerShape<int64_t>(*scale_tensor_proto));
+
+  const auto* tensor_shape_proto = io_def.node_arg.Shape();
+  ORT_RETURN_IF_NOT(tensor_shape_proto != nullptr, "NULL tensor shape proto");
+  const int rank = tensor_shape_proto->dim_size();
+
+  // Check the number of scale values to determine if the tensor is blockwise.
+  // This is consistent with CPU EP's Quant/Dequant logic. We can't use the presence of an axis and a block_size
+  // because even a blocked DQ/Q op may not have an explicit axis or block_size attribute.
+  // (axis assumed to default to 1 if missing, and block_size assumed to default to 0 if missing)
+  const bool is_scalar_or_1_elem_vector = scale_shape.NumDimensions() == 0 ||
+                                          (scale_shape.NumDimensions() == 1 && scale_shape.Size() == 1);
+
+  is_blockwise = !is_scalar_or_1_elem_vector && (scale_shape.NumDimensions() == rank);
+
+  if (is_blockwise) {
+    axis = io_def.quant_param->axis.value_or(1);  // 1 is default axis for Q/DQ ops.
+    if (axis < 0) {
+      // Normalize negative axis by adding rank.
+      ORT_RETURN_IF_NOT(rank > 0, "Blockwise quantized tensor should be of rank > 0");
+
+      axis += rank;
+    }
+    block_size = io_def.quant_param->block_size.value_or(0);  // 0 is default block_size for Q/DQ ops.
+  }
+
+  return Status::OK();
+}
+
 Status QnnModelWrapper::GetTensorInfo(const NodeUnitIODef& input, TensorInfo& tensor_info) const {
   const std::string& name = input.node_arg.Name();
 
@@ -546,9 +591,10 @@ Status QnnModelWrapper::AddReshapeNode(const std::string& input_name, const std:
                                        const Qnn_DataType_t& tensor_data_type,
                                        const QnnQuantParamsWrapper& quantize_param, bool do_op_validation,
                                        bool is_for_input, bool is_for_output) {
-  // Do not allow QNN EP to insert Reshape nodes with per-channel quantization on dynamic tensors
+  // Do not allow QNN EP to insert Reshape nodes with per-channel or blockwise quantization on dynamic tensors
   // if only one quantization param is provided.
-  ORT_RETURN_IF(quantize_param.IsPerChannel(), "Do not support inserted Reshape nodes with per-channel quantization");
+  ORT_RETURN_IF(quantize_param.IsPerChannel() || quantize_param.IsBlockwise(),
+                "Do not support inserted Reshape nodes with per-channel or blockwise quantization");
   return AddReshapeNode(input_name, output_name, input_shape, output_shape, tensor_data_type, quantize_param,
                         quantize_param, do_op_validation, is_for_input, is_for_output);
 }
@@ -564,11 +610,11 @@ Status QnnModelWrapper::AddTransposeNode(NodeIndex node_index,
                                          bool do_op_validation,
                                          bool is_for_input,
                                          bool is_for_output) {
-  // Do not allow QNN EP to insert transpose nodes with per-channel quantization on dynamic tensors.
+  // Do not allow QNN EP to insert transpose nodes with per-channel or blockwise quantization on dynamic tensors.
   // We could technically support this by transposing the quantization param's axis value, but
   // we don't need this right now.
-  ORT_RETURN_IF(quantize_param.IsPerChannel(),
-                "Do not support inserted Transpose nodes with per-channel quantization");
+  ORT_RETURN_IF(quantize_param.IsPerChannel() || quantize_param.IsBlockwise(),
+                "Do not support inserted Transpose nodes with per-channel or blockwise quantization");
   // No need to add this for output nodes as it is added as output tensor for previous node
   if (is_for_input) {
     Qnn_TensorType_t tensor_type = QNN_TENSOR_TYPE_APP_WRITE;
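
The heuristic in IsBlockwiseQuantized above reduces to comparing the scale tensor's rank with the data tensor's rank, since a blocked Q/DQ node may omit its axis and block_size attributes. A condensed sketch of the same classification (hypothetical helper, not part of the commit):

#include <cstddef>

enum class QuantGranularity { kPerTensor, kPerChannel, kBlockwise };

// Mirrors the rank check above: a scalar or 1-element scale is per-tensor;
// a scale whose rank equals the data rank is blockwise; anything else
// (typically a 1-D scale on a higher-rank tensor) is per-channel.
QuantGranularity Classify(size_t scale_rank, size_t scale_num_elems,
                          size_t data_rank) {
  if (scale_rank == 0 || (scale_rank == 1 && scale_num_elems == 1)) {
    return QuantGranularity::kPerTensor;
  }
  if (scale_rank == data_rank) {
    return QuantGranularity::kBlockwise;
  }
  return QuantGranularity::kPerChannel;
}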

onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h

Lines changed: 6 additions & 0 deletions

@@ -272,6 +272,12 @@ class QnnModelWrapper {
                                 /*out*/ bool& is_per_channel,
                                 /*out*/ int64_t& axis) const;
 
+  // Checks if a tensor in the ONNX graph is blockwise quantized.
+  Status IsBlockwiseQuantized(const onnxruntime::NodeUnitIODef& io_def,
+                              /*out*/ bool& is_blockwise,
+                              /*out*/ int64_t& axis,
+                              /*out*/ int64_t& block_size) const;
+
  private:
   bool CreateQnnInputOutputTensors(const std::string& qnn_node_name,
                                    const std::vector<std::string>& names,
