Commit d8b7d5c
[QNN-EP] Add BQ and LPBQ support in QNN EP

Description:
- Add a new attribute, block_size, to QuantParam.
- Add a function to QnnModelWrapper that checks whether a tensor in the ONNX graph is blockwise quantized.
- Add definition and translation of BQ (Blockwise Quantization) and LPBQ (Low Power Blockwise Quantization) when initializing QnnQuantParamsWrapper.
- Add checks for blockwise quantization in several op builders.

Motivation and Context:
- QNN-EP currently does not support blockwise quantization encodings. Since the finer granularity provided by block quantization generally leads to better quantization accuracy, this change adds support for BQ and LPBQ in QNN-EP (see the shape sketch below).
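
For context, blockwise quantization sits between per-channel and per-element granularity: each run of block_size elements along the quantization axis gets its own scale. A minimal sketch of the resulting scale-tensor shape, following the ONNX opset-21 blocked Q/DQ convention (the helper name is illustrative, not part of this commit):

#include <cstdint>
#include <vector>

// Expected scale shape for a blockwise-quantized tensor: same rank as the
// data, with the dimension on `axis` divided (rounding up) by `block_size`.
std::vector<int64_t> BlockwiseScaleShape(std::vector<int64_t> dims,
                                         int64_t axis, int64_t block_size) {
  dims[axis] = (dims[axis] + block_size - 1) / block_size;  // ceil division
  return dims;
}

// E.g., a [64, 32] weight with axis = 0 and block_size = 16 carries a
// [4, 32] scale tensor (128 scales), versus 1 scale per-tensor and
// 64 scales per-channel.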
1 parent fe7634e · commit d8b7d5c

12 files changed (+398 −19 lines)

onnxruntime/core/framework/node_unit.cc

Lines changed: 23 additions & 3 deletions

@@ -118,8 +118,14 @@ std::vector<NodeUnitIODef> GetQDQIODefs(const Node& target_node, const QDQ::Node
     axis = entry->second.i();
   }
 
+  // Get the Q or DQ block_size attribute if available.
+  std::optional<int64_t> block_size;
+  if (auto entry = node_attrs.find("block_size"); entry != node_attrs.end()) {
+    block_size = entry->second.i();
+  }
+
   // quantization scale and zp are always the input[1, 2]
-  NodeUnitIODef::QuantParam quant_param{*node_inputs[1], node_inputs.size() == 3 ? node_inputs[2] : nullptr, axis};
+  NodeUnitIODef::QuantParam quant_param{*node_inputs[1], node_inputs.size() == 3 ? node_inputs[2] : nullptr, axis, block_size};
 
   if (is_input) {
     // DQ is input to the target node, use the DstArgIndex
@@ -373,10 +379,17 @@ void NodeUnit::InitForSingleNode() {
       axis = entry->second.i();
     }
 
+    // Get the DQ block_size attribute if available.
+    std::optional<int64_t> block_size;
+    if (auto entry = node_attrs.find("block_size"); entry != node_attrs.end()) {
+      block_size = entry->second.i();
+    }
+
    inputs_.push_back(NodeUnitIODef{*input_defs[0],
                                    NodeUnitIODef::QuantParam{*input_defs[1],
                                                              input_defs.size() == 3 ? input_defs[2] : nullptr,
-                                                             axis}});
+                                                             axis,
+                                                             block_size}});
    outputs_.push_back(NodeUnitIODef{*output_defs[0], std::nullopt});
 
  } else if (qlinear_type == QLinearOpType::QuantizeLinear) {
@@ -390,11 +403,18 @@ void NodeUnit::InitForSingleNode() {
      axis = entry->second.i();
    }
 
+    // Get the Q block_size attribute if available.
+    std::optional<int64_t> block_size;
+    if (auto entry = node_attrs.find("block_size"); entry != node_attrs.end()) {
+      block_size = entry->second.i();
+    }
+
    inputs_.push_back(NodeUnitIODef{*input_defs[0], std::nullopt});
    outputs_.push_back(NodeUnitIODef{*output_defs[0],
                                     NodeUnitIODef::QuantParam{*input_defs[1],
                                                               input_defs.size() == 3 ? input_defs[2] : nullptr,
-                                                              axis}});
+                                                              axis,
+                                                              block_size}});
  } else if (IsVariadicQLinearOp(qlinear_type)) {
    size_t input_num = (input_defs.size() - 2) / 3;
    for (size_t i = 0; i < input_num; i++) {

onnxruntime/core/framework/node_unit.h

Lines changed: 1 addition & 0 deletions

@@ -49,6 +49,7 @@ struct NodeUnitIODef {
     const NodeArg& scale;
     const NodeArg* zero_point{nullptr};
     std::optional<int64_t> axis{std::nullopt};
+    std::optional<int64_t> block_size{std::nullopt};
   };
 
   const NodeArg& node_arg;
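
With the added field, NodeUnitIODef::QuantParam reads as follows (an abridged view assembled from the context lines above):

struct QuantParam {
  const NodeArg& scale;
  const NodeArg* zero_point{nullptr};
  std::optional<int64_t> axis{std::nullopt};        // quantization axis (per-channel/blockwise)
  std::optional<int64_t> block_size{std::nullopt};  // new: block size along `axis`
};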

onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc

Lines changed: 26 additions & 4 deletions

@@ -111,7 +111,7 @@ Status ConvOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper,
     }
   }
 
-  // Validate that weight is signed type for per-channel quantization (required by QNN docs).
+  // Validate that weight is signed type for per-channel and blockwise quantization (required by QNN docs).
   if (is_npu_backend) {
     const auto& input_1 = inputs[1];  // weight
     bool is_per_axis_quant = false;
@@ -134,6 +134,21 @@ Status ConvOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper,
       ORT_RETURN_IF_NOT(quant_axis == 0, "Conv's input[1] must be use axis == 0 for per-channel quantization");
     }
   }
+
+  bool is_block_quant = false;
+  int64_t quant_block_axis = 0;
+  int64_t quant_block_size = 0;
+  ORT_RETURN_IF_ERROR(qnn_model_wrapper.IsBlockwiseQuantized(input_1, is_block_quant, quant_block_axis, quant_block_size));
+
+  if (is_block_quant) {
+    int32_t elem_data_type = 0;
+    ORT_RETURN_IF_ERROR(utils::GetOnnxTensorElemDataType(input_1.node_arg, elem_data_type));
+
+    const bool is_signed_type = (elem_data_type == ONNX_NAMESPACE::TensorProto_DataType_INT4) ||
+                                (elem_data_type == ONNX_NAMESPACE::TensorProto_DataType_INT8) ||
+                                (elem_data_type == ONNX_NAMESPACE::TensorProto_DataType_INT16);
+    ORT_RETURN_IF_NOT(is_signed_type, "Conv weights must be of a signed quantized type if quantized blockwise");
+  }
 }
 
 return Status::OK();
@@ -237,10 +252,17 @@ Status ConvOpBuilder::ProcessConv2D3DInputs(QnnModelWrapper& qnn_model_wrapper,
        std::vector<size_t> perm_inv(perm.size());
        ORT_RETURN_IF_ERROR(utils::InvertPerm<size_t>(perm, perm_inv));
        ORT_RETURN_IF_ERROR(input_info.quant_param.HandleTranspose<size_t>(perm_inv));
+      } else if (input_info.quant_param.IsLPBQ()) {  // Transpose quantization parameter's axis if this is using LPBQ quantization.
+        // Only Conv2d supports LPBQ.
+        ORT_RETURN_IF((conv_type != OnnxConvType::kConv) || is_3d, "Apply LPBQ only on Conv2d");
+        const std::vector<size_t> perm = nchw2hwcn_perm;
+        std::vector<size_t> perm_inv(perm.size());
+        ORT_RETURN_IF_ERROR(utils::InvertPerm<size_t>(perm, perm_inv));
+        ORT_RETURN_IF_ERROR(input_info.quant_param.HandleTranspose<size_t>(perm_inv));
      }
    } else {
      // Add transpose node above weight input.
-     ORT_RETURN_IF(input_info.quant_param.IsPerChannel(),
+     ORT_RETURN_IF(input_info.quant_param.IsPerChannel() || input_info.quant_param.IsBlockwise(),
                    "Non-constant Conv inputs only support per-tensor quantization");
      bool is_graph_input = qnn_model_wrapper.IsGraphInput(input1_name);
      LOGS(logger, VERBOSE) << "Add HWCN Transpose node after input: " << input1_name;
@@ -350,7 +372,7 @@ Status ConvOpBuilder::ProcessConv1DInputs(QnnModelWrapper& qnn_model_wrapper,
   };
 
   if (!input0_info.is_initializer) {
-    ORT_RETURN_IF(input0_info.quant_param.IsPerChannel(),
+    ORT_RETURN_IF(input0_info.quant_param.IsPerChannel() || input0_info.quant_param.IsBlockwise(),
                   "Non-constant Conv inputs only support per-tensor quantization");
 
    // Add Reshape node to transform 1D input to 2D (i.e., set height to 1).
@@ -465,7 +487,7 @@
     }
   } else {
     // Dynamic weight: Add nodes to reshape to 2D, and then transpose.
-    ORT_RETURN_IF(input_info.quant_param.IsPerChannel(),
+    ORT_RETURN_IF(input_info.quant_param.IsPerChannel() || input_info.quant_param.IsBlockwise(),
                   "Non-constant Conv inputs only support per-tensor quantization");
 
     bool is_graph_input = qnn_model_wrapper.IsGraphInput(input1_name);
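
The LPBQ branch above reuses the per-channel transpose handling: when a constant weight is rearranged (e.g., NCHW to HWCN), the axis stored in the quantization parameters must be remapped through the inverse permutation, which is what HandleTranspose is given perm_inv for. A minimal sketch of that remapping (illustrative helper, not the commit's implementation):

#include <cstdint>
#include <vector>

// After transposing a tensor so that out_shape[i] == in_shape[perm[i]],
// the old axis `a` lands at the output position i where perm[i] == a,
// i.e., at inverse_perm[a].
int64_t RemapQuantAxis(int64_t axis, const std::vector<size_t>& perm) {
  for (size_t i = 0; i < perm.size(); ++i) {
    if (static_cast<int64_t>(perm[i]) == axis) {
      return static_cast<int64_t>(i);
    }
  }
  return -1;  // axis not present: invalid permutation
}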

onnxruntime/core/providers/qnn/builder/opbuilder/gemm_op_builder.cc

Lines changed: 2 additions & 1 deletion

@@ -118,7 +118,8 @@ Status GemmOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
 
   std::string input_tensor_name = input_name;
   if (1 == input_trans_flag.at(input_i) && !is_constant_input) {
-    ORT_RETURN_IF(quantize_param.IsPerChannel(), "Non-constant Gemm inputs only support per-tensor quantization");
+    ORT_RETURN_IF(quantize_param.IsPerChannel() || quantize_param.IsBlockwise(),
+                  "Non-constant Gemm inputs only support per-tensor quantization");
 
     // Add Transpose node
     std::vector<uint32_t> old_input_shape(input_shape);

onnxruntime/core/providers/qnn/builder/opbuilder/instance_norm_op_builder.cc

Lines changed: 1 addition & 1 deletion

@@ -114,7 +114,7 @@ Status InstanceNormOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
   };
 
   if (!input0_info.is_initializer) {
-    ORT_RETURN_IF(input0_info.quant_param.IsPerChannel(),
+    ORT_RETURN_IF(input0_info.quant_param.IsPerChannel() || input0_info.quant_param.IsBlockwise(),
                   "Non-constant InstanceNormalization inputs only support per-tensor quantization");
 
     // Add Reshape node to transform 1D input to 2D (i.e., set height to 1).

onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc

Lines changed: 12 additions & 0 deletions

@@ -69,6 +69,12 @@ Status SimpleOpBuilder::ExplicitOpCheck(QnnModelWrapper& qnn_model_wrapper,
     ORT_RETURN_IF_ERROR(qnn_model_wrapper.IsPerChannelQuantized(node_unit.Inputs()[0], is_per_chan_quant, quant_axis));
     ORT_RETURN_IF(is_per_chan_quant, "QNN EP does not support a standalone DQ op with per-channel quantization");
 
+    bool is_block_quant = false;
+    int64_t quant_block_axis = 0;
+    int64_t quant_block_size = 0;
+    ORT_RETURN_IF_ERROR(qnn_model_wrapper.IsBlockwiseQuantized(node_unit.Inputs()[0], is_block_quant, quant_block_axis, quant_block_size));
+    ORT_RETURN_IF(is_block_quant, "QNN EP does not support a standalone DQ op with blockwise quantization");
+
     if (qnn_model_wrapper.GetModelSettings().offload_graph_io_quantization) {
       ORT_RETURN_IF(qnn_model_wrapper.IsGraphOutput(node_unit.Outputs()[0].node_arg.Name()),
                     "QNN EP is configured to not take DQ nodes that generate a graph output.");
@@ -81,6 +87,12 @@ Status SimpleOpBuilder::ExplicitOpCheck(QnnModelWrapper& qnn_model_wrapper,
     ORT_RETURN_IF_ERROR(qnn_model_wrapper.IsPerChannelQuantized(node_unit.Outputs()[0], is_per_chan_quant, quant_axis));
     ORT_RETURN_IF(is_per_chan_quant, "QNN EP does not support a standalone Q op with per-channel quantization");
 
+    bool is_block_quant = false;
+    int64_t quant_block_axis = 0;
+    int64_t quant_block_size = 0;
+    ORT_RETURN_IF_ERROR(qnn_model_wrapper.IsBlockwiseQuantized(node_unit.Outputs()[0], is_block_quant, quant_block_axis, quant_block_size));
+    ORT_RETURN_IF(is_block_quant, "QNN EP does not support a standalone Q op with blockwise quantization");
+
     if (qnn_model_wrapper.GetModelSettings().offload_graph_io_quantization) {
       ORT_RETURN_IF(qnn_model_wrapper.IsGraphInput(node_unit.Inputs()[0].node_arg.Name()),
                     "QNN EP is configured to not take Q nodes that consume a graph input.");

onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc

Lines changed: 51 additions & 5 deletions

@@ -491,6 +491,51 @@ Status QnnModelWrapper::IsPerChannelQuantized(const onnxruntime::NodeUnitIODef&
   return Status::OK();
 }
 
+// Checks if a tensor in the ONNX graph is blockwise quantized.
+Status QnnModelWrapper::IsBlockwiseQuantized(const onnxruntime::NodeUnitIODef& io_def,
+                                             /*out*/ bool& is_blockwise,
+                                             /*out*/ int64_t& axis,
+                                             /*out*/ int64_t& block_size) const {
+  if (!io_def.quant_param) {
+    is_blockwise = false;
+    return Status::OK();
+  }
+
+  const std::string& scale_name = io_def.quant_param->scale.Name();
+  const auto& graph_initializers = GetInitializerTensors();
+  auto iter = graph_initializers.find(scale_name);
+  ORT_RETURN_IF(iter == graph_initializers.end(), "Unable to find initializer for scale(s): ",
+                scale_name.c_str());
+  gsl::not_null<const onnx::TensorProto*> scale_tensor_proto = iter->second;
+  TensorShape scale_shape(qnn::utils::GetInitializerShape<int64_t>(*scale_tensor_proto));
+
+  const auto* tensor_shape_proto = io_def.node_arg.Shape();
+  ORT_RETURN_IF_NOT(tensor_shape_proto != nullptr, "NULL tensor shape proto");
+  const int rank = tensor_shape_proto->dim_size();
+
+  // Check the number of scale values to determine if the tensor is blockwise.
+  // This is consistent with CPU EP's Quant/Dequant logic. We can't use the presence of an axis and a block_size
+  // because even a blocked DQ/Q op may not have an explicit axis or block_size attribute.
+  // (axis assumed to default to 1 if missing, and block_size assumed to default to 0 if missing)
+  const bool is_scalar_or_1_elem_vector = scale_shape.NumDimensions() == 0 ||
+                                          (scale_shape.NumDimensions() == 1 && scale_shape.Size() == 1);
+
+  is_blockwise = !is_scalar_or_1_elem_vector && (scale_shape.NumDimensions() == rank);
+
+  if (is_blockwise) {
+    axis = io_def.quant_param->axis.value_or(1);  // 1 is default axis for Q/DQ ops.
+    if (axis < 0) {
+      // Normalize negative axis by adding rank.
+      ORT_RETURN_IF_NOT(rank > 0, "Blockwise quantized tensor should be of rank > 0");
+
+      axis += rank;
+    }
+    block_size = io_def.quant_param->block_size.value_or(0);  // 0 is default block_size for Q/DQ ops.
+  }
+
+  return Status::OK();
+}
+
 Status QnnModelWrapper::GetTensorInfo(const NodeUnitIODef& input, TensorInfo& tensor_info) const {
   const std::string& name = input.node_arg.Name();
 
@@ -546,9 +591,10 @@ Status QnnModelWrapper::AddReshapeNode(const std::string& input_name, const std:
                                        const Qnn_DataType_t& tensor_data_type,
                                        const QnnQuantParamsWrapper& quantize_param, bool do_op_validation,
                                        bool is_for_input, bool is_for_output) {
-  // Do not allow QNN EP to insert Reshape nodes with per-channel quantization on dynamic tensors
+  // Do not allow QNN EP to insert Reshape nodes with per-channel or blockwise quantization on dynamic tensors
   // if only one quantization param is provided.
-  ORT_RETURN_IF(quantize_param.IsPerChannel(), "Do not support inserted Reshape nodes with per-channel quantization");
+  ORT_RETURN_IF(quantize_param.IsPerChannel() || quantize_param.IsBlockwise(),
+                "Do not support inserted Reshape nodes with per-channel or blockwise quantization");
   return AddReshapeNode(input_name, output_name, input_shape, output_shape, tensor_data_type, quantize_param,
                         quantize_param, do_op_validation, is_for_input, is_for_output);
 }
@@ -564,11 +610,11 @@ Status QnnModelWrapper::AddTransposeNode(NodeIndex node_index,
                                          bool do_op_validation,
                                          bool is_for_input,
                                          bool is_for_output) {
-  // Do not allow QNN EP to insert transpose nodes with per-channel quantization on dynamic tensors.
+  // Do not allow QNN EP to insert transpose nodes with per-channel or blockwise quantization on dynamic tensors.
   // We could technically support this by transposing the quantization param's axis value, but
   // we don't need this right now.
-  ORT_RETURN_IF(quantize_param.IsPerChannel(),
-                "Do not support inserted Transpose nodes with per-channel quantization");
+  ORT_RETURN_IF(quantize_param.IsPerChannel() || quantize_param.IsBlockwise(),
+                "Do not support inserted Transpose nodes with per-channel or blockwise quantization");
   // No need to add this for output nodes as it is added as output tensor for previous node
   if (is_for_input) {
     Qnn_TensorType_t tensor_type = QNN_TENSOR_TYPE_APP_WRITE;
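
The heuristic in IsBlockwiseQuantized above reduces to comparing the scale tensor's rank with the data tensor's rank, since a blocked Q/DQ node may omit its axis and block_size attributes. A condensed sketch of the same classification (hypothetical helper, not part of the commit):

#include <cstddef>

enum class QuantGranularity { kPerTensor, kPerChannel, kBlockwise };

// Mirrors the rank check above: a scalar or 1-element scale is per-tensor;
// a scale whose rank equals the data rank is blockwise; anything else
// (typically a 1-D scale on a higher-rank tensor) is per-channel.
QuantGranularity Classify(size_t scale_rank, size_t scale_num_elems,
                          size_t data_rank) {
  if (scale_rank == 0 || (scale_rank == 1 && scale_num_elems == 1)) {
    return QuantGranularity::kPerTensor;
  }
  if (scale_rank == data_rank) {
    return QuantGranularity::kBlockwise;
  }
  return QuantGranularity::kPerChannel;
}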

onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h

Lines changed: 6 additions & 0 deletions

@@ -272,6 +272,12 @@ class QnnModelWrapper {
                                 /*out*/ bool& is_per_channel,
                                 /*out*/ int64_t& axis) const;
 
+  // Checks if a tensor in the ONNX graph is blockwise quantized.
+  Status IsBlockwiseQuantized(const onnxruntime::NodeUnitIODef& io_def,
+                              /*out*/ bool& is_blockwise,
+                              /*out*/ int64_t& axis,
+                              /*out*/ int64_t& block_size) const;
+
  private:
   bool CreateQnnInputOutputTensors(const std::string& qnn_node_name,
                                    const std::vector<std::string>& names,
