@@ -415,3 +415,10 @@ static const char* const kOrtSessionOptionsFailOnSuboptimalCompiledModel =
// "high_power_saver", "low_balanced", "extreme_power_saver", "low_power_saver", "power_saver",
// "sustained_high_performance". Default to "default".
static const char* const kOrtEpDynamicOptionsQnnHtpPerformanceMode = "ep.dynamic.qnn_htp_performance_mode";

// Enable QNN HTP batch multiplier
//
// Option values:
// - "0": QNN htp batch multiplier is disabled. [DEFAULT]
// - "1": QNN htp batch multiplier is enabaled.
static const char* const kOrtSessionOptionsQnnHtpBatchMultiplier = "ep.qnn.enable_htp_batch_multiplier";
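For context, a minimal sketch (not part of this diff) of how an application might enable the new option from the standard onnxruntime C++ API; the backend_path value and model path are illustrative assumptions only:

    #include <onnxruntime_cxx_api.h>

    int main() {
      Ort::Env env;
      Ort::SessionOptions session_options;

      // The HTP batch multiplier requires the whole graph to run on HTP,
      // so CPU EP fallback must be disabled as well.
      session_options.AddConfigEntry("session.disable_cpu_ep_fallback", "1");
      session_options.AddConfigEntry("ep.qnn.enable_htp_batch_multiplier", "1");

      // Register the QNN EP targeting the HTP backend (library name assumed for Windows).
      session_options.AppendExecutionProvider("QNN", {{"backend_path", "QnnHtp.dll"}});

      Ort::Session session(env, ORT_TSTR("model.onnx"), session_options);
      return 0;
    }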
142 changes: 133 additions & 9 deletions onnxruntime/core/providers/qnn/builder/qnn_model.cc
@@ -174,6 +174,46 @@ Status QnnModel::FinalizeGraphs(const logging::Logger& logger) {
return Status::OK();
}

// Helper function to check if ORT tensor shape matches expected QNN tensor shape (excluding batch dimension at index 0)
template <typename OrtValueType>
static Status CheckShape(const OrtValueType& ort_tensor,
const QnnTensorInfo& qnn_io_info) {
const auto input_output_shape = ort_tensor.GetTensorTypeAndShapeInfo().GetShape();
const auto shape_size = input_output_shape.size();
const auto& expected_shape = qnn_io_info.ori_dimensions_;
const auto expected_shape_size = expected_shape.size();
const auto& tensor_name = qnn_io_info.tensor_wrapper->GetName();

ORT_RETURN_IF_NOT(shape_size == expected_shape_size,
"Invalid rank for tensor: ", tensor_name,
" Got: ", shape_size, " Expected: ", expected_shape_size,
" Please fix either the inputs/outputs or the model.");

// Collect all invalid dimension indices (skip batch dimension at index 0)
InlinedVector<size_t> invalid_dim_indices;
invalid_dim_indices.reserve(shape_size);
for (size_t i = 1; i < shape_size; ++i) {
if (input_output_shape[i] != static_cast<int64_t>(expected_shape[i])) {
invalid_dim_indices.push_back(i);
}
}

if (!invalid_dim_indices.empty()) {
std::ostringstream ostr;
ostr << "Got invalid dimensions for tensor: " << tensor_name
<< " for the following indices (excluding batch dimension)\n";
for (const auto idx : invalid_dim_indices) {
ostr << " index: " << idx
<< " Got: " << input_output_shape[idx]
<< " Expected: " << expected_shape[idx] << "\n";
}
ostr << " Please fix either the inputs/outputs or the model.";
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, ostr.str());
}

return Status::OK();
}
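To illustrate, a hedged example of what CheckShape accepts and rejects, assuming a tensor whose ori_dimensions_ were captured as {1, 3, 224, 224}:

    // ORT shape {4, 3, 224, 224} -> OK: only the batch dimension (index 0) differs.
    // ORT shape {4, 3, 112, 224} -> INVALID_ARGUMENT: index: 2 Got: 112 Expected: 224.
    // ORT shape {4, 3, 224}      -> error: invalid rank (Got: 3, Expected: 4).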

Status QnnModel::SetupQnnInputOutput(const logging::Logger& logger) {
LOGS(logger, VERBOSE) << "Setting up QNN input/output for graph: " << graph_info_->Name();

@@ -239,56 +279,139 @@ Status QnnModel::ExecuteGraph(const Ort::KernelContext& context,
return element_size * length;
};

constexpr size_t BATCH_DIMENSION_INDEX = 0;
uint32_t batch_multiplier = static_cast<uint32_t>(
TensorDataSize(context.GetInput(qnn_input_infos_[0].ort_index)) / qnn_input_infos_[0].tensor_byte_size);
auto backend_type = qnn_backend_manager_->GetQnnBackendType();
// A batch multiplier other than 1 is only supported on the HTP backend.
ORT_RETURN_IF(batch_multiplier != 1 && !IsNpuBackend(backend_type),
"Batch multiplier is only supported on HTP backend, but current backend is: ",
static_cast<int>(backend_type));

// ===== Phase 1: Prepare inputs (thread-safe, no lock needed) =====
std::vector<Qnn_Tensor_t> qnn_inputs;
qnn_inputs.reserve(qnn_input_infos_.size());

// The dimensions field in Qnn_Tensor_t is a pointer, so qnn_inputs.push_back() performs a shallow copy.
// Multiple threads would share the same dimensions array, leading to race conditions when directly modifying batch size.
// To ensure thread safety, we create independent dimension copies for each thread when batch multiplier > 1.
// These copies are stored in input_dimensions_copies / output_dimensions_copies to keep them alive throughout execution.
std::vector<std::vector<uint32_t>> input_dimensions_copies;
if (batch_multiplier > 1) {
input_dimensions_copies.reserve(qnn_input_infos_.size());
}

for (const auto& qnn_input_info : qnn_input_infos_) {
LOGS(logger, VERBOSE) << "model_input = " << qnn_input_info.tensor_wrapper->GetName()
<< " index = " << qnn_input_info.ort_index;
auto ort_input_tensor = context.GetInput(qnn_input_info.ort_index);
auto ort_tensor_size = TensorDataSize(ort_input_tensor);
LOGS(logger, VERBOSE) << "Qnn tensor size: " << qnn_input_info.tensor_byte_size
<< " Ort tensor size: " << ort_tensor_size;
ORT_RETURN_IF_NOT(qnn_input_info.tensor_byte_size == ort_tensor_size,
"ORT Tensor data size does not match QNN tensor data size.");

LOGS(logger, VERBOSE) << "Original Qnn input tensor shape: " << qnn_input_info.ori_dimensions_;
LOGS(logger, VERBOSE) << "Qnn input tensor shape: " << qnn_input_info.tensor_wrapper->GetTensorDims();
LOGS(logger, VERBOSE) << "Ort input tensor shape: " << ort_input_tensor.GetTensorTypeAndShapeInfo().GetShape();
ORT_RETURN_IF_NOT(ort_tensor_size % qnn_input_info.tensor_byte_size == 0,
"ORT tensor size (", ort_tensor_size, " bytes) must match QNN tensor size (",
qnn_input_info.tensor_byte_size, " bytes) or be a valid batch multiplier. ",
"Expected: exact match or integer multiple.");

// Verify that every input yields the same batch multiplier.
uint32_t bm = static_cast<uint32_t>(ort_tensor_size / qnn_input_info.tensor_byte_size);
ORT_RETURN_IF_NOT(bm == batch_multiplier,
"Batch multiplier should be the same across all the inputs. Expected: ", batch_multiplier, ", Got: ", bm);

// Get QNN tensor (shallow copy)
qnn_inputs.push_back(qnn_input_info.tensor_wrapper->GetQnnTensor());

// Modify batch dimensions
if (batch_multiplier > 1) {
// Check dimensions except for batch dimension (index 0)
ORT_RETURN_IF_ERROR(CheckShape(ort_input_tensor, qnn_input_info));

// Create independent dimensions copy to avoid race conditions
std::vector<uint32_t> dims_copy = qnn_input_info.ori_dimensions_;
dims_copy[BATCH_DIMENSION_INDEX] =
static_cast<uint32_t>(ort_input_tensor.GetTensorTypeAndShapeInfo().GetShape()[BATCH_DIMENSION_INDEX]);
LOGS(logger, VERBOSE) << "qnn_inputs batch size (bm triggered): " << dims_copy[BATCH_DIMENSION_INDEX];
input_dimensions_copies.push_back(std::move(dims_copy));
// Point QNN tensor to the independent dimensions copy
SetQnnTensorDim(qnn_inputs.back(), input_dimensions_copies.back());
}

ORT_RETURN_IF_ERROR(BindQnnTensorMemoryToOrtValueMemory(
logger,
*qnn_backend_manager_,
*static_cast<const OrtMemoryInfo*>(ort_input_tensor.GetTensorMemoryInfo()),
const_cast<void*>(ort_input_tensor.GetTensorRawData()), qnn_input_info.tensor_byte_size,
const_cast<void*>(ort_input_tensor.GetTensorRawData()), qnn_input_info.tensor_byte_size * batch_multiplier,
graph_info_->GraphContext(),
qnn_inputs.back()));
}

// ===== Phase 2: Prepare outputs (thread-safe, no lock needed) =====
std::vector<Qnn_Tensor_t> qnn_outputs;
qnn_outputs.reserve(qnn_output_infos_.size());

// Create independent copies of dimensions for outputs
std::vector<std::vector<uint32_t>> output_dimensions_copies;
if (batch_multiplier > 1) {
output_dimensions_copies.reserve(qnn_output_infos_.size());
}

for (auto& qnn_output_info : qnn_output_infos_) {
const std::string& model_output_name = qnn_output_info.tensor_wrapper->GetName();
LOGS(logger, VERBOSE) << "model_output = " << model_output_name << " index = " << qnn_output_info.ort_index;
const auto& ort_output_info = GetOutputInfo(model_output_name);
const std::vector<int64_t>& output_shape = ort_output_info->shape_;
// Adjust output shape to match input batch size
std::vector<int64_t> output_shape = ort_output_info->shape_;
if (batch_multiplier > 1) {
output_shape[BATCH_DIMENSION_INDEX] *= batch_multiplier;
LOGS(logger, VERBOSE) << "batch multiplier triggered: " << batch_multiplier;
LOGS(logger, VERBOSE) << "Modify ORT output batch size to : " << output_shape[BATCH_DIMENSION_INDEX];
}
auto ort_output_tensor = context.GetOutput(qnn_output_info.ort_index, output_shape.data(), output_shape.size());
auto ort_tensor_size = TensorDataSize(ort_output_tensor);
LOGS(logger, VERBOSE) << "Qnn tensor size: " << qnn_output_info.tensor_byte_size
<< " Ort tensor size: " << ort_tensor_size;
ORT_RETURN_IF_NOT(qnn_output_info.tensor_byte_size == ort_tensor_size,
"ORT Tensor data size does not match QNN tensor data size");

LOGS(logger, VERBOSE) << "Original Qnn output tensor shape: " << qnn_output_info.ori_dimensions_;
LOGS(logger, VERBOSE) << "ORT output tensor shape: " << ort_output_tensor.GetTensorTypeAndShapeInfo().GetShape();
LOGS(logger, VERBOSE) << "Qnn output tensor shape: " << qnn_output_info.tensor_wrapper->GetTensorDims();
ORT_RETURN_IF_NOT(ort_tensor_size % qnn_output_info.tensor_byte_size == 0,
"ORT tensor size (", ort_tensor_size, " bytes) must match QNN tensor size (",
qnn_output_info.tensor_byte_size, " bytes) or be a valid batch multiplier. ",
"Expected: exact match or integer multiple.");
uint32_t bm = static_cast<uint32_t>(ort_tensor_size / qnn_output_info.tensor_byte_size);
ORT_RETURN_IF_NOT(bm == batch_multiplier,
"Batch multiplier should be the same across all the inputs. Expected: ", batch_multiplier, ", Got: ", bm);

// Get QNN tensor (shallow copy)
qnn_outputs.push_back(qnn_output_info.tensor_wrapper->GetQnnTensor());

// Modify batch dimensions
if (batch_multiplier > 1) {
// Check dimensions except for batch dimension (index 0)
ORT_RETURN_IF_ERROR(CheckShape(ort_output_tensor, qnn_output_info));

// Create independent dimensions copy to avoid race conditions
std::vector<uint32_t> dims_copy = qnn_output_info.ori_dimensions_;
dims_copy[BATCH_DIMENSION_INDEX] =
static_cast<uint32_t>(ort_output_tensor.GetTensorTypeAndShapeInfo().GetShape()[BATCH_DIMENSION_INDEX]);
LOGS(logger, VERBOSE) << "qnn_outputs batch_size (bm triggered): " << dims_copy[BATCH_DIMENSION_INDEX];
output_dimensions_copies.push_back(std::move(dims_copy));
// Point QNN tensor to the independent dimensions copy
SetQnnTensorDim(qnn_outputs.back(), output_dimensions_copies.back());
}

ORT_RETURN_IF_ERROR(BindQnnTensorMemoryToOrtValueMemory(
logger,
*qnn_backend_manager_,
*static_cast<const OrtMemoryInfo*>(ort_output_tensor.GetTensorMemoryInfo()),
ort_output_tensor.GetTensorMutableRawData(), qnn_output_info.tensor_byte_size,
ort_output_tensor.GetTensorMutableRawData(), qnn_output_info.tensor_byte_size * batch_multiplier,
graph_info_->GraphContext(),
qnn_outputs.back()));
}

// ===== Phase 3: Execute graph (requires mutex lock) =====
Qnn_ErrorHandle_t execute_status = QNN_GRAPH_NO_ERROR;
{
const auto& qnn_interface = qnn_backend_manager_->GetQnnInterface();
@@ -355,6 +478,7 @@ Status QnnModel::SetupTensors(std::vector<QnnTensorInfo>& qnn_tensor_infos,
qnn_tensor_info.tensor_wrapper = &tensor_wrapper;
qnn_tensor_info.tensor_byte_size = static_cast<uint32_t>(length);
qnn_tensor_info.ort_index = ort_index;
qnn_tensor_info.ori_dimensions_.assign(tensor_wrapper.GetTensorDims().begin(), tensor_wrapper.GetTensorDims().end());
}
// The number of graph inputs and the number of tensor wrappers may not match.
// - For example, for ResizeNearestNeighbor op, Qnn only cares about the 1st input,
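To make the arithmetic in ExecuteGraph above concrete, a hedged, self-contained sketch of the batch-multiplier derivation; the shapes, sizes, and variable names are illustrative assumptions, not values from this PR:

    #include <cstdint>
    #include <iostream>
    #include <vector>

    int main() {
      // The QNN graph was compiled with a fixed batch of 1 (float32 input):
      // ori_dimensions_ = {1, 3, 224, 224}, so tensor_byte_size = 602112 bytes.
      const std::vector<uint32_t> qnn_dims = {1, 3, 224, 224};
      const uint64_t qnn_tensor_byte_size = 1ULL * 3 * 224 * 224 * sizeof(float);

      // At Run() time, ORT feeds a batch of 4 with identical non-batch dimensions.
      const std::vector<int64_t> ort_shape = {4, 3, 224, 224};
      const uint64_t ort_tensor_size = 4ULL * 3 * 224 * 224 * sizeof(float);

      // The batch multiplier must divide evenly and be identical for all inputs/outputs.
      const uint32_t batch_multiplier = static_cast<uint32_t>(ort_tensor_size / qnn_tensor_byte_size);
      std::cout << "batch_multiplier = " << batch_multiplier << "\n";  // prints 4

      // Per-call dimensions copy: only the batch dimension (index 0) is overwritten;
      // the remaining dimensions must match ori_dimensions_ (enforced by CheckShape).
      std::vector<uint32_t> dims_copy = qnn_dims;
      dims_copy[0] = static_cast<uint32_t>(ort_shape[0]);  // {4, 3, 224, 224}
      return 0;
    }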
1 change: 1 addition & 0 deletions onnxruntime/core/providers/qnn/builder/qnn_model.h
@@ -19,6 +19,7 @@ struct QnnTensorInfo {
const QnnTensorWrapper* tensor_wrapper = nullptr;
uint32_t tensor_byte_size = 0;
size_t ort_index = 0;
std::vector<uint32_t> ori_dimensions_;
};

class QnnModel {
8 changes: 8 additions & 0 deletions onnxruntime/core/providers/qnn/builder/qnn_utils.h
@@ -454,6 +454,14 @@ Status InsertConvertOp(QnnModelWrapper& qnn_model_wrapper,
*/
Status GetPermToLastAxis(uint32_t axis, uint32_t rank, std::vector<uint32_t>& perm);

template <typename T>
inline std::ostream& operator<<(std::ostream& out, const std::vector<T>& vec) {
for (const auto& elem : vec) {
out << elem << " ";
}
return out;
}

} // namespace utils
} // namespace qnn
} // namespace onnxruntime
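For reference, a small self-contained sketch of how the streaming helper above behaves; the overload is re-declared at global scope here so the demo compiles on its own (the real one lives in onnxruntime::qnn::utils and backs the VERBOSE shape logging in qnn_model.cc):

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Same element-by-element formatting as the helper above.
    template <typename T>
    std::ostream& operator<<(std::ostream& out, const std::vector<T>& vec) {
      for (const auto& elem : vec) {
        out << elem << " ";
      }
      return out;
    }

    int main() {
      const std::vector<uint32_t> dims = {1, 3, 224, 224};
      std::cout << "Qnn input tensor shape: " << dims << "\n";  // prints "1 3 224 224 "
      return 0;
    }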
17 changes: 17 additions & 0 deletions onnxruntime/core/providers/qnn/qnn_execution_provider.cc
@@ -304,6 +304,9 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio
disable_cpu_ep_fallback_ = config_options->GetConfigOrDefault(
kOrtSessionOptionsDisableCPUEPFallback, "0") == "1";

qnn_htp_batch_multiplier_ = config_options->GetConfigOrDefault(
kOrtSessionOptionsQnnHtpBatchMultiplier, "0") == "1";

context_cache_enabled_ = config_options->GetConfigOrDefault(
kOrtSessionOptionEpContextEnable, "0") == "1";
LOGS_DEFAULT(VERBOSE) << "Context cache enable: " << context_cache_enabled_;
@@ -961,6 +964,20 @@ QNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer
return result;
}

// Report an error if the QNN HTP batch multiplier is enabled but CPU EP fallback is not disabled,
// since the batch multiplier requires the whole graph to run on the HTP backend.
if (qnn_htp_batch_multiplier_) {
if (!disable_cpu_ep_fallback_) {
LOGS(logger, ERROR) << "Qnn HTP batch multiplier is used while CPU fallback is not disabled.\n Please add session options 'session.disable_cpu_ep_fallback|1'";
return result;
}
const auto& backend_type = qnn_backend_manager_->GetQnnBackendType();
if (backend_type != qnn::QnnBackendType::HTP && backend_type != qnn::QnnBackendType::HTP_FP16) {
LOGS(logger, ERROR) << "Qnn HTP batch multiplier is used while not using HTP backend.\n Please use HTP backend type";
return result;
}
}

if ((context_cache_enabled_ || is_qnn_ctx_model) && !IsQpuBackend(qnn_backend_manager_->GetQnnBackendType())) {
LOGS(logger, ERROR) << "Qnn context cache only works for HTP/DSP/GPU backend.";
return result;
3 changes: 2 additions & 1 deletion onnxruntime/core/providers/qnn/qnn_execution_provider.h
@@ -90,7 +90,8 @@ class QNNExecutionProvider : public IExecutionProvider {
bool context_cache_enabled_ = false;
std::string context_cache_path_cfg_ = "";
std::string context_node_name_prefix_ = "";
bool disable_cpu_ep_fallback_ = false; // True if CPU EP fallback has been disabled for this session.
bool disable_cpu_ep_fallback_ = false; // True if CPU EP fallback has been disabled for this session.
bool qnn_htp_batch_multiplier_ = false; // True if the QNN HTP batch multiplier option is enabled for this session.
bool qnn_context_embed_mode_ = true;
int32_t vtcm_size_in_mb_ = 0;
bool enable_vtcm_backup_buffer_sharing_ = false;
11 changes: 9 additions & 2 deletions onnxruntime/core/session/inference_session.cc
@@ -2981,10 +2981,17 @@ Status InferenceSession::Run(const RunOptions& run_options,

// log evaluation start to trace logging provider
env.GetTelemetryProvider().LogEvaluationStart(session_id_);

#ifdef USE_QNN
Reviewer comment (Contributor): Considering that we're moving towards plugin EPs, we should avoid EP-specific code in the core onnxruntime library. Otherwise, we would need a special build of onnxruntime.dll that works with the plugin QNN EP.

const bool batch_multiplier = session_options_.config_options.GetConfigOrDefault(kOrtSessionOptionsQnnHtpBatchMultiplier, "0") == "1";
if (batch_multiplier) {
LOGS(*session_logger_, INFO) << "QNN HTP batch multiplier is enabled. Skipping input/output validation.";
} else {
ORT_RETURN_IF_ERROR_SESSIONID_(ValidateInputs(feed_names, feeds));
ORT_RETURN_IF_ERROR_SESSIONID_(ValidateOutputs(output_names, p_fetches));
}
#else
ORT_RETURN_IF_ERROR_SESSIONID_(ValidateInputs(feed_names, feeds));
ORT_RETURN_IF_ERROR_SESSIONID_(ValidateOutputs(output_names, p_fetches));

#endif
// shrink certain default memory arenas if the user has requested for it
const std::string& shrink_memory_arenas =
run_options.config_options.GetConfigOrDefault(kOrtRunOptionsConfigEnableMemoryArenaShrinkage, "");