diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
index 64a434e2fe301..0c1f684615da8 100644
--- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
+++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
@@ -415,3 +415,10 @@ static const char* const kOrtSessionOptionsFailOnSuboptimalCompiledModel =
 // "high_power_saver", "low_balanced", "extreme_power_saver", "low_power_saver", "power_saver",
 // "sustained_high_performance". Default to "default".
 static const char* const kOrtEpDynamicOptionsQnnHtpPerformanceMode = "ep.dynamic.qnn_htp_performance_mode";
+
+// Enable the QNN HTP batch multiplier.
+//
+// Option values:
+// - "0": QNN HTP batch multiplier is disabled. [DEFAULT]
+// - "1": QNN HTP batch multiplier is enabled.
+static const char* const kOrtSessionOptionsQnnHtpBatchMultiplier = "ep.qnn.enable_htp_batch_multiplier";
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model.cc b/onnxruntime/core/providers/qnn/builder/qnn_model.cc
index 175a76b590895..30422e8654e26 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_model.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_model.cc
@@ -174,6 +174,46 @@ Status QnnModel::FinalizeGraphs(const logging::Logger& logger) {
   return Status::OK();
 }
 
+// Helper function to check whether an ORT tensor shape matches the expected QNN tensor shape (excluding the batch dimension at index 0).
+template <typename OrtValueType>
+static Status CheckShape(const OrtValueType& ort_tensor,
+                         const QnnTensorInfo& qnn_io_info) {
+  const auto input_output_shape = ort_tensor.GetTensorTypeAndShapeInfo().GetShape();
+  const auto shape_size = input_output_shape.size();
+  const auto& expected_shape = qnn_io_info.ori_dimensions_;
+  const auto expected_shape_size = expected_shape.size();
+  const auto& tensor_name = qnn_io_info.tensor_wrapper->GetName();
+
+  ORT_RETURN_IF_NOT(shape_size == expected_shape_size,
+                    "Invalid rank for tensor: ", tensor_name,
+                    " Got: ", shape_size, " Expected: ", expected_shape_size,
+                    " Please fix either the inputs/outputs or the model.");
+
+  // Collect all invalid dimension indices (skip the batch dimension at index 0).
+  InlinedVector<size_t> invalid_dim_indices;
+  invalid_dim_indices.reserve(shape_size);
+  for (size_t i = 1; i < shape_size; ++i) {
+    if (input_output_shape[i] != static_cast<int64_t>(expected_shape[i])) {
+      invalid_dim_indices.push_back(i);
+    }
+  }
+
+  if (!invalid_dim_indices.empty()) {
+    std::ostringstream ostr;
+    ostr << "Got invalid dimensions for tensor: " << tensor_name
+         << " for the following indices (excluding the batch dimension)\n";
+    for (const auto idx : invalid_dim_indices) {
+      ostr << " index: " << idx
+           << " Got: " << input_output_shape[idx]
+           << " Expected: " << expected_shape[idx] << "\n";
+    }
+    ostr << " Please fix either the inputs/outputs or the model.";
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, ostr.str());
+  }
+
+  return Status::OK();
+}
+
 Status QnnModel::SetupQnnInputOutput(const logging::Logger& logger) {
   LOGS(logger, VERBOSE) << "Setting up QNN input/output for graph: " << graph_info_->Name();
 
@@ -239,9 +279,28 @@ Status QnnModel::ExecuteGraph(const Ort::KernelContext& context,
     return element_size * length;
   };
 
+  constexpr size_t BATCH_DIMENSION_INDEX = 0;
+  uint32_t batch_multiplier = static_cast<uint32_t>(
+      TensorDataSize(context.GetInput(qnn_input_infos_[0].ort_index)) / qnn_input_infos_[0].tensor_byte_size);
+  auto backend_type = qnn_backend_manager_->GetQnnBackendType();
+  // Check whether a batch multiplier is used; it is only supported on the HTP backend.
+  ORT_RETURN_IF(batch_multiplier != 1 && !IsNpuBackend(backend_type),
+                "Batch multiplier is only supported on the HTP backend, but the current backend is: ",
+                static_cast<int>(backend_type));
+
+  // ===== Phase 1: Prepare inputs (thread-safe, no lock needed) =====
   std::vector<Qnn_Tensor_t> qnn_inputs;
   qnn_inputs.reserve(qnn_input_infos_.size());
 
+  // The dimensions field in Qnn_Tensor_t is a pointer, so qnn_inputs.push_back() performs a shallow copy.
+  // Multiple threads would then share the same dimensions array, leading to race conditions when the batch size is modified directly.
+  // To ensure thread safety, we create independent dimension copies for each thread when batch multiplier > 1.
+  // These copies are stored in input_dimensions_copies to keep them alive throughout execution.
+  std::vector<std::vector<uint32_t>> input_dimensions_copies;
+  if (batch_multiplier > 1) {
+    input_dimensions_copies.reserve(qnn_input_infos_.size());
+  }
+
   for (const auto& qnn_input_info : qnn_input_infos_) {
     LOGS(logger, VERBOSE) << "model_input = " << qnn_input_info.tensor_wrapper->GetName()
                           << " index = " << qnn_input_info.ort_index;
@@ -249,46 +308,110 @@ Status QnnModel::ExecuteGraph(const Ort::KernelContext& context,
     auto ort_tensor_size = TensorDataSize(ort_input_tensor);
     LOGS(logger, VERBOSE) << "Qnn tensor size: " << qnn_input_info.tensor_byte_size
                           << " Ort tensor size: " << ort_tensor_size;
-    ORT_RETURN_IF_NOT(qnn_input_info.tensor_byte_size == ort_tensor_size,
-                      "ORT Tensor data size does not match QNN tensor data size.");
-
+    LOGS(logger, VERBOSE) << "Original Qnn input tensor shape: " << qnn_input_info.ori_dimensions_;
+    LOGS(logger, VERBOSE) << "Qnn input tensor shape: " << qnn_input_info.tensor_wrapper->GetTensorDims();
+    LOGS(logger, VERBOSE) << "Ort input tensor shape: " << ort_input_tensor.GetTensorTypeAndShapeInfo().GetShape();
+    ORT_RETURN_IF_NOT(ort_tensor_size % qnn_input_info.tensor_byte_size == 0,
+                      "ORT tensor size (", ort_tensor_size, " bytes) must be equal to, or an integer multiple of, ",
+                      "the QNN tensor size (", qnn_input_info.tensor_byte_size, " bytes).");
+
+    // Check that every input implies the same batch multiplier value.
+    uint32_t bm = static_cast<uint32_t>(ort_tensor_size / qnn_input_info.tensor_byte_size);
+    ORT_RETURN_IF_NOT(bm == batch_multiplier,
+                      "Batch multiplier should be the same across all the inputs. Expected: ", batch_multiplier, ", Got: ", bm);
+
+    // Get the QNN tensor (shallow copy).
     qnn_inputs.push_back(qnn_input_info.tensor_wrapper->GetQnnTensor());
 
+    // Modify the batch dimension.
+    if (batch_multiplier > 1) {
+      // Check all dimensions except for the batch dimension (index 0).
+      ORT_RETURN_IF_ERROR(CheckShape(ort_input_tensor, qnn_input_info));
+
+      // Create an independent dimensions copy to avoid race conditions.
+      std::vector<uint32_t> dims_copy = qnn_input_info.ori_dimensions_;
+      dims_copy[BATCH_DIMENSION_INDEX] =
+          static_cast<uint32_t>(ort_input_tensor.GetTensorTypeAndShapeInfo().GetShape()[BATCH_DIMENSION_INDEX]);
+      LOGS(logger, VERBOSE) << "qnn_inputs batch size (batch multiplier triggered): " << dims_copy[BATCH_DIMENSION_INDEX];
+      input_dimensions_copies.push_back(std::move(dims_copy));
+      // Point the QNN tensor to the independent dimensions copy.
+      SetQnnTensorDim(qnn_inputs.back(), input_dimensions_copies.back());
+    }
+
     ORT_RETURN_IF_ERROR(BindQnnTensorMemoryToOrtValueMemory(
         logger,
         *qnn_backend_manager_,
         *static_cast<const OrtMemoryInfo*>(ort_input_tensor.GetTensorMemoryInfo()),
-        const_cast<void*>(ort_input_tensor.GetTensorRawData()), qnn_input_info.tensor_byte_size,
+        const_cast<void*>(ort_input_tensor.GetTensorRawData()), qnn_input_info.tensor_byte_size * batch_multiplier,
         graph_info_->GraphContext(), qnn_inputs.back()));
   }
 
+  // ===== Phase 2: Prepare outputs (thread-safe, no lock needed) =====
   std::vector<Qnn_Tensor_t> qnn_outputs;
   qnn_outputs.reserve(qnn_output_infos_.size());
 
+  // Create independent copies of the dimensions for outputs.
+  std::vector<std::vector<uint32_t>> output_dimensions_copies;
+  if (batch_multiplier > 1) {
+    output_dimensions_copies.reserve(qnn_output_infos_.size());
+  }
+
   for (auto& qnn_output_info : qnn_output_infos_) {
     const std::string& model_output_name = qnn_output_info.tensor_wrapper->GetName();
     LOGS(logger, VERBOSE) << "model_output = " << model_output_name
                           << " index = " << qnn_output_info.ort_index;
     const auto& ort_output_info = GetOutputInfo(model_output_name);
-    const std::vector<int64_t>& output_shape = ort_output_info->shape_;
+    // Adjust the output shape to match the input batch size.
+    std::vector<int64_t> output_shape = ort_output_info->shape_;
+    if (batch_multiplier > 1) {
+      output_shape[BATCH_DIMENSION_INDEX] *= batch_multiplier;
+      LOGS(logger, VERBOSE) << "batch multiplier triggered: " << batch_multiplier;
+      LOGS(logger, VERBOSE) << "Modify ORT output batch size to: " << output_shape[BATCH_DIMENSION_INDEX];
+    }
     auto ort_output_tensor = context.GetOutput(qnn_output_info.ort_index, output_shape.data(), output_shape.size());
     auto ort_tensor_size = TensorDataSize(ort_output_tensor);
     LOGS(logger, VERBOSE) << "Qnn tensor size: " << qnn_output_info.tensor_byte_size
                           << " Ort tensor size: " << ort_tensor_size;
-    ORT_RETURN_IF_NOT(qnn_output_info.tensor_byte_size == ort_tensor_size,
-                      "ORT Tensor data size does not match QNN tensor data size");
-
+    LOGS(logger, VERBOSE) << "Original Qnn output tensor shape: " << qnn_output_info.ori_dimensions_;
+    LOGS(logger, VERBOSE) << "ORT output tensor shape: " << ort_output_tensor.GetTensorTypeAndShapeInfo().GetShape();
+    LOGS(logger, VERBOSE) << "Qnn output tensor shape: " << qnn_output_info.tensor_wrapper->GetTensorDims();
+    ORT_RETURN_IF_NOT(ort_tensor_size % qnn_output_info.tensor_byte_size == 0,
+                      "ORT tensor size (", ort_tensor_size, " bytes) must be equal to, or an integer multiple of, ",
+                      "the QNN tensor size (", qnn_output_info.tensor_byte_size, " bytes).");
+    uint32_t bm = static_cast<uint32_t>(ort_tensor_size / qnn_output_info.tensor_byte_size);
+    ORT_RETURN_IF_NOT(bm == batch_multiplier,
+                      "Batch multiplier should be the same across all the inputs and outputs. Expected: ", batch_multiplier, ", Got: ", bm);
+
+    // Get the QNN tensor (shallow copy).
     qnn_outputs.push_back(qnn_output_info.tensor_wrapper->GetQnnTensor());
 
+    // Modify the batch dimension.
+    if (batch_multiplier > 1) {
+      // Check all dimensions except for the batch dimension (index 0).
+      ORT_RETURN_IF_ERROR(CheckShape(ort_output_tensor, qnn_output_info));
+
+      // Create an independent dimensions copy to avoid race conditions.
+      std::vector<uint32_t> dims_copy = qnn_output_info.ori_dimensions_;
+      dims_copy[BATCH_DIMENSION_INDEX] =
+          static_cast<uint32_t>(ort_output_tensor.GetTensorTypeAndShapeInfo().GetShape()[BATCH_DIMENSION_INDEX]);
+      LOGS(logger, VERBOSE) << "qnn_outputs batch size (batch multiplier triggered): " << dims_copy[BATCH_DIMENSION_INDEX];
+      output_dimensions_copies.push_back(std::move(dims_copy));
+      // Point the QNN tensor to the independent dimensions copy.
+      SetQnnTensorDim(qnn_outputs.back(), output_dimensions_copies.back());
+    }
+
     ORT_RETURN_IF_ERROR(BindQnnTensorMemoryToOrtValueMemory(
         logger,
         *qnn_backend_manager_,
         *static_cast<const OrtMemoryInfo*>(ort_output_tensor.GetTensorMemoryInfo()),
-        ort_output_tensor.GetTensorMutableRawData(), qnn_output_info.tensor_byte_size,
+        ort_output_tensor.GetTensorMutableRawData(), qnn_output_info.tensor_byte_size * batch_multiplier,
        graph_info_->GraphContext(), qnn_outputs.back()));
   }
 
+  // ===== Phase 3: Execute graph (requires mutex lock) =====
   Qnn_ErrorHandle_t execute_status = QNN_GRAPH_NO_ERROR;
   {
     const auto& qnn_interface = qnn_backend_manager_->GetQnnInterface();
@@ -355,6 +478,7 @@ Status QnnModel::SetupTensors(std::vector<QnnTensorInfo>& qnn_tensor_infos,
     qnn_tensor_info.tensor_wrapper = &tensor_wrapper;
     qnn_tensor_info.tensor_byte_size = static_cast<uint32_t>(length);
     qnn_tensor_info.ort_index = ort_index;
+    qnn_tensor_info.ori_dimensions_.assign(tensor_wrapper.GetTensorDims().begin(), tensor_wrapper.GetTensorDims().end());
   }
 
   // The number of graph inputs and the number of tensor wrappers may not match.
   // - For example, for ResizeNearestNeighbor op, Qnn only cares about the 1st input,
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model.h b/onnxruntime/core/providers/qnn/builder/qnn_model.h
index 9f10b319f1a57..7ee2cf6785dea 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_model.h
+++ b/onnxruntime/core/providers/qnn/builder/qnn_model.h
@@ -19,6 +19,7 @@ struct QnnTensorInfo {
   const QnnTensorWrapper* tensor_wrapper = nullptr;
   uint32_t tensor_byte_size = 0;
   size_t ort_index = 0;
+  std::vector<uint32_t> ori_dimensions_;
 };
 
 class QnnModel {
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.h b/onnxruntime/core/providers/qnn/builder/qnn_utils.h
index b234f7df375e9..56f632490af8d 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_utils.h
+++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.h
@@ -454,6 +454,14 @@ Status InsertConvertOp(QnnModelWrapper& qnn_model_wrapper,
  */
 Status GetPermToLastAxis(uint32_t axis, uint32_t rank, std::vector<uint32_t>& perm);
 
+template <typename T>
+inline std::ostream& operator<<(std::ostream& out, const std::vector<T>& vec) {
+  for (const auto& elem : vec) {
+    out << elem << " ";
+  }
+  return out;
+}
+
 }  // namespace utils
 }  // namespace qnn
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
index 2bdbfb9c1c62e..d1fe115b6eb9b 100644
--- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
+++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
@@ -304,6 +304,9 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio
     disable_cpu_ep_fallback_ = config_options->GetConfigOrDefault(
         kOrtSessionOptionsDisableCPUEPFallback, "0") == "1";
 
+    qnn_htp_batch_multiplier_ = config_options->GetConfigOrDefault(
+        kOrtSessionOptionsQnnHtpBatchMultiplier, "0") == "1";
+
     context_cache_enabled_ = config_options->GetConfigOrDefault(
         kOrtSessionOptionEpContextEnable, "0") == "1";
     LOGS_DEFAULT(VERBOSE) << "Context cache enable: " << context_cache_enabled_;
@@ -961,6 +964,20 @@ QNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer
     return result;
   }
 
+  // Report an error if the QNN HTP batch multiplier is enabled but CPU EP fallback has not been disabled,
+  // since the QNN HTP batch multiplier requires the whole graph to run on the HTP backend.
+  if (qnn_htp_batch_multiplier_) {
+    if (!disable_cpu_ep_fallback_) {
+      LOGS(logger, ERROR) << "QNN HTP batch multiplier is enabled while CPU EP fallback is not disabled.\n Please set the session option 'session.disable_cpu_ep_fallback' to '1'.";
+      return result;
+    }
+    const auto& backend_type = qnn_backend_manager_->GetQnnBackendType();
+    if (backend_type != qnn::QnnBackendType::HTP && backend_type != qnn::QnnBackendType::HTP_FP16) {
+      LOGS(logger, ERROR) << "QNN HTP batch multiplier is enabled while the HTP backend is not in use.\n Please use the HTP backend type.";
+      return result;
+    }
+  }
+
   if ((context_cache_enabled_ || is_qnn_ctx_model) && !IsQpuBackend(qnn_backend_manager_->GetQnnBackendType())) {
     LOGS(logger, ERROR) << "Qnn context cache only works for HTP/DSP/GPU backend.";
     return result;
diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.h b/onnxruntime/core/providers/qnn/qnn_execution_provider.h
index 6adf613932d66..57047ed563c42 100644
--- a/onnxruntime/core/providers/qnn/qnn_execution_provider.h
+++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.h
@@ -90,7 +90,8 @@ class QNNExecutionProvider : public IExecutionProvider {
   bool context_cache_enabled_ = false;
   std::string context_cache_path_cfg_ = "";
   std::string context_node_name_prefix_ = "";
-  bool disable_cpu_ep_fallback_ = false;  // True if CPU EP fallback has been disabled for this session.
+  bool disable_cpu_ep_fallback_ = false;   // True if CPU EP fallback has been disabled for this session.
+  bool qnn_htp_batch_multiplier_ = false;  // True if the QNN HTP batch multiplier option is enabled for this session.
   bool qnn_context_embed_mode_ = true;
   int32_t vtcm_size_in_mb_ = 0;
   bool enable_vtcm_backup_buffer_sharing_ = false;
diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
index e3291cdce62c5..fcd44183ab16b 100644
--- a/onnxruntime/core/session/inference_session.cc
+++ b/onnxruntime/core/session/inference_session.cc
@@ -2981,10 +2981,17 @@ Status InferenceSession::Run(const RunOptions& run_options,
 
   // log evaluation start to trace logging provider
   env.GetTelemetryProvider().LogEvaluationStart(session_id_);
-
+#ifdef USE_QNN
+  // When the QNN HTP batch multiplier is enabled, skip input/output validation since the runtime batch size may differ from the model's batch size.
+  const bool batch_multiplier = session_options_.config_options.GetConfigOrDefault(kOrtSessionOptionsQnnHtpBatchMultiplier, "0") == "1";
+  if (!batch_multiplier) {
+    ORT_RETURN_IF_ERROR_SESSIONID_(ValidateInputs(feed_names, feeds));
+    ORT_RETURN_IF_ERROR_SESSIONID_(ValidateOutputs(output_names, p_fetches));
+  }
+#else
   ORT_RETURN_IF_ERROR_SESSIONID_(ValidateInputs(feed_names, feeds));
   ORT_RETURN_IF_ERROR_SESSIONID_(ValidateOutputs(output_names, p_fetches));
-
+#endif
   // shrink certain default memory arenas if the user has requested for it
   const std::string& shrink_memory_arenas =
       run_options.config_options.GetConfigOrDefault(kOrtRunOptionsConfigEnableMemoryArenaShrinkage, "");
diff --git a/onnxruntime/test/providers/qnn/batch_multiplier_test.cc b/onnxruntime/test/providers/qnn/batch_multiplier_test.cc
new file mode 100644
index 0000000000000..f836694cc11b3
--- /dev/null
+++ b/onnxruntime/test/providers/qnn/batch_multiplier_test.cc
@@ -0,0 +1,318 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#if !defined(ORT_MINIMAL_BUILD)
+
+#include <random>
+#include <string>
+#include <vector>
+#include "core/graph/graph.h"
+#include "core/graph/node_attr_utils.h"
+#include "core/session/onnxruntime_session_options_config_keys.h"
+
+#include "test/providers/qnn/qnn_test_utils.h"
+
+#include "gtest/gtest.h"
+
+namespace onnxruntime {
+namespace test {
+
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
+/**
+ * Tests the accuracy of using the batch multiplier on QNN EP by running 3 inferences:
+ *
+ * 1. Run data with "batch multiplier batch size" on CPU EP (model compiled with the batch multiplier batch size) - baseline
+ * 2. Run data with "batch multiplier batch size" on QNN HTP (model compiled with the batch multiplier batch size)
+ * 3. Run data with "batch multiplier batch size" on QNN HTP (model compiled with the original batch size)
+ *
+ * This function checks that run #3 is at least as accurate (+- a small tolerance) as run #2.
+ * Accuracy is primarily measured by comparing both #2 and #3 to the baseline (#1).
+ *
+ * \param bm_model_fn Function that builds the model with the "batch multiplier batch size".
+ * \param ori_model_fn Function that builds the model with the "original batch size".
+ * \param qnn_options QNN EP provider options.
+ * \param opset_version The opset version.
+ * \param expected_ep_assignment Describes which nodes should be assigned to the EP.
+ * \param tolerance The tolerance (as a fraction) by which the relative error of the QNN HTP results with the
+ *                  batch multiplier may exceed the relative error without it, both measured against the CPU EP baseline.
+ * \param log_severity The logger's severity setting.
+ * \param qnn_ctx_model_path Optional path to a QNN context cache model.
+ */
+inline void TestModelBatchMultiplierAccuracy(
+    const GetTestModelFn& bm_model_fn,
+    const GetTestModelFn& ori_model_fn,
+    const ProviderOptions& qnn_options,
+    int opset_version,
+    ExpectedEPNodeAssignment expected_ep_assignment,
+    float tolerance = 0.004f,
+    logging::Severity log_severity = logging::Severity::kERROR,
+    const std::string& qnn_ctx_model_path = "",
+    const std::unordered_map<std::string, std::string>& session_option_pairs = {}) {
+  const std::unordered_map<std::string, int> domain_to_version = {{"", opset_version}, {kMSDomain, 1}};
+
+  auto& logging_manager = DefaultLoggingManager();
+  logging_manager.SetDefaultLoggerSeverity(log_severity);
+
+  // 1. Create the model with the batch multiplier batch size and serialize it to a string.
+  onnxruntime::Model bm_model("bm_model", false, ModelMetaData(), PathString(),
+                              IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {},
+                              logging_manager.DefaultLogger());
+  ModelTestBuilder bm_helper(bm_model.MainGraph());
+  std::string bm_model_data;
+  bm_model_fn(bm_helper);
+  bm_helper.SetGraphOutputs();
+  ASSERT_STATUS_OK(bm_model.MainGraph().Resolve());
+  bm_model.ToProto().SerializeToString(&bm_model_data);
+
+  // Run the FP32 model on CPU EP and collect the outputs (baseline).
+  std::vector<OrtValue> cpu_bm_outputs;
+  InferenceModel(bm_model_data, "bm_model_logger", {}, ExpectedEPNodeAssignment::All,
+                 bm_helper.feeds_, cpu_bm_outputs);
+  ASSERT_FALSE(cpu_bm_outputs.empty());
+
+  const size_t num_outputs = cpu_bm_outputs.size();
+
+  // Collect output values for comparison.
+  std::vector<gsl::span<const float>> output_vals;
+  output_vals.resize(num_outputs);
+
+  for (size_t i = 0; i < num_outputs; i++) {
+    auto& tensor = cpu_bm_outputs[i].Get<Tensor>();
+    output_vals[i] = tensor.DataAsSpan<float>();
+  }
+
+  // 2. Create the model with the original batch size and serialize it to a string.
+  onnxruntime::Model ori_model("ori_model", false, ModelMetaData(), PathString(),
+                               IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {},
+                               logging_manager.DefaultLogger());
+  ModelTestBuilder ori_helper(ori_model.MainGraph());
+  std::string ori_model_data;
+  ori_model_fn(ori_helper);
+  ori_helper.SetGraphOutputs();
+  ASSERT_STATUS_OK(ori_model.MainGraph().Resolve());
+  ori_model.ToProto().SerializeToString(&ori_model_data);
+
+  // 3. Run the original batch size model on QNN HTP EP with batch multiplier batch size input.
+  const bool is_qnn_ep = true;
+  TryEnableQNNSaver(const_cast<ProviderOptions&>(qnn_options));
+  std::vector<OrtValue> qnn_ori_outputs;
+
+  if (!qnn_ctx_model_path.empty()) {
+    onnx::ModelProto model_proto;
+    onnxruntime::Model qnn_ctx_model;
+    ASSERT_STATUS_OK(qnn_ctx_model.Load(ToPathString(qnn_ctx_model_path), model_proto));
+    std::string qnn_ctx_model_data;
+    model_proto.SerializeToString(&qnn_ctx_model_data);
+    InferenceModel(qnn_ctx_model_data, "qnn_ctx_model_logger", qnn_options,
+                   expected_ep_assignment, bm_helper.feeds_, qnn_ori_outputs, is_qnn_ep, session_option_pairs);
+  } else {
+    // To test the batch multiplier, run the original batch size model using batch multiplier batch size input data.
+    // Use bm_helper.feeds_ (batch multiplier size) instead of ori_helper.feeds_ (original size) for inference.
+    InferenceModel(ori_model_data, "ori_model_logger", qnn_options, expected_ep_assignment,
+                   bm_helper.feeds_, qnn_ori_outputs, is_qnn_ep, session_option_pairs);
+  }
+
+  // 4. Validate the outputs.
+  // Since HTP runs in FP16, we check whether the error between HTP with the batch multiplier and ORT CPU
+  // is no larger (within the tolerance) than the error between HTP without the batch multiplier and ORT CPU.
+  if (expected_ep_assignment != ExpectedEPNodeAssignment::None) {
+    // Run the batch multiplier batch size model using batch multiplier batch size input data on QNN EP.
+    std::vector<OrtValue> qnn_bm_outputs;
+    InferenceModel(bm_model_data, "bm_model_logger", qnn_options, expected_ep_assignment,
+                   bm_helper.feeds_, qnn_bm_outputs, is_qnn_ep, session_option_pairs);
+
+    ASSERT_EQ(qnn_ori_outputs.size(), num_outputs);
+    ASSERT_EQ(qnn_bm_outputs.size(), num_outputs);
+
+    // Limit the error message count in case a test with large data fails.
+    constexpr size_t max_error_count = 10;
+    size_t error_count = 0;
+
+    // Compare the accuracy of the ori@QNN_HTP results with the bm@CPU_EP baseline.
+    const std::string base_output_name = "output_";
+    for (size_t i = 0; i < num_outputs; i++) {
+      const std::string debug_output_name = base_output_name + std::to_string(i);
+      auto& qnn_ori_tensor = qnn_ori_outputs[i].Get<Tensor>();
+      auto& qnn_bm_tensor = qnn_bm_outputs[i].Get<Tensor>();
+
+      const size_t num_vals = output_vals[i].size();
+      gsl::span<const float> cpu_bm_vals = output_vals[i];
+      gsl::span<const float> qnn_ori_vals = qnn_ori_tensor.DataAsSpan<float>();
+      gsl::span<const float> qnn_bm_vals = qnn_bm_tensor.DataAsSpan<float>();
+
+      ASSERT_EQ(num_vals, qnn_ori_vals.size());
+      ASSERT_EQ(num_vals, qnn_bm_vals.size());
+
+      float max_qnn_ori_err = 0.0f;
+      float max_qnn_bm_err = 0.0f;
+
+      for (size_t j = 0; j < num_vals && error_count < max_error_count; j++) {
+        const float expected_val = cpu_bm_vals[j];  // bm@CPU_EP val ("ground-truth")
+        const float qnn_ori_val = qnn_ori_vals[j];  // ori@QNN_HTP val
+        const float qnn_bm_val = qnn_bm_vals[j];
+
+        // Calculate the relative errors of ori@QNN_HTP and bm@QNN_HTP against bm@CPU_EP.
+        constexpr float epsilon = 1e-16f;
+        const float qnn_ori_relative_err = std::fabs(expected_val - qnn_ori_val) / (std::fabs(expected_val) + epsilon);
+        const float qnn_bm_relative_err = std::fabs(expected_val - qnn_bm_val) / (std::fabs(expected_val) + epsilon);
+
+        // Error between w/ and w/o the batch multiplier on QNN HTP.
+        const float qnn_vals_err = std::fabs(qnn_ori_relative_err - qnn_bm_relative_err);
+        const bool is_as_accurate_as_without_bm = qnn_ori_relative_err <= qnn_bm_relative_err;
+        const bool qnn_vals_diff_within_tolerance = qnn_vals_err <= tolerance;
+
+        const bool passed_test = is_as_accurate_as_without_bm || qnn_vals_diff_within_tolerance;
+        if (!passed_test) {
+          ++error_count;
+        }
+        EXPECT_TRUE(passed_test)
+            << "Inaccuracy detected for output '" << debug_output_name
+            << "', element " << j << ", tolerance=" << (tolerance * 100) << "%"
+            << ".\nExpected val (bm@CPU_EP): " << expected_val
+            << "\nori@QNN_HTP val: " << qnn_ori_val
+            << "\nbm@QNN_HTP val: " << qnn_bm_val
+            << "\nQNN HTP 'original batch size' relative error: " << (qnn_ori_relative_err * 100) << "%"
+            << "\nQNN HTP 'batch multiplier batch size' relative error: " << (qnn_bm_relative_err * 100) << "%";
+
+        max_qnn_ori_err = std::max(max_qnn_ori_err, qnn_ori_relative_err);
+        max_qnn_bm_err = std::max(max_qnn_bm_err, qnn_bm_relative_err);
+      }
+
+      if (error_count > 0) {
+        std::cerr << std::endl
+                  << "[WARNING]: Output " << i
+                  << " required a larger tolerance to pass the accuracy checks" << std::endl
+                  << "Max ori relative error against bm@CPU_EP = " << (max_qnn_ori_err * 100) << "%" << std::endl
+                  << "Max bm relative error against bm@CPU_EP = " << (max_qnn_bm_err * 100) << "%" << std::endl
+                  << "Tolerance used = " << (tolerance * 100) << "%" << std::endl;
+      }
+    }
+  }
+}
+
+/**
+ * Tests batch multiplier accuracy by comparing the QNN HTP backend (with the batch multiplier)
+ * against the ORT CPU backend (without the batch multiplier).
+ *
+ * @param op_type The operator type (e.g., "Conv", "MatMul")
+ * @param input_defs Input definitions with the original batch size
+ * @param input_bm_defs Input definitions with the batch multiplier batch size
+ * @param attrs Operator attributes
+ * @param opset_version ONNX opset version
+ * @param expected_ep_assignment Expected EP node assignment
+ * @param op_domain Operator domain (default: kOnnxDomain)
+ * @param tolerance Relative error tolerance (default: 0.004)
+ */
+static void RunBatchMultiplierOpTest(
+    const std::string& op_type,
+    const std::vector<TestInputDef<float>>& input_defs,
+    const std::vector<TestInputDef<float>>& input_bm_defs,
+    const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs,
+    int opset_version,
+    ExpectedEPNodeAssignment expected_ep_assignment,
+    const std::string& op_domain = kOnnxDomain,
+    float tolerance = 0.004f) {
+  // Configure QNN HTP backend options
+  ProviderOptions provider_options;
+  provider_options["backend_type"] = "htp";
+  ProviderOptions session_options;
+  session_options["session.disable_cpu_ep_fallback"] = "1";
+  session_options["ep.qnn.enable_htp_batch_multiplier"] = "1";
+
+  // Build FP32 models
+  auto model_bm_fn = BuildOpTestCase<float>(op_type, input_bm_defs, {}, attrs, op_domain);
+  auto model_fn = BuildOpTestCase<float>(op_type, input_defs, {}, attrs, op_domain);
+
+  // Test FP32 batch multiplier accuracy
+  TestModelBatchMultiplierAccuracy(
+      model_bm_fn,
+      model_fn,
+      provider_options,
+      opset_version,
+      expected_ep_assignment,
+      tolerance,
+      logging::Severity::kERROR,
+      "",
+      session_options);
+}
+
+/**
+ * Helper function to test the Conv operator with different batch multiplier sizes.
+ *
+ * @param batch_multiplier_size The batch size to use for inference (e.g., 2, 4, 8, 16)
+ */
+static void TestConvBatchMultiplier(int64_t batch_multiplier_size) {
+  // Constants
+  constexpr int64_t kOriginalBatchSize = 1;
+  constexpr int64_t kInputChannels = 1;
+  constexpr int64_t kInputHeight = 5;
+  constexpr int64_t kInputWidth = 5;
+  constexpr int64_t kKernelSize = 3;
+  constexpr size_t kWeightDataSize = kInputChannels * kInputChannels * kKernelSize * kKernelSize;
+  const size_t kInputDataSize = batch_multiplier_size * kOriginalBatchSize * kInputChannels * kInputHeight * kInputWidth;
+
+  // Generate fixed weight data to ensure consistency across model builds
+  std::vector<float> weight_data(kWeightDataSize);
+  std::default_random_engine weight_generator(12345);
+  std::uniform_real_distribution<float> weight_distribution(-10.0f, 10.0f);
+  for (auto& val : weight_data) {
+    val = weight_distribution(weight_generator);
+  }
+
+  // Generate fixed input data for reproducible results
+  std::vector<float> input_data(kInputDataSize);
+  std::default_random_engine input_generator(6677);
+  std::uniform_real_distribution<float> input_distribution(0.0f, 10.0f);
+  for (auto& val : input_data) {
+    val = input_distribution(input_generator);
+  }
+
+  // Create input definitions with the original batch size
+  std::vector<TestInputDef<float>> input_defs;
+  input_defs.push_back(TestInputDef<float>(
+      {kOriginalBatchSize, kInputChannels, kInputHeight, kInputWidth},
+      false, 0.0f, 10.0f));  // Random data is OK here; this model is used for compilation only.
+  input_defs.push_back(TestInputDef<float>(
+      {kInputChannels, kInputChannels, kKernelSize, kKernelSize},
+      true, weight_data));
+  input_defs.push_back(TestInputDef<float>({kInputChannels}, true, {2.0f}));
+
+  // Create input definitions with the batch multiplier size
+  std::vector<TestInputDef<float>> input_bm_defs;
+  input_bm_defs.push_back(TestInputDef<float>(
+      {batch_multiplier_size, kInputChannels, kInputHeight, kInputWidth},
+      false, input_data));
+  input_bm_defs.push_back(TestInputDef<float>(
+      {kInputChannels, kInputChannels, kKernelSize, kKernelSize},
+      true, weight_data));
+  input_bm_defs.push_back(TestInputDef<float>({kInputChannels}, true, {2.0f}));
+
+  // Configure Conv operator attributes
+  std::vector<ONNX_NAMESPACE::AttributeProto> attrs;
+  attrs.push_back(utils::MakeAttribute("auto_pad", "NOTSET"));
+  attrs.push_back(utils::MakeAttribute("strides", std::vector<int64_t>{1, 1}));
+  attrs.push_back(utils::MakeAttribute("pads", std::vector<int64_t>{0, 0, 0, 0}));
+  attrs.push_back(utils::MakeAttribute("dilations", std::vector<int64_t>{1, 1}));
+
+  RunBatchMultiplierOpTest("Conv",
+                           input_defs,
+                           input_bm_defs,
+                           attrs,
+                           21,  // opset version
+                           ExpectedEPNodeAssignment::All,
+                           kOnnxDomain);
+}
+
+// Test batch multiplier accuracy for the Conv operator with batch sizes 2, 8, and 128.
+TEST_F(QnnHTPBackendTests, BatchMultiplier_Conv) {
+  TestConvBatchMultiplier(2);
+  TestConvBatchMultiplier(8);
+  TestConvBatchMultiplier(128);
+}
+
+#endif  // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
+
+}  // namespace test
+}  // namespace onnxruntime
+
+#endif
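
Usage sketch (illustrative only, not part of the change): the snippet below shows how an application could enable the new session option through the ORT C++ API, assuming a build with the QNN EP and a model compiled with batch size 1 (input shape {1, 1, 5, 5}, matching the Conv test above) that is fed a batch of 8 at run time. The model path and the "input"/"output" tensor names are hypothetical placeholders.

    // Minimal sketch; error handling omitted. "model.onnx", "input", and "output" are placeholders.
    #include <vector>
    #include "onnxruntime_cxx_api.h"

    int main() {
      Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "qnn_batch_multiplier_example");

      Ort::SessionOptions so;
      // The batch multiplier requires the whole graph on HTP, so CPU EP fallback must be disabled.
      so.AddConfigEntry("session.disable_cpu_ep_fallback", "1");
      so.AddConfigEntry("ep.qnn.enable_htp_batch_multiplier", "1");
      so.AppendExecutionProvider("QNN", {{"backend_type", "htp"}});

      // Model assumed to be compiled with input shape {1, 1, 5, 5}.
      Ort::Session session(env, ORT_TSTR("model.onnx"), so);

      // Feed 8x the compiled batch size without recompiling the graph.
      const int64_t batch = 8;
      std::vector<int64_t> input_shape = {batch, 1, 5, 5};
      std::vector<float> input_data(static_cast<size_t>(batch) * 1 * 5 * 5, 0.5f);

      Ort::MemoryInfo mem_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
      Ort::Value input = Ort::Value::CreateTensor<float>(mem_info, input_data.data(), input_data.size(),
                                                         input_shape.data(), input_shape.size());

      const char* input_names[] = {"input"};
      const char* output_names[] = {"output"};
      auto outputs = session.Run(Ort::RunOptions{nullptr}, input_names, &input, 1, output_names, 1);
      return 0;
    }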