diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
index 64a434e2fe301..0c1f684615da8 100644
--- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
+++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
@@ -415,3 +415,10 @@ static const char* const kOrtSessionOptionsFailOnSuboptimalCompiledModel =
 // "high_power_saver", "low_balanced", "extreme_power_saver", "low_power_saver", "power_saver",
 // "sustained_high_performance". Default to "default".
 static const char* const kOrtEpDynamicOptionsQnnHtpPerformanceMode = "ep.dynamic.qnn_htp_performance_mode";
+
+// Enable the QNN HTP batch multiplier.
+//
+// Option values:
+// - "0": QNN HTP batch multiplier is disabled. [DEFAULT]
+// - "1": QNN HTP batch multiplier is enabled.
+static const char* const kOrtSessionOptionsQnnHtpBatchMultiplier = "ep.qnn.enable_htp_batch_multiplier";
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model.cc b/onnxruntime/core/providers/qnn/builder/qnn_model.cc
index 175a76b590895..30422e8654e26 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_model.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_model.cc
@@ -174,6 +174,46 @@ Status QnnModel::FinalizeGraphs(const logging::Logger& logger) {
   return Status::OK();
 }
 
+// Helper function to check whether an ORT tensor shape matches the expected QNN tensor shape (excluding the batch dimension at index 0).
+template <typename OrtValueType>
+static Status CheckShape(const OrtValueType& ort_tensor,
+                         const QnnTensorInfo& qnn_io_info) {
+  const auto input_output_shape = ort_tensor.GetTensorTypeAndShapeInfo().GetShape();
+  const auto shape_size = input_output_shape.size();
+  const auto& expected_shape = qnn_io_info.ori_dimensions_;
+  const auto expected_shape_size = expected_shape.size();
+  const auto& tensor_name = qnn_io_info.tensor_wrapper->GetName();
+
+  ORT_RETURN_IF_NOT(shape_size == expected_shape_size,
+                    "Invalid rank for tensor: ", tensor_name,
+                    " Got: ", shape_size, " Expected: ", expected_shape_size,
+                    " Please fix either the inputs/outputs or the model.");
+
+  // Collect all invalid dimension indices (skip the batch dimension at index 0).
+  InlinedVector<size_t> invalid_dim_indices;
+  invalid_dim_indices.reserve(shape_size);
+  for (size_t i = 1; i < shape_size; ++i) {
+    if (input_output_shape[i] != static_cast<int64_t>(expected_shape[i])) {
+      invalid_dim_indices.push_back(i);
+    }
+  }
+
+  if (!invalid_dim_indices.empty()) {
+    std::ostringstream ostr;
+    ostr << "Got invalid dimensions for tensor: " << tensor_name
+         << " for the following indices (excluding the batch dimension)\n";
+    for (const auto idx : invalid_dim_indices) {
+      ostr << " index: " << idx
+           << " Got: " << input_output_shape[idx]
+           << " Expected: " << expected_shape[idx] << "\n";
+    }
+    ostr << " Please fix either the inputs/outputs or the model.";
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, ostr.str());
+  }
+
+  return Status::OK();
+}
+
 Status QnnModel::SetupQnnInputOutput(const logging::Logger& logger) {
   LOGS(logger, VERBOSE) << "Setting up QNN input/output for graph: " << graph_info_->Name();
 
@@ -239,9 +279,28 @@ Status QnnModel::ExecuteGraph(const Ort::KernelContext& context,
     return element_size * length;
   };
 
+  constexpr size_t BATCH_DIMENSION_INDEX = 0;
+  uint32_t batch_multiplier = static_cast<uint32_t>(
+      TensorDataSize(context.GetInput(qnn_input_infos_[0].ort_index)) / qnn_input_infos_[0].tensor_byte_size);
+  auto backend_type = qnn_backend_manager_->GetQnnBackendType();
+  // Check whether a batch multiplier is used; it is only supported on the HTP backend.
+  ORT_RETURN_IF(batch_multiplier != 1 && !IsNpuBackend(backend_type),
+                "Batch multiplier is only supported on the HTP backend, but the current backend is: ",
+                static_cast<int>(backend_type));
+
+  // ===== Phase 1: Prepare inputs (thread-safe, no lock needed) =====
   std::vector<Qnn_Tensor_t> qnn_inputs;
   qnn_inputs.reserve(qnn_input_infos_.size());
 
+  // The dimensions field in Qnn_Tensor_t is a pointer, so qnn_inputs.push_back() performs a shallow copy.
+  // Multiple threads would then share the same dimensions array, leading to race conditions when the batch size is modified directly.
+  // To ensure thread safety, we create independent dimension copies for each thread when batch multiplier > 1.
+  // These copies are stored in input_dimensions_copies to keep them alive throughout execution.
+  std::vector<std::vector<uint32_t>> input_dimensions_copies;
+  if (batch_multiplier > 1) {
+    input_dimensions_copies.reserve(qnn_input_infos_.size());
+  }
+
   for (const auto& qnn_input_info : qnn_input_infos_) {
     LOGS(logger, VERBOSE) << "model_input = " << qnn_input_info.tensor_wrapper->GetName()
                           << " index = " << qnn_input_info.ort_index;
@@ -249,46 +308,110 @@ Status QnnModel::ExecuteGraph(const Ort::KernelContext& context,
     auto ort_tensor_size = TensorDataSize(ort_input_tensor);
     LOGS(logger, VERBOSE) << "Qnn tensor size: " << qnn_input_info.tensor_byte_size
                           << " Ort tensor size: " << ort_tensor_size;
-    ORT_RETURN_IF_NOT(qnn_input_info.tensor_byte_size == ort_tensor_size,
-                      "ORT Tensor data size does not match QNN tensor data size.");
-
+    LOGS(logger, VERBOSE) << "Original Qnn input tensor shape: " << qnn_input_info.ori_dimensions_;
+    LOGS(logger, VERBOSE) << "Qnn input tensor shape: " << qnn_input_info.tensor_wrapper->GetTensorDims();
+    LOGS(logger, VERBOSE) << "Ort input tensor shape: " << ort_input_tensor.GetTensorTypeAndShapeInfo().GetShape();
+    ORT_RETURN_IF_NOT(ort_tensor_size % qnn_input_info.tensor_byte_size == 0,
+                      "ORT tensor size (", ort_tensor_size, " bytes) must be equal to, or an integer multiple of, ",
+                      "the QNN tensor size (", qnn_input_info.tensor_byte_size, " bytes).");
+
+    // Check that every input implies the same batch multiplier value.
+    uint32_t bm = static_cast<uint32_t>(ort_tensor_size / qnn_input_info.tensor_byte_size);
+    ORT_RETURN_IF_NOT(bm == batch_multiplier,
+                      "Batch multiplier should be the same across all the inputs. Expected: ", batch_multiplier, ", Got: ", bm);
+
+    // Get the QNN tensor (shallow copy).
     qnn_inputs.push_back(qnn_input_info.tensor_wrapper->GetQnnTensor());
 
+    // Modify the batch dimension.
+    if (batch_multiplier > 1) {
+      // Check all dimensions except for the batch dimension (index 0).
+      ORT_RETURN_IF_ERROR(CheckShape(ort_input_tensor, qnn_input_info));
+
+      // Create an independent dimensions copy to avoid race conditions.
+      std::vector<uint32_t> dims_copy = qnn_input_info.ori_dimensions_;
+      dims_copy[BATCH_DIMENSION_INDEX] =
+          static_cast<uint32_t>(ort_input_tensor.GetTensorTypeAndShapeInfo().GetShape()[BATCH_DIMENSION_INDEX]);
+      LOGS(logger, VERBOSE) << "qnn_inputs batch size (batch multiplier triggered): " << dims_copy[BATCH_DIMENSION_INDEX];
+      input_dimensions_copies.push_back(std::move(dims_copy));
+      // Point the QNN tensor to the independent dimensions copy.
+      SetQnnTensorDim(qnn_inputs.back(), input_dimensions_copies.back());
+    }
+
     ORT_RETURN_IF_ERROR(BindQnnTensorMemoryToOrtValueMemory(
         logger,
         *qnn_backend_manager_,
         *static_cast<const OrtMemoryInfo*>(ort_input_tensor.GetTensorMemoryInfo()),
-        const_cast<void*>(ort_input_tensor.GetTensorRawData()), qnn_input_info.tensor_byte_size,
+        const_cast<void*>(ort_input_tensor.GetTensorRawData()), qnn_input_info.tensor_byte_size * batch_multiplier,
         graph_info_->GraphContext(), qnn_inputs.back()));
   }
 
+  // ===== Phase 2: Prepare outputs (thread-safe, no lock needed) =====
   std::vector<Qnn_Tensor_t> qnn_outputs;
   qnn_outputs.reserve(qnn_output_infos_.size());
 
+  // Create independent copies of the dimensions for outputs.
+  std::vector<std::vector<uint32_t>> output_dimensions_copies;
+  if (batch_multiplier > 1) {
+    output_dimensions_copies.reserve(qnn_output_infos_.size());
+  }
+
   for (auto& qnn_output_info : qnn_output_infos_) {
     const std::string& model_output_name = qnn_output_info.tensor_wrapper->GetName();
     LOGS(logger, VERBOSE) << "model_output = " << model_output_name
                           << " index = " << qnn_output_info.ort_index;
     const auto& ort_output_info = GetOutputInfo(model_output_name);
-    const std::vector<int64_t>& output_shape = ort_output_info->shape_;
+    // Adjust the output shape to match the input batch size.
+    std::vector<int64_t> output_shape = ort_output_info->shape_;
+    if (batch_multiplier > 1) {
+      output_shape[BATCH_DIMENSION_INDEX] *= batch_multiplier;
+      LOGS(logger, VERBOSE) << "batch multiplier triggered: " << batch_multiplier;
+      LOGS(logger, VERBOSE) << "Modify ORT output batch size to: " << output_shape[BATCH_DIMENSION_INDEX];
+    }
     auto ort_output_tensor = context.GetOutput(qnn_output_info.ort_index, output_shape.data(), output_shape.size());
     auto ort_tensor_size = TensorDataSize(ort_output_tensor);
     LOGS(logger, VERBOSE) << "Qnn tensor size: " << qnn_output_info.tensor_byte_size
                           << " Ort tensor size: " << ort_tensor_size;
-    ORT_RETURN_IF_NOT(qnn_output_info.tensor_byte_size == ort_tensor_size,
-                      "ORT Tensor data size does not match QNN tensor data size");
-
+    LOGS(logger, VERBOSE) << "Original Qnn output tensor shape: " << qnn_output_info.ori_dimensions_;
+    LOGS(logger, VERBOSE) << "ORT output tensor shape: " << ort_output_tensor.GetTensorTypeAndShapeInfo().GetShape();
+    LOGS(logger, VERBOSE) << "Qnn output tensor shape: " << qnn_output_info.tensor_wrapper->GetTensorDims();
+    ORT_RETURN_IF_NOT(ort_tensor_size % qnn_output_info.tensor_byte_size == 0,
+                      "ORT tensor size (", ort_tensor_size, " bytes) must be equal to, or an integer multiple of, ",
+                      "the QNN tensor size (", qnn_output_info.tensor_byte_size, " bytes).");
+    uint32_t bm = static_cast<uint32_t>(ort_tensor_size / qnn_output_info.tensor_byte_size);
+    ORT_RETURN_IF_NOT(bm == batch_multiplier,
+                      "Batch multiplier should be the same across all the inputs and outputs. Expected: ", batch_multiplier, ", Got: ", bm);
+
+    // Get the QNN tensor (shallow copy).
     qnn_outputs.push_back(qnn_output_info.tensor_wrapper->GetQnnTensor());
 
+    // Modify the batch dimension.
+    if (batch_multiplier > 1) {
+      // Check all dimensions except for the batch dimension (index 0).
+      ORT_RETURN_IF_ERROR(CheckShape(ort_output_tensor, qnn_output_info));
+
+      // Create an independent dimensions copy to avoid race conditions.
+      std::vector<uint32_t> dims_copy = qnn_output_info.ori_dimensions_;
+      dims_copy[BATCH_DIMENSION_INDEX] =
+          static_cast<uint32_t>(ort_output_tensor.GetTensorTypeAndShapeInfo().GetShape()[BATCH_DIMENSION_INDEX]);
+      LOGS(logger, VERBOSE) << "qnn_outputs batch size (batch multiplier triggered): " << dims_copy[BATCH_DIMENSION_INDEX];
+      output_dimensions_copies.push_back(std::move(dims_copy));
+      // Point the QNN tensor to the independent dimensions copy.
+      SetQnnTensorDim(qnn_outputs.back(), output_dimensions_copies.back());
+    }
+
     ORT_RETURN_IF_ERROR(BindQnnTensorMemoryToOrtValueMemory(
         logger,
         *qnn_backend_manager_,
         *static_cast<const OrtMemoryInfo*>(ort_output_tensor.GetTensorMemoryInfo()),
-        ort_output_tensor.GetTensorMutableRawData(), qnn_output_info.tensor_byte_size,
+        ort_output_tensor.GetTensorMutableRawData(), qnn_output_info.tensor_byte_size * batch_multiplier,
        graph_info_->GraphContext(), qnn_outputs.back()));
   }
 
+  // ===== Phase 3: Execute graph (requires mutex lock) =====
   Qnn_ErrorHandle_t execute_status = QNN_GRAPH_NO_ERROR;
   {
     const auto& qnn_interface = qnn_backend_manager_->GetQnnInterface();
@@ -355,6 +478,7 @@ Status QnnModel::SetupTensors(std::vector<QnnTensorInfo>& qnn_tensor_infos,
     qnn_tensor_info.tensor_wrapper = &tensor_wrapper;
     qnn_tensor_info.tensor_byte_size = static_cast<uint32_t>(length);
     qnn_tensor_info.ort_index = ort_index;
+    qnn_tensor_info.ori_dimensions_.assign(tensor_wrapper.GetTensorDims().begin(), tensor_wrapper.GetTensorDims().end());
   }
 
   // The number of graph inputs and the number of tensor wrappers may not match.
   // - For example, for ResizeNearestNeighbor op, Qnn only cares about the 1st input,
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model.h b/onnxruntime/core/providers/qnn/builder/qnn_model.h
index 9f10b319f1a57..7ee2cf6785dea 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_model.h
+++ b/onnxruntime/core/providers/qnn/builder/qnn_model.h
@@ -19,6 +19,7 @@ struct QnnTensorInfo {
   const QnnTensorWrapper* tensor_wrapper = nullptr;
   uint32_t tensor_byte_size = 0;
   size_t ort_index = 0;
+  std::vector<uint32_t> ori_dimensions_;
 };
 
 class QnnModel {
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.h b/onnxruntime/core/providers/qnn/builder/qnn_utils.h
index b234f7df375e9..56f632490af8d 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_utils.h
+++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.h
@@ -454,6 +454,14 @@ Status InsertConvertOp(QnnModelWrapper& qnn_model_wrapper,
  */
 Status GetPermToLastAxis(uint32_t axis, uint32_t rank, std::vector<uint32_t>& perm);
 
+template <typename T>
+inline std::ostream& operator<<(std::ostream& out, const std::vector<T>& vec) {
+  for (const auto& elem : vec) {
+    out << elem << " ";
+  }
+  return out;
+}
+
 }  // namespace utils
 }  // namespace qnn
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
index 2bdbfb9c1c62e..d1fe115b6eb9b 100644
--- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
+++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
@@ -304,6 +304,9 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio
     disable_cpu_ep_fallback_ = config_options->GetConfigOrDefault(
         kOrtSessionOptionsDisableCPUEPFallback, "0") == "1";
 
+    qnn_htp_batch_multiplier_ = config_options->GetConfigOrDefault(
+        kOrtSessionOptionsQnnHtpBatchMultiplier, "0") == "1";
+
     context_cache_enabled_ = config_options->GetConfigOrDefault(
         kOrtSessionOptionEpContextEnable, "0") == "1";
     LOGS_DEFAULT(VERBOSE) << "Context cache enable: " << context_cache_enabled_;
@@ -961,6 +964,20 @@ QNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer
     return result;
   }
 
+  // Report an error if the QNN HTP batch multiplier is enabled but CPU EP fallback has not been disabled,
+  // since the QNN HTP batch multiplier requires the whole graph to run on the HTP backend.
+  if (qnn_htp_batch_multiplier_) {
+    if (!disable_cpu_ep_fallback_) {
+      LOGS(logger, ERROR) << "QNN HTP batch multiplier is enabled while CPU EP fallback is not disabled.\n Please set the session option 'session.disable_cpu_ep_fallback' to '1'.";
+      return result;
+    }
+    const auto& backend_type = qnn_backend_manager_->GetQnnBackendType();
+    if (backend_type != qnn::QnnBackendType::HTP && backend_type != qnn::QnnBackendType::HTP_FP16) {
+      LOGS(logger, ERROR) << "QNN HTP batch multiplier is enabled while the HTP backend is not in use.\n Please use the HTP backend type.";
+      return result;
+    }
+  }
+
   if ((context_cache_enabled_ || is_qnn_ctx_model) && !IsQpuBackend(qnn_backend_manager_->GetQnnBackendType())) {
     LOGS(logger, ERROR) << "Qnn context cache only works for HTP/DSP/GPU backend.";
     return result;
diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.h b/onnxruntime/core/providers/qnn/qnn_execution_provider.h
index 6adf613932d66..57047ed563c42 100644
--- a/onnxruntime/core/providers/qnn/qnn_execution_provider.h
+++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.h
@@ -90,7 +90,8 @@ class QNNExecutionProvider : public IExecutionProvider {
   bool context_cache_enabled_ = false;
   std::string context_cache_path_cfg_ = "";
   std::string context_node_name_prefix_ = "";
-  bool disable_cpu_ep_fallback_ = false;  // True if CPU EP fallback has been disabled for this session.
+  bool disable_cpu_ep_fallback_ = false;   // True if CPU EP fallback has been disabled for this session.
+  bool qnn_htp_batch_multiplier_ = false;  // True if the QNN HTP batch multiplier option is enabled for this session.
   bool qnn_context_embed_mode_ = true;
   int32_t vtcm_size_in_mb_ = 0;
   bool enable_vtcm_backup_buffer_sharing_ = false;
diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
index e3291cdce62c5..fcd44183ab16b 100644
--- a/onnxruntime/core/session/inference_session.cc
+++ b/onnxruntime/core/session/inference_session.cc
@@ -2981,10 +2981,17 @@ Status InferenceSession::Run(const RunOptions& run_options,
 
   // log evaluation start to trace logging provider
   env.GetTelemetryProvider().LogEvaluationStart(session_id_);
-
+#ifdef USE_QNN
+  // When the QNN HTP batch multiplier is enabled, skip input/output validation since the runtime batch size may differ from the model's batch size.
+  const bool batch_multiplier = session_options_.config_options.GetConfigOrDefault(kOrtSessionOptionsQnnHtpBatchMultiplier, "0") == "1";
+  if (!batch_multiplier) {
+    ORT_RETURN_IF_ERROR_SESSIONID_(ValidateInputs(feed_names, feeds));
+    ORT_RETURN_IF_ERROR_SESSIONID_(ValidateOutputs(output_names, p_fetches));
+  }
+#else
   ORT_RETURN_IF_ERROR_SESSIONID_(ValidateInputs(feed_names, feeds));
   ORT_RETURN_IF_ERROR_SESSIONID_(ValidateOutputs(output_names, p_fetches));
-
+#endif
   // shrink certain default memory arenas if the user has requested for it
   const std::string& shrink_memory_arenas =
       run_options.config_options.GetConfigOrDefault(kOrtRunOptionsConfigEnableMemoryArenaShrinkage, "");
diff --git a/onnxruntime/test/providers/qnn/batch_multiplier_test.cc b/onnxruntime/test/providers/qnn/batch_multiplier_test.cc
new file mode 100644
index 0000000000000..f836694cc11b3
--- /dev/null
+++ b/onnxruntime/test/providers/qnn/batch_multiplier_test.cc
@@ -0,0 +1,318 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#if !defined(ORT_MINIMAL_BUILD)
+
+#include <random>
+#include <string>
+#include <vector>
+#include "core/graph/graph.h"
+#include "core/graph/node_attr_utils.h"
+#include "core/session/onnxruntime_session_options_config_keys.h"
+
+#include "test/providers/qnn/qnn_test_utils.h"
+
+#include "gtest/gtest.h"
+
+namespace onnxruntime {
+namespace test {
+
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
+/**
+ * Tests the accuracy of using the batch multiplier on QNN EP by running 3 inferences:
+ *
+ * 1. Run data with "batch multiplier batch size" on CPU EP (model compiled with the batch multiplier batch size) - baseline
+ * 2. Run data with "batch multiplier batch size" on QNN HTP (model compiled with the batch multiplier batch size)
+ * 3. Run data with "batch multiplier batch size" on QNN HTP (model compiled with the original batch size)
+ *
+ * This function checks that run #3 is at least as accurate (+- a small tolerance) as run #2.
+ * Accuracy is primarily measured by comparing both #2 and #3 to the baseline (#1).
+ *
+ * \param bm_model_fn Function that builds the model with the "batch multiplier batch size".
+ * \param ori_model_fn Function that builds the model with the "original batch size".
+ * \param qnn_options QNN EP provider options.
+ * \param opset_version The opset version.
+ * \param expected_ep_assignment Describes which nodes should be assigned to the EP.
+ * \param tolerance The tolerance (as a fraction) by which the relative error of the QNN HTP results with the
+ *                  batch multiplier may exceed the relative error without it, both measured against the CPU EP baseline.
+ * \param log_severity The logger's severity setting.
+ * \param qnn_ctx_model_path Optional path to a QNN context cache model.
+ */
+inline void TestModelBatchMultiplierAccuracy(
+    const GetTestModelFn& bm_model_fn,
+    const GetTestModelFn& ori_model_fn,
+    const ProviderOptions& qnn_options,
+    int opset_version,
+    ExpectedEPNodeAssignment expected_ep_assignment,
+    float tolerance = 0.004f,
+    logging::Severity log_severity = logging::Severity::kERROR,
+    const std::string& qnn_ctx_model_path = "",
+    const std::unordered_map<std::string, std::string>& session_option_pairs = {}) {
+  const std::unordered_map<std::string, int> domain_to_version = {{"", opset_version}, {kMSDomain, 1}};
+
+  auto& logging_manager = DefaultLoggingManager();
+  logging_manager.SetDefaultLoggerSeverity(log_severity);
+
+  // 1. Create the model with the batch multiplier batch size and serialize it to a string.
+  onnxruntime::Model bm_model("bm_model", false, ModelMetaData(), PathString(),
+                              IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {},
+                              logging_manager.DefaultLogger());
+  ModelTestBuilder bm_helper(bm_model.MainGraph());
+  std::string bm_model_data;
+  bm_model_fn(bm_helper);
+  bm_helper.SetGraphOutputs();
+  ASSERT_STATUS_OK(bm_model.MainGraph().Resolve());
+  bm_model.ToProto().SerializeToString(&bm_model_data);
+
+  // Run the FP32 model on CPU EP and collect the outputs (baseline).
+  std::vector<OrtValue> cpu_bm_outputs;
+  InferenceModel(bm_model_data, "bm_model_logger", {}, ExpectedEPNodeAssignment::All,
+                 bm_helper.feeds_, cpu_bm_outputs);
+  ASSERT_FALSE(cpu_bm_outputs.empty());
+
+  const size_t num_outputs = cpu_bm_outputs.size();
+
+  // Collect output values for comparison.
+  std::vector<gsl::span<const float>> output_vals;
+  output_vals.resize(num_outputs);
+
+  for (size_t i = 0; i < num_outputs; i++) {
+    auto& tensor = cpu_bm_outputs[i].Get<Tensor>();
+    output_vals[i] = tensor.DataAsSpan<float>();
+  }
+
+  // 2. Create the model with the original batch size and serialize it to a string.
+  onnxruntime::Model ori_model("ori_model", false, ModelMetaData(), PathString(),
+                               IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {},
+                               logging_manager.DefaultLogger());
+  ModelTestBuilder ori_helper(ori_model.MainGraph());
+  std::string ori_model_data;
+  ori_model_fn(ori_helper);
+  ori_helper.SetGraphOutputs();
+  ASSERT_STATUS_OK(ori_model.MainGraph().Resolve());
+  ori_model.ToProto().SerializeToString(&ori_model_data);
+
+  // 3. Run the original batch size model on QNN HTP EP with batch multiplier batch size input.
+  const bool is_qnn_ep = true;
+  TryEnableQNNSaver(const_cast<ProviderOptions&>(qnn_options));
+  std::vector<OrtValue> qnn_ori_outputs;
+
+  if (!qnn_ctx_model_path.empty()) {
+    onnx::ModelProto model_proto;
+    onnxruntime::Model qnn_ctx_model;
+    ASSERT_STATUS_OK(qnn_ctx_model.Load(ToPathString(qnn_ctx_model_path), model_proto));
+    std::string qnn_ctx_model_data;
+    model_proto.SerializeToString(&qnn_ctx_model_data);
+    InferenceModel(qnn_ctx_model_data, "qnn_ctx_model_logger", qnn_options,
+                   expected_ep_assignment, bm_helper.feeds_, qnn_ori_outputs, is_qnn_ep, session_option_pairs);
+  } else {
+    // To test the batch multiplier, run the original batch size model using batch multiplier batch size input data.
+    // Use bm_helper.feeds_ (batch multiplier size) instead of ori_helper.feeds_ (original size) for inference.
+    InferenceModel(ori_model_data, "ori_model_logger", qnn_options, expected_ep_assignment,
+                   bm_helper.feeds_, qnn_ori_outputs, is_qnn_ep, session_option_pairs);
+  }
+
+  // 4. Validate the outputs.
+  // Since HTP runs in FP16, we check whether the error between HTP with the batch multiplier and ORT CPU
+  // is no larger (within the tolerance) than the error between HTP without the batch multiplier and ORT CPU.
+  if (expected_ep_assignment != ExpectedEPNodeAssignment::None) {
+    // Run the batch multiplier batch size model using batch multiplier batch size input data on QNN EP.
+    std::vector<OrtValue> qnn_bm_outputs;
+    InferenceModel(bm_model_data, "bm_model_logger", qnn_options, expected_ep_assignment,
+                   bm_helper.feeds_, qnn_bm_outputs, is_qnn_ep, session_option_pairs);
+
+    ASSERT_EQ(qnn_ori_outputs.size(), num_outputs);
+    ASSERT_EQ(qnn_bm_outputs.size(), num_outputs);
+
+    // Limit the error message count in case a test with large data fails.
+    constexpr size_t max_error_count = 10;
+    size_t error_count = 0;
+
+    // Compare the accuracy of the ori@QNN_HTP results with the bm@CPU_EP baseline.
+    const std::string base_output_name = "output_";
+    for (size_t i = 0; i < num_outputs; i++) {
+      const std::string debug_output_name = base_output_name + std::to_string(i);
+      auto& qnn_ori_tensor = qnn_ori_outputs[i].Get<Tensor>();
+      auto& qnn_bm_tensor = qnn_bm_outputs[i].Get<Tensor>();
+
+      const size_t num_vals = output_vals[i].size();
+      gsl::span<const float> cpu_bm_vals = output_vals[i];
+      gsl::span<const float> qnn_ori_vals = qnn_ori_tensor.DataAsSpan<float>();
+      gsl::span<const float> qnn_bm_vals = qnn_bm_tensor.DataAsSpan<float>();
+
+      ASSERT_EQ(num_vals, qnn_ori_vals.size());
+      ASSERT_EQ(num_vals, qnn_bm_vals.size());
+
+      float max_qnn_ori_err = 0.0f;
+      float max_qnn_bm_err = 0.0f;
+
+      for (size_t j = 0; j < num_vals && error_count < max_error_count; j++) {
+        const float expected_val = cpu_bm_vals[j];  // bm@CPU_EP val ("ground-truth")
+        const float qnn_ori_val = qnn_ori_vals[j];  // ori@QNN_HTP val
+        const float qnn_bm_val = qnn_bm_vals[j];
+
+        // Calculate the relative errors of ori@QNN_HTP and bm@QNN_HTP against bm@CPU_EP.
+        constexpr float epsilon = 1e-16f;
+        const float qnn_ori_relative_err = std::fabs(expected_val - qnn_ori_val) / (std::fabs(expected_val) + epsilon);
+        const float qnn_bm_relative_err = std::fabs(expected_val - qnn_bm_val) / (std::fabs(expected_val) + epsilon);
+
+        // Error between w/ and w/o the batch multiplier on QNN HTP.
+        const float qnn_vals_err = std::fabs(qnn_ori_relative_err - qnn_bm_relative_err);
+        const bool is_as_accurate_as_without_bm = qnn_ori_relative_err <= qnn_bm_relative_err;
+        const bool qnn_vals_diff_within_tolerance = qnn_vals_err <= tolerance;
+
+        const bool passed_test = is_as_accurate_as_without_bm || qnn_vals_diff_within_tolerance;
+        if (!passed_test) {
+          ++error_count;
+        }
+        EXPECT_TRUE(passed_test)
+            << "Inaccuracy detected for output '" << debug_output_name
+            << "', element " << j << ", tolerance=" << (tolerance * 100) << "%"
+            << ".\nExpected val (bm@CPU_EP): " << expected_val
+            << "\nori@QNN_HTP val: " << qnn_ori_val
+            << "\nbm@QNN_HTP val: " << qnn_bm_val
+            << "\nQNN HTP 'original batch size' relative error: " << (qnn_ori_relative_err * 100) << "%"
+            << "\nQNN HTP 'batch multiplier batch size' relative error: " << (qnn_bm_relative_err * 100) << "%";
+
+        max_qnn_ori_err = std::max(max_qnn_ori_err, qnn_ori_relative_err);
+        max_qnn_bm_err = std::max(max_qnn_bm_err, qnn_bm_relative_err);
+      }
+
+      if (error_count > 0) {
+        std::cerr << std::endl
+                  << "[WARNING]: Output " << i
+                  << " required a larger tolerance to pass the accuracy checks" << std::endl
+                  << "Max ori relative error against bm@CPU_EP = " << (max_qnn_ori_err * 100) << "%" << std::endl
+                  << "Max bm relative error against bm@CPU_EP = " << (max_qnn_bm_err * 100) << "%" << std::endl
+                  << "Tolerance used = " << (tolerance * 100) << "%" << std::endl;
+      }
+    }
+  }
+}
+
+/**
+ * Tests batch multiplier accuracy by comparing the QNN HTP backend (with the batch multiplier)
+ * against the ORT CPU backend (without the batch multiplier).
+ *
+ * @param op_type The operator type (e.g., "Conv", "MatMul")
+ * @param input_defs Input definitions with the original batch size
+ * @param input_bm_defs Input definitions with the batch multiplier batch size
+ * @param attrs Operator attributes
+ * @param opset_version ONNX opset version
+ * @param expected_ep_assignment Expected EP node assignment
+ * @param op_domain Operator domain (default: kOnnxDomain)
+ * @param tolerance Relative error tolerance (default: 0.004)
+ */
+static void RunBatchMultiplierOpTest(
+    const std::string& op_type,
+    const std::vector<TestInputDef<float>>& input_defs,
+    const std::vector<TestInputDef<float>>& input_bm_defs,
+    const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs,
+    int opset_version,
+    ExpectedEPNodeAssignment expected_ep_assignment,
+    const std::string& op_domain = kOnnxDomain,
+    float tolerance = 0.004f) {
+  // Configure QNN HTP backend options
+  ProviderOptions provider_options;
+  provider_options["backend_type"] = "htp";
+  ProviderOptions session_options;
+  session_options["session.disable_cpu_ep_fallback"] = "1";
+  session_options["ep.qnn.enable_htp_batch_multiplier"] = "1";
+
+  // Build FP32 models
+  auto model_bm_fn = BuildOpTestCase<float>(op_type, input_bm_defs, {}, attrs, op_domain);
+  auto model_fn = BuildOpTestCase<float>(op_type, input_defs, {}, attrs, op_domain);
+
+  // Test FP32 batch multiplier accuracy
+  TestModelBatchMultiplierAccuracy(
+      model_bm_fn,
+      model_fn,
+      provider_options,
+      opset_version,
+      expected_ep_assignment,
+      tolerance,
+      logging::Severity::kERROR,
+      "",
+      session_options);
+}
+
+/**
+ * Helper function to test the Conv operator with different batch multiplier sizes.
+ *
+ * @param batch_multiplier_size The batch size to use for inference (e.g., 2, 4, 8, 16)
+ */
+static void TestConvBatchMultiplier(int64_t batch_multiplier_size) {
+  // Constants
+  constexpr int64_t kOriginalBatchSize = 1;
+  constexpr int64_t kInputChannels = 1;
+  constexpr int64_t kInputHeight = 5;
+  constexpr int64_t kInputWidth = 5;
+  constexpr int64_t kKernelSize = 3;
+  constexpr size_t kWeightDataSize = kInputChannels * kInputChannels * kKernelSize * kKernelSize;
+  const size_t kInputDataSize = batch_multiplier_size * kOriginalBatchSize * kInputChannels * kInputHeight * kInputWidth;
+
+  // Generate fixed weight data to ensure consistency across model builds
+  std::vector<float> weight_data(kWeightDataSize);
+  std::default_random_engine weight_generator(12345);
+  std::uniform_real_distribution<float> weight_distribution(-10.0f, 10.0f);
+  for (auto& val : weight_data) {
+    val = weight_distribution(weight_generator);
+  }
+
+  // Generate fixed input data for reproducible results
+  std::vector<float> input_data(kInputDataSize);
+  std::default_random_engine input_generator(6677);
+  std::uniform_real_distribution<float> input_distribution(0.0f, 10.0f);
+  for (auto& val : input_data) {
+    val = input_distribution(input_generator);
+  }
+
+  // Create input definitions with the original batch size
+  std::vector<TestInputDef<float>> input_defs;
+  input_defs.push_back(TestInputDef<float>(
+      {kOriginalBatchSize, kInputChannels, kInputHeight, kInputWidth},
+      false, 0.0f, 10.0f));  // Random data is OK here; this model is used for compilation only.
+  input_defs.push_back(TestInputDef<float>(
+      {kInputChannels, kInputChannels, kKernelSize, kKernelSize},
+      true, weight_data));
+  input_defs.push_back(TestInputDef<float>({kInputChannels}, true, {2.0f}));
+
+  // Create input definitions with the batch multiplier size
+  std::vector<TestInputDef<float>> input_bm_defs;
+  input_bm_defs.push_back(TestInputDef<float>(
+      {batch_multiplier_size, kInputChannels, kInputHeight, kInputWidth},
+      false, input_data));
+  input_bm_defs.push_back(TestInputDef<float>(
+      {kInputChannels, kInputChannels, kKernelSize, kKernelSize},
+      true, weight_data));
+  input_bm_defs.push_back(TestInputDef<float>({kInputChannels}, true, {2.0f}));
+
+  // Configure Conv operator attributes
+  std::vector<ONNX_NAMESPACE::AttributeProto> attrs;
+  attrs.push_back(utils::MakeAttribute("auto_pad", "NOTSET"));
+  attrs.push_back(utils::MakeAttribute("strides", std::vector<int64_t>{1, 1}));
+  attrs.push_back(utils::MakeAttribute("pads", std::vector<int64_t>{0, 0, 0, 0}));
+  attrs.push_back(utils::MakeAttribute("dilations", std::vector<int64_t>{1, 1}));
+
+  RunBatchMultiplierOpTest("Conv",
+                           input_defs,
+                           input_bm_defs,
+                           attrs,
+                           21,  // opset version
+                           ExpectedEPNodeAssignment::All,
+                           kOnnxDomain);
+}
+
+// Test batch multiplier accuracy for the Conv operator with batch sizes 2, 8, and 128.
+TEST_F(QnnHTPBackendTests, BatchMultiplier_Conv) {
+  TestConvBatchMultiplier(2);
+  TestConvBatchMultiplier(8);
+  TestConvBatchMultiplier(128);
+}
+
+#endif  // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
+
+}  // namespace test
+}  // namespace onnxruntime
+
+#endif
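
Usage sketch (illustrative only, not part of the change): the snippet below shows how an application could enable the new session option through the ORT C++ API, assuming a build with the QNN EP and a model compiled with batch size 1 (input shape {1, 1, 5, 5}, matching the Conv test above) that is fed a batch of 8 at run time. The model path and the "input"/"output" tensor names are hypothetical placeholders.

    // Minimal sketch; error handling omitted. "model.onnx", "input", and "output" are placeholders.
    #include <vector>
    #include "onnxruntime_cxx_api.h"

    int main() {
      Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "qnn_batch_multiplier_example");

      Ort::SessionOptions so;
      // The batch multiplier requires the whole graph on HTP, so CPU EP fallback must be disabled.
      so.AddConfigEntry("session.disable_cpu_ep_fallback", "1");
      so.AddConfigEntry("ep.qnn.enable_htp_batch_multiplier", "1");
      so.AppendExecutionProvider("QNN", {{"backend_type", "htp"}});

      // Model assumed to be compiled with input shape {1, 1, 5, 5}.
      Ort::Session session(env, ORT_TSTR("model.onnx"), so);

      // Feed 8x the compiled batch size without recompiling the graph.
      const int64_t batch = 8;
      std::vector<int64_t> input_shape = {batch, 1, 5, 5};
      std::vector<float> input_data(static_cast<size_t>(batch) * 1 * 5 * 5, 0.5f);

      Ort::MemoryInfo mem_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
      Ort::Value input = Ort::Value::CreateTensor<float>(mem_info, input_data.data(), input_data.size(),
                                                         input_shape.data(), input_shape.size());

      const char* input_names[] = {"input"};
      const char* output_names[] = {"output"};
      auto outputs = session.Run(Ort::RunOptions{nullptr}, input_names, &input, 1, output_names, 1);
      return 0;
    }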