
Commit 85f7a21

[QNN-EP] Add LoraV2 Support with offline QNN context binary
Description
- Add a new run option, lora_config, to feed in the information from the LoRA binary
- Parse and apply the LoRA binary in OnRunStart

Motivation and Context
- Support using a LoRA adapter binary together with a QNN context binary
1 parent fe7634e commit 85f7a21

File tree

4 files changed: +79, -0 lines changed

include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h

Lines changed: 3 additions & 0 deletions
@@ -43,6 +43,9 @@ static const char* const kOrtRunOptionsConfigQnnPerfModePostRun = "qnn.htp_perf_
 // Set RPC control latency for QNN HTP backend
 static const char* const kOrtRunOptionsConfigQnnRpcControlLatency = "qnn.rpc_control_latency";
 
+// Set the QNN LoRA config file used to apply LoRA in a QNN context binary
+static const char* const kOrtRunOptionsConfigQnnLoraConfig = "qnn.lora_config";
+
 // Set graph annotation id for CUDA EP. Use with enable_cuda_graph=true.
 // The value should be an integer. If the value is not set, the default value is 0 and
 // ORT session only captures one cuda graph before another capture is requested.
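
For context, here is a minimal sketch of how an application might set this new run-option key through the ONNX Runtime C++ API. The session setup and the file path are illustrative assumptions, not part of this commit:

    #include <onnxruntime_cxx_api.h>

    // Assumes a session was already created with the QNN execution provider
    // and a model backed by an offline-generated QNN context binary.
    Ort::RunOptions run_options;
    // The value is the path to a LoRA config file (placeholder path shown).
    run_options.AddConfigEntry("qnn.lora_config", "/path/to/lora_config.txt");
    // Pass run_options to session.Run(...); OnRunStart then picks up the entry.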

onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc

Lines changed: 68 additions & 0 deletions
@@ -52,6 +52,74 @@ static const char* DlError() {
 #endif
 }
 
+Status readBinaryFromFile(std::string filePath, uint8_t* buffer, size_t bufferSize) {
+  ORT_RETURN_IF(nullptr == buffer, "Binary buffer is nullptr");
+  std::ifstream in(filePath, std::ifstream::binary);
+  ORT_RETURN_IF(!in, "Failed to open input file: ", filePath.c_str());
+  ORT_RETURN_IF(!in.read(reinterpret_cast<char*>(buffer), bufferSize), "Failed to read the contents of: ", filePath.c_str());
+  return Status::OK();
+}
+
+Status QnnBackendManager::ParseLoraConfig(std::string lora_config_path) {
+  LOGS_DEFAULT(INFO) << "Acquiring the QnnInterface " << lora_config_path;
+
+  QnnInterface_t* backend_interface_provider{nullptr};
+  auto rt = GetQnnInterfaceProvider<QnnInterfaceGetProvidersFn_t,
+                                    QnnInterface_t>(backend_path_.c_str(),
+                                                    "QnnInterface_getProviders",
+                                                    &backend_lib_handle_,
+                                                    {QNN_API_VERSION_MAJOR,
+                                                     QNN_API_VERSION_MINOR,
+                                                     QNN_API_VERSION_PATCH},
+                                                    &backend_interface_provider);
+  ORT_RETURN_IF_ERROR(rt);
+  qnn_interface_ = backend_interface_provider->QNN_INTERFACE_VER_NAME;
+
+  // The QNN LoRA config file is a single line with the graph name first,
+  // followed by the QNN LoRA context binary path, separated by a semicolon (;).
+  // Example: <graph_name>;<binary_path>
+  LOGS_DEFAULT(INFO) << "Loading Lora Config " << lora_config_path;
+  std::ifstream file(lora_config_path);
+  std::string line;
+
+  if (file.is_open()) {
+    if (std::getline(file, line)) {
+      std::istringstream ss(line);
+      std::string graph_name;
+      std::string lora_adapter_bin_path;
+
+      if (std::getline(ss, graph_name, ';') && std::getline(ss, lora_adapter_bin_path)) {
+        size_t bufferSize = std::filesystem::file_size(lora_adapter_bin_path.c_str());
+
+        ORT_RETURN_IF(0 == bufferSize, "Received path to an empty file. Nothing to deserialize.");
+        std::unique_ptr<uint8_t[]> buffer = std::make_unique<uint8_t[]>(bufferSize);
+        void* voidBufferPtr = static_cast<void*>(buffer.get());
+        QnnContext_Buffer_t contextBuffer{QNN_CONTEXT_BUFFER_VERSION_1,
+                                          {QNN_CONTEXTMEMTYPE_RAW, {voidBufferPtr, bufferSize}}};
+
+        auto status = readBinaryFromFile(lora_adapter_bin_path,
+                                         reinterpret_cast<uint8_t*>(buffer.get()),
+                                         bufferSize);
+        ORT_RETURN_IF(status != Status::OK(), "Failed to read binary data.");
+
+        // Retrieve the target graph and apply the updatable (LoRA) section of the binary.
+        Qnn_GraphHandle_t graph;
+        qnn_interface_.graphRetrieve(contexts_[0], graph_name.c_str(), &graph);
+
+        qnn_interface_.contextApplyBinarySection(
+            contexts_[0], graph, QNN_CONTEXT_SECTION_UPDATABLE, &contextBuffer, profile_backend_handle_, nullptr);
+      }
+    }
+    file.close();
+  } else {
+    LOGS_DEFAULT(ERROR) << "Unable to load Lora Config " << lora_config_path;
+  }
+
+  return Status::OK();
+}
+
 template <typename F, class T>
 Status QnnBackendManager::GetQnnInterfaceProvider(const char* lib_path,
                                                   const char* interface_provider_name,
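
Given the parsing logic above, the file passed via qnn.lora_config is expected to contain a single line of the form <graph_name>;<binary_path>. A hypothetical example (both values are placeholders, not taken from this commit):

    my_qnn_graph;/path/to/lora_adapter_context.bin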

onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h

Lines changed: 2 additions & 0 deletions
@@ -140,6 +140,8 @@ class QnnBackendManager : public std::enable_shared_from_this<QnnBackendManager>
                           const Qnn_Tensor_t& qnn_tensor,
                           Qnn_MemHandle_t& mem_handle);
 
+  Status ParseLoraConfig(std::string lora_config);
+
  private:
   Status LoadBackend();

onnxruntime/core/providers/qnn/qnn_execution_provider.cc

Lines changed: 6 additions & 0 deletions
@@ -1202,6 +1202,12 @@ Status QNNExecutionProvider::OnRunStart(const onnxruntime::RunOptions& run_optio
     }
   }
 
+  std::string lora_config = "";
+  if (TryGetConfigEntry(config_options, kOrtRunOptionsConfigQnnLoraConfig, lora_config)) {
+    LOGS_DEFAULT(VERBOSE) << "lora_config: " << lora_config;
+    ORT_RETURN_IF_ERROR(qnn_backend_manager_->ParseLoraConfig(lora_config));
+  }
+
   return Status::OK();
 }
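
Native callers that build against the ONNX Runtime headers can use the constant added in the first file of this commit instead of the raw string. A sketch, assuming onnxruntime_run_options_config_keys.h is available on the include path and with a placeholder config path:

    #include "onnxruntime_run_options_config_keys.h"
    #include <onnxruntime_cxx_api.h>

    Ort::RunOptions run_options;
    // kOrtRunOptionsConfigQnnLoraConfig expands to "qnn.lora_config".
    run_options.AddConfigEntry(kOrtRunOptionsConfigQnnLoraConfig, "/path/to/lora_config.txt");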
