Skip to content

Commit 2ac10fd

Browse files
committed
[QNN-EP] Add LoraV2 Support with offline QNN context binary
Description: Add a new run option, lora_config, that supplies the LoRA binary information through a config file, and parse and apply the LoRA binary in OnRunStart. Motivation and Context: Support LoRA adapter binaries with QNN context binaries. Usage
1 parent fe7634e commit 2ac10fd

File tree

4 files changed

+75
-0
lines changed

4 files changed

+75
-0
lines changed

include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,9 @@ static const char* const kOrtRunOptionsConfigQnnPerfModePostRun = "qnn.htp_perf_
4343
// Set RPC control latency for QNN HTP backend
4444
static const char* const kOrtRunOptionsConfigQnnRpcControlLatency = "qnn.rpc_control_latency";
4545

46+
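// Set the path of the QNN LoRA config file used to apply a LoRA adapter to a QNN context binary.
// The file holds a single line of the form: <graph_name>;<binary_path>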
47+
static const char* const kOrtRunOptionsConfigQnnLoraConfig = "qnn.lora_config";
48+
4649
// Set graph annotation id for CUDA EP. Use with enable_cuda_graph=true.
4750
// The value should be an integer. If the value is not set, the default value is 0 and
4851
// ORT session only captures one cuda graph before another capture is requested.

onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,70 @@ static const char* DlError() {
5252
#endif
5353
}
5454

55+
// Reads `buffer_size` bytes from the beginning of the file at `file_path` into `buffer`.
//
// Returns a failure Status when `buffer` is null, when the file cannot be opened,
// or when the stream reports a failure after the read (e.g. the file holds fewer
// than `buffer_size` bytes).
Status ReadBinaryFromFile(const std::string& file_path, uint8_t* buffer, size_t buffer_size) {
  ORT_RETURN_IF(buffer == nullptr, "Binary buffer is nullptr");

  std::ifstream input_stream(file_path, std::ifstream::binary);
  ORT_RETURN_IF(input_stream.fail(), "Failed to open input file: ", file_path.c_str());

  // The stream API works on char, so view the byte buffer as char for the read.
  char* const destination = reinterpret_cast<char*>(buffer);
  input_stream.read(destination, buffer_size);
  ORT_RETURN_IF(input_stream.fail(), "Failed to read the contents of: ", file_path.c_str());

  return Status::OK();
}
62+
63+
// Applies a LoRA adapter binary, described by a config file, to a loaded QNN context.
//
// `lora_config_path` is the path of a text file whose first line has the form
//   <graph_name>;<binary_path>
// i.e. the graph name first, followed by the path of the QNN LoRA context binary,
// separated by a semicolon (;). Only the first line is read.
//
// The adapter binary is read into memory, the first context in `contexts_` from which
// `graph_name` can be retrieved is located, and the binary is applied to that graph as
// its updatable section.
//
// Returns:
//   - Status::OK() when the binary was applied, and also when the config file cannot be
//     opened, is empty, or its first line is malformed (those cases are only logged).
//   - A failure Status when the adapter binary's size cannot be queried, the binary is
//     empty or cannot be read, no context contains `graph_name`, or QNN rejects the
//     binary section.
Status QnnBackendManager::ParseLoraConfig(std::string lora_config_path) {
  LOGS_DEFAULT(INFO) << "Loading Lora Config " << lora_config_path;

  std::ifstream file(lora_config_path);
  if (!file.is_open()) {
    // NOTE(review): an unreadable config is logged but still reported as success, so the
    // run proceeds without the adapter. Kept for backward compatibility -- confirm whether
    // this should become a hard failure.
    LOGS_DEFAULT(ERROR) << "Unable to load Lora Config " << lora_config_path;
    return Status::OK();
  }

  std::string line;
  if (!std::getline(file, line)) {
    LOGS_DEFAULT(WARNING) << "Lora Config " << lora_config_path << " is empty. No Lora adapter applied.";
    return Status::OK();
  }

  std::istringstream ss(line);
  std::string graph_name;
  std::string lora_adapter_bin_path;
  if (!std::getline(ss, graph_name, ';') || !std::getline(ss, lora_adapter_bin_path)) {
    LOGS_DEFAULT(WARNING) << "Lora Config " << lora_config_path
                          << " is malformed, expected <graph_name>;<binary_path>. No Lora adapter applied.";
    return Status::OK();
  }

  // A config file written with CRLF line endings leaves a trailing '\r' on the path
  // when it is read on a platform that does not translate line endings.
  if (!lora_adapter_bin_path.empty() && lora_adapter_bin_path.back() == '\r') {
    lora_adapter_bin_path.pop_back();
  }

  // Use the non-throwing overload: a missing or inaccessible binary must surface as a
  // Status, not as a std::filesystem::filesystem_error escaping this function.
  std::error_code ec;
  const auto adapter_file_size = std::filesystem::file_size(lora_adapter_bin_path, ec);
  ORT_RETURN_IF(ec.value() != 0, "Failed to get the size of Lora adapter binary: ", lora_adapter_bin_path,
                ". ", ec.message());

  const size_t buffer_size = static_cast<size_t>(adapter_file_size);
  ORT_RETURN_IF(0 == buffer_size, "Received path to an empty file. Nothing to deserialize.");

  std::unique_ptr<uint8_t[]> buffer = std::make_unique<uint8_t[]>(buffer_size);
  // Propagate the reader's own Status so the caller sees which step failed and for which file.
  ORT_RETURN_IF_ERROR(ReadBinaryFromFile(lora_adapter_bin_path, buffer.get(), buffer_size));

  void* voidBufferPtr = static_cast<void*>(buffer.get());
  QnnContext_Buffer_t contextBuffer{QNN_CONTEXT_BUFFER_VERSION_1,
                                    {QNN_CONTEXTMEMTYPE_RAW, {{voidBufferPtr, buffer_size}}}};

  // Apply the binary to the first context that owns a graph with the requested name.
  Qnn_GraphHandle_t graph = nullptr;
  bool graph_retrieve_success = false;
  for (size_t cIdx = 0; cIdx < contexts_.size(); cIdx++) {
    auto graph_retrieve_rt = qnn_interface_.graphRetrieve(contexts_[cIdx], graph_name.c_str(), &graph);
    if (QNN_SUCCESS != graph_retrieve_rt) {
      continue;
    }

    graph_retrieve_success = true;

    auto context_apply_binary_section_rt = qnn_interface_.contextApplyBinarySection(
        contexts_[cIdx], graph, QNN_CONTEXT_SECTION_UPDATABLE, &contextBuffer, profile_backend_handle_, nullptr);
    ORT_RETURN_IF(QNN_SUCCESS != context_apply_binary_section_rt, "Failed to apply binary section.");
    break;
  }
  ORT_RETURN_IF_NOT(graph_retrieve_success, "Failed to retrieve graph: ", graph_name, " and apply binary section.");

  return Status::OK();
}
118+
55119
template <typename F, class T>
56120
Status QnnBackendManager::GetQnnInterfaceProvider(const char* lib_path,
57121
const char* interface_provider_name,

onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,8 @@ class QnnBackendManager : public std::enable_shared_from_this<QnnBackendManager>
140140
const Qnn_Tensor_t& qnn_tensor,
141141
Qnn_MemHandle_t& mem_handle);
142142

143+
// Reads the LoRA config file at the given path and applies the LoRA adapter binary it
// references to a loaded QNN context.
// The config file is a single line of the form <graph_name>;<binary_path>.
// Returns a failure Status if the adapter binary is empty or cannot be read, if the
// named graph cannot be retrieved from any context, or if applying the binary section fails.
Status ParseLoraConfig(std::string lora_config);
144+
143145
private:
144146
Status LoadBackend();
145147

onnxruntime/core/providers/qnn/qnn_execution_provider.cc

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1202,6 +1202,12 @@ Status QNNExecutionProvider::OnRunStart(const onnxruntime::RunOptions& run_optio
12021202
}
12031203
}
12041204

1205+
std::string lora_config = "";
1206+
if (TryGetConfigEntry(config_options, kOrtRunOptionsConfigQnnLoraConfig, lora_config)) {
1207+
LOGS_DEFAULT(VERBOSE) << "lora_config: " << lora_config;
1208+
ORT_RETURN_IF_ERROR(qnn_backend_manager_->ParseLoraConfig(lora_config));
1209+
}
1210+
12051211
return Status::OK();
12061212
}
12071213

0 commit comments

Comments
 (0)