@@ -74,61 +74,72 @@ std::shared_ptr<OVNetwork> OVCore::ReadModel(std::string&& model, const std::str
   }
 }

-OVExeNetwork OVCore::CompileModel(std::shared_ptr<const OVNetwork>& ie_cnn_network,
-                                  std::string& hw_target,
-                                  ov::AnyMap& device_config,
-                                  bool enable_causallm,
-                                  const std::string& name) {
-  ov::CompiledModel obj;
-  try {
-    if (enable_causallm) {
-      ov::AnyMap config;
+OVExeNetwork OVCore::StatefulCompileModel(std::shared_ptr<OVNetwork>& model,
+                                          std::string& hw_target,
+                                          const ov::AnyMap& device_config) {
+  ov::CompiledModel compiled_model;
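+  // Copy the device config so NPU-specific keys can be popped out locally without mutating the caller's map.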
+  ov::AnyMap config = device_config;

-      // Create a clone of ie_cnn_network, since it's a const ov::Model, and we need to patch it..
-      // Note! With this default path, the model runs but produces garbage (for NPUW). For CPU it's fine.
-      auto mutable_model = ie_cnn_network->clone();
+  if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) {
+    std::cout << "Stateless OV Model Statistic:" << std::endl;
+    LogBasicModelInfo(model);
+  }

-      if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) {
-        std::cout << "Stateless OV Model Statistic" << std::endl;
-        LogBasicModelInfo(mutable_model);
-      }
-      LogBasicModelInfo(mutable_model);
+  LOGS_DEFAULT(INFO) << log_tag << "Converting from Stateless OV Model to Stateful OV Model" << std::endl;
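+  // Patch the decoder only if the model is not already stateful.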
+  bool status = IsStateful(model);
+  std::cout << "IsStateful Status:\t" << status << std::endl;
+  if (!status) {
+    PatchStatefulDecoder(model);
+  }

-      LOGS_DEFAULT(INFO) << log_tag << "Converting from Stateless OV Model to Stateful OV Model" << std::endl;
-      PatchStatefulDecoder(mutable_model);
+  if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) {
+    std::cout << "Stateful OV Model Statistic:" << std::endl;
+    LogBasicModelInfo(model);
+  }

-      if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) {
-        std::cout << "Stateful OV Model Statistic" << std::endl;
-        LogBasicModelInfo(mutable_model);
-      }
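+  // Locate the batch and sequence-length axes (presumably of the KV-cache tensors); UpdateNPUConfig() consumes these below.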
+  auto kv_pos = GetKVAxesPos(model);
+  if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) {
+    std::cout << "kv_pos.batch = " << kv_pos.batch << std::endl;
+    std::cout << "kv_pos.seq_len = " << kv_pos.seq_len << std::endl;
+  }

-      // This patches the model so that it only produces the logits required for sampling.
-      // Actually either way that happens within NPUW::LLMCompiledModel creation, but this is
-      // here mostly to align this behavior for other devices (CPU, GPU).
-      ApplySliceBeforeMatmulTransformation(mutable_model);
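+  // For NPU, bound the prompt and response lengths, defaulting to 1024 and 128 tokens when the options are absent.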
+  if (hw_target.find("NPU") != std::string::npos) {
+    KVDesc kv_desc;
+    kv_desc.max_prompt_len = PopIntAndCast(config, "MAX_PROMPT_LEN").value_or(1024u);
+    kv_desc.min_response_len = PopIntAndCast(config, "MIN_RESPONSE_LEN").value_or(128u);

-      auto kv_pos = GetKVAxesPos(mutable_model);
-      if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) {
-        std::cout << "kv_pos.batch = " << kv_pos.batch << std::endl;
-        std::cout << "kv_pos.seq_len = " << kv_pos.seq_len << std::endl;
-      }
+    if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) {
+      std::cout << "kv_desc.max_prompt_len:\t" << kv_desc.max_prompt_len << std::endl;
+      std::cout << "kv_desc.min_response_len:\t" << kv_desc.min_response_len << std::endl;
+    }

-      if (hw_target.find("NPU") != std::string::npos) {
-        KVDesc kv_desc;
-        kv_desc.max_prompt_len = PopIntAndCast(device_config, "MAX_PROMPT_LEN").value_or(1024u);
-        kv_desc.min_response_len = PopIntAndCast(device_config, "MIN_RESPONSE_LEN").value_or(128u);
+    UpdateNPUConfig(config, kv_pos, kv_desc);
+  } else {
+    // This patches the model so that it only produces the logits required for sampling.
+    // Actually either way that happens within NPUW::LLMCompiledModel creation, but this is
+    // here mostly to align this behavior for other devices (CPU, GPU).
+    ApplySliceBeforeMatmulTransformation(model);
+  }

-      if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) {
-        std::cout << "kv_desc.max_prompt_len = " << kv_desc.max_prompt_len << std::endl;
-        std::cout << "kv_desc.min_response_len = " << kv_desc.min_response_len << std::endl;
-      }
+  std::cout << "Compiling Stateful OV Model ..." << std::endl;
+  compiled_model = OVCore::Get()->core.compile_model(model, hw_target, config);
+  std::cout << "Stateful OV Model Compilation Complete" << std::endl;

-        UpdateNPUConfig(config, kv_pos, kv_desc);
-      }
+  OVExeNetwork exe(compiled_model);
+  return exe;
+}

-      std::cout << "Compiling Stateful OV Model..." << std::endl;
-      obj = core.compile_model(mutable_model, hw_target, config);
-      std::cout << "Stateful OV Model Compilation Complete" << std::endl;
+OVExeNetwork OVCore::CompileModel(std::shared_ptr<const OVNetwork>& ie_cnn_network,
+                                  std::string& hw_target,
+                                  ov::AnyMap& device_config,
+                                  bool enable_causallm,
+                                  const std::string& name) {
+  ov::CompiledModel obj;
+  try {
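+    // Causal-LM models take the stateful path: clone the (const) input model so it can be patched in place.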
+    if (enable_causallm) {
+      auto mutable_model = ie_cnn_network->clone();
+      auto compiled_model = OVCore::Get()->StatefulCompileModel(mutable_model, hw_target, device_config);
+      obj = compiled_model.Get();
     } else {
       obj = core.compile_model(ie_cnn_network, hw_target, device_config);
     }
@@ -166,10 +177,68 @@ OVExeNetwork OVCore::CompileModel(const std::string& onnx_model,
 OVExeNetwork OVCore::ImportModel(std::istream& model_stream,
                                  std::string hw_target,
                                  const ov::AnyMap& device_config,
+                                 bool enable_causallm,
                                  std::string name) {
   try {
     ov::CompiledModel obj;
-    obj = core.import_model(model_stream, hw_target, device_config);
+
+    // Check if it's XML
+    std::streampos originalPos = model_stream.tellg();
+    // Allocate space for "<?xml"
+    std::string header(5, '\0');
+    model_stream.read(&header[0], 5);
+
+    // Clear any read errors
+    model_stream.clear();
+    // Restore the stream position (important for reusing the stream)
+    model_stream.seekg(originalPos);
+
+    if (header != "<?xml") {
+      obj = core.import_model(model_stream, hw_target, device_config);
+    } else {
+      // Get path to bin file
+      std::string bin_file;
+      if (name.size() >= 5 && name.substr(name.size() - 5) == ".onnx") {
+        bin_file = name;
+        bin_file.replace(name.size() - 5, 5, ".bin");
+      } else {
+        throw std::runtime_error("Invalid model name. Make sure *.onnx, *.xml, and *.bin carry the same name.");
+      }
+
+      // Read the model XML into a string
+      std::stringstream xml_stream;
+      xml_stream << model_stream.rdbuf();
+      std::string xml_content = xml_stream.str();
+
+      // Read model.bin into a vector
+      std::ifstream bin_stream;
+      bin_stream.open(bin_file, std::ios::binary);
+      if (!bin_stream.is_open()) {
+        throw std::runtime_error("Failed to open " + bin_file);
+      }
+
+      bin_stream.seekg(0, std::ios::end);
+      std::streamsize size = bin_stream.tellg();
+      bin_stream.seekg(0, std::ios::beg);
+      std::vector<uint8_t> bin_data(size);
+      if (!bin_stream.read(reinterpret_cast<char*>(bin_data.data()), size)) {
+        throw std::runtime_error("Failed to read binary data from " + bin_file);
+      }
+
+      // Create an ov::Tensor for weights
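+      // The (element_type, shape, data) overload of ov::Tensor wraps bin_data without copying it,
+      // so bin_data must stay alive while the tensor is in use.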
+      ov::Tensor weights_tensor(ov::element::u8, {bin_data.size()}, bin_data.data());
+
+      // Load the model explicitly with XML content and weights
+      std::shared_ptr<ov::Model> model = core.read_model(xml_content, weights_tensor);
+
+      if (enable_causallm) {
+        auto compiled_model = OVCore::Get()->StatefulCompileModel(model, hw_target, device_config);
+        obj = compiled_model.Get();
+      } else {
+        obj = core.compile_model(model, hw_target, device_config);
+      }
+    }
+
 #ifndef NDEBUG
     printDebugInfo(obj);
 #endif