@@ -74,61 +74,72 @@ std::shared_ptr<OVNetwork> OVCore::ReadModel(std::string&& model, const std::str
   }
 }

-OVExeNetwork OVCore::CompileModel(std::shared_ptr<const OVNetwork>& ie_cnn_network,
-                                  std::string& hw_target,
-                                  ov::AnyMap& device_config,
-                                  bool enable_causallm,
-                                  const std::string& name) {
-  ov::CompiledModel obj;
-  try {
-    if (enable_causallm) {
-      ov::AnyMap config;
+OVExeNetwork OVCore::StatefulCompileModel(std::shared_ptr<OVNetwork>& model,
+                                          std::string& hw_target,
+                                          const ov::AnyMap& device_config) {
+  ov::CompiledModel compiled_model;
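+  // Copy the device config so NPU-specific keys can be popped out locally without mutating the caller's map.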
+  ov::AnyMap config = device_config;

-      // Create a clone of ie_cnn_network, since it's a const ov::Model, and we need to patch it..
-      // Note! With this default path, the model runs but produces garbage (for NPUW). For CPU it's fine.
-      auto mutable_model = ie_cnn_network->clone();
+  if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) {
+    std::cout << "Stateless OV Model Statistic:" << std::endl;
+    LogBasicModelInfo(model);
+  }

-      if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) {
-        std::cout << "Stateless OV Model Statistic" << std::endl;
-        LogBasicModelInfo(mutable_model);
-      }
-      LogBasicModelInfo(mutable_model);
+  LOGS_DEFAULT(INFO) << log_tag << "Converting from Stateless OV Model to Stateful OV Model" << std::endl;
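+  // Patch the decoder only if the model is not already stateful.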
+  bool status = IsStateful(model);
+  std::cout << "IsStateful Status:\t" << status << std::endl;
+  if (!status) {
+    PatchStatefulDecoder(model);
+  }

-      LOGS_DEFAULT(INFO) << log_tag << "Converting from Stateless OV Model to Stateful OV Model" << std::endl;
-      PatchStatefulDecoder(mutable_model);
+  if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) {
+    std::cout << "Stateful OV Model Statistic:" << std::endl;
+    LogBasicModelInfo(model);
+  }

-      if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) {
-        std::cout << "Stateful OV Model Statistic" << std::endl;
-        LogBasicModelInfo(mutable_model);
-      }
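+  // Locate the batch and sequence-length axes (presumably of the KV-cache tensors); UpdateNPUConfig() consumes these below.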
+  auto kv_pos = GetKVAxesPos(model);
+  if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) {
+    std::cout << "kv_pos.batch = " << kv_pos.batch << std::endl;
+    std::cout << "kv_pos.seq_len = " << kv_pos.seq_len << std::endl;
+  }

-      // This patches the model so that it only produces the logits required for sampling.
-      // Actually either way that happens within NPUW::LLMCompiledModel creation, but this is
-      // here mostly to align this behavior for other devices (CPU, GPU).
-      ApplySliceBeforeMatmulTransformation(mutable_model);
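+  // For NPU, bound the prompt and response lengths, defaulting to 1024 and 128 tokens when the options are absent.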
+  if (hw_target.find("NPU") != std::string::npos) {
+    KVDesc kv_desc;
+    kv_desc.max_prompt_len = PopIntAndCast(config, "MAX_PROMPT_LEN").value_or(1024u);
+    kv_desc.min_response_len = PopIntAndCast(config, "MIN_RESPONSE_LEN").value_or(128u);

-      auto kv_pos = GetKVAxesPos(mutable_model);
-      if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) {
-        std::cout << "kv_pos.batch = " << kv_pos.batch << std::endl;
-        std::cout << "kv_pos.seq_len = " << kv_pos.seq_len << std::endl;
-      }
+    if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) {
+      std::cout << "kv_desc.max_prompt_len:\t" << kv_desc.max_prompt_len << std::endl;
+      std::cout << "kv_desc.min_response_len:\t" << kv_desc.min_response_len << std::endl;
+    }

-      if (hw_target.find("NPU") != std::string::npos) {
-        KVDesc kv_desc;
-        kv_desc.max_prompt_len = PopIntAndCast(device_config, "MAX_PROMPT_LEN").value_or(1024u);
-        kv_desc.min_response_len = PopIntAndCast(device_config, "MIN_RESPONSE_LEN").value_or(128u);
+    UpdateNPUConfig(config, kv_pos, kv_desc);
+  } else {
+    // This patches the model so that it only produces the logits required for sampling.
+    // Actually either way that happens within NPUW::LLMCompiledModel creation, but this is
+    // here mostly to align this behavior for other devices (CPU, GPU).
+    ApplySliceBeforeMatmulTransformation(model);
+  }

-      if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) {
-        std::cout << "kv_desc.max_prompt_len = " << kv_desc.max_prompt_len << std::endl;
-        std::cout << "kv_desc.min_response_len = " << kv_desc.min_response_len << std::endl;
-      }
+  std::cout << "Compiling Stateful OV Model ..." << std::endl;
+  compiled_model = OVCore::Get()->core.compile_model(model, hw_target, config);
+  std::cout << "Stateful OV Model Compilation Complete" << std::endl;

-        UpdateNPUConfig(config, kv_pos, kv_desc);
-      }
+  OVExeNetwork exe(compiled_model);
+  return exe;
+}

-      std::cout << "Compiling Stateful OV Model..." << std::endl;
-      obj = core.compile_model(mutable_model, hw_target, config);
-      std::cout << "Stateful OV Model Compilation Complete" << std::endl;
+OVExeNetwork OVCore::CompileModel(std::shared_ptr<const OVNetwork>& ie_cnn_network,
+                                  std::string& hw_target,
+                                  ov::AnyMap& device_config,
+                                  bool enable_causallm,
+                                  const std::string& name) {
+  ov::CompiledModel obj;
+  try {
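+    // Causal-LM models take the stateful path: clone the (const) input model so it can be patched in place.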
+    if (enable_causallm) {
+      auto mutable_model = ie_cnn_network->clone();
+      auto compiled_model = OVCore::Get()->StatefulCompileModel(mutable_model, hw_target, device_config);
+      obj = compiled_model.Get();
     } else {
       obj = core.compile_model(ie_cnn_network, hw_target, device_config);
     }
@@ -166,10 +177,68 @@ OVExeNetwork OVCore::CompileModel(const std::string& onnx_model,
 OVExeNetwork OVCore::ImportModel(std::istream& model_stream,
                                  std::string hw_target,
                                  const ov::AnyMap& device_config,
+                                 bool enable_causallm,
                                  std::string name) {
   try {
     ov::CompiledModel obj;
-    obj = core.import_model(model_stream, hw_target, device_config);
+
+    // Check if it's XML
+    std::streampos originalPos = model_stream.tellg();
+    // Allocate space for "<?xml"
+    std::string header(5, '\0');
+    model_stream.read(&header[0], 5);
+
+    // Clear any read errors
+    model_stream.clear();
+    // Restore the stream position (important for reusing the stream)
+    model_stream.seekg(originalPos);
+
+    if (header != "<?xml") {
+      obj = core.import_model(model_stream, hw_target, device_config);
+    } else {
+      // Get path to bin file
+      std::string bin_file;
+      if (name.size() >= 5 && name.substr(name.size() - 5) == ".onnx") {
+        bin_file = name;
+        bin_file.replace(name.size() - 5, 5, ".bin");
+      } else {
+        throw std::runtime_error("Invalid model name. Make sure *.onnx, *.xml, and *.bin carry the same name.");
+      }
+
+      // Read the model XML into a string
+      std::stringstream xml_stream;
+      xml_stream << model_stream.rdbuf();
+      std::string xml_content = xml_stream.str();
+
+      // Read model.bin into a vector
+      std::ifstream bin_stream;
+      bin_stream.open(bin_file, std::ios::binary);
+      if (!bin_stream.is_open()) {
+        throw std::runtime_error("Failed to open " + bin_file);
+      }
+
+      bin_stream.seekg(0, std::ios::end);
+      std::streamsize size = bin_stream.tellg();
+      bin_stream.seekg(0, std::ios::beg);
+      std::vector<uint8_t> bin_data(size);
+      if (!bin_stream.read(reinterpret_cast<char*>(bin_data.data()), size)) {
+        throw std::runtime_error("Failed to read binary data from " + bin_file);
+      }
+
+      // Create an ov::Tensor for weights
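+      // The (element_type, shape, data) overload of ov::Tensor wraps bin_data without copying it,
+      // so bin_data must stay alive while the tensor is in use.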
+      ov::Tensor weights_tensor(ov::element::u8, {bin_data.size()}, bin_data.data());
+
+      // Load the model explicitly with XML content and weights
+      std::shared_ptr<ov::Model> model = core.read_model(xml_content, weights_tensor);
+
+      if (enable_causallm) {
+        auto compiled_model = OVCore::Get()->StatefulCompileModel(model, hw_target, device_config);
+        obj = compiled_model.Get();
+      } else {
+        obj = core.compile_model(model, hw_target, device_config);
+      }
+    }
+
 #ifndef NDEBUG
     printDebugInfo(obj);
 #endif