[NPU] Acknowledge OV model compression in weightless compilation (#34609)

razvanapetroaie · andrey-golubev · web-flow · commit 9e83aea18d4e · 2026-03-10T18:23:52.000Z
### Details: - OV model compression may produce models in which multiple constants point to the same binary data. If this is the case, rely on the compiler to provide the actual constant description (shape and precision), keeping the plugin code simple. Additionally, introduce several checks to ensure that the buffer sizes are the same. These checks are relatively cheap computation-wise, yet they help enforce model invariants. - Changes and description by @andrey-golubev, small addition by me. ### Tickets: - *EISW-204968* ### AI Assistance: - *AI assistance used: no* - *If yes, summarize how AI was used and what human validation was performed (build/tests/manual checks).* --------- Co-authored-by: Golubev, Andrey <andrey.golubev@intel.com>
diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/model_serializer.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/model_serializer.cpp
@@ -185,7 +185,9 @@ void storeWeightsPointerAttribute(const std::shared_ptr<ov::Model>& model) {
  * @param model Both source and target.
  */
 void storeWeightlessCacheAttribute(const std::shared_ptr<ov::Model>& model) {
+    std::unordered_map<size_t, size_t> wca_offset_to_size;
     size_t constantId = 0;
+
     for (auto&& node : model->get_ordered_ops()) {
         if (ov::is_type<ov::op::v0::Constant>(node)) {
             ov::RTMap& runtimeInfoMap = node->get_rt_info();
@@ -195,6 +197,16 @@ void storeWeightlessCacheAttribute(const std::shared_ptr<ov::Model>& model) {
             const std::string constantIdString = std::to_string(constantId++);
             if (weightlessCacheAttrIt != runtimeInfoMap.end()) {
                 auto& weightlessCacheAttr = weightlessCacheAttrIt->second.as<ov::WeightlessCacheAttribute>();
+
+                if (!wca_offset_to_size.count(weightlessCacheAttr.bin_offset)) {
+                    wca_offset_to_size[weightlessCacheAttr.bin_offset] = weightlessCacheAttr.original_size;
+                } else {
+                    OPENVINO_ASSERT(
+                        wca_offset_to_size.at(weightlessCacheAttr.bin_offset) == weightlessCacheAttr.original_size,
+                        "The WeightlessCacheAttribute of at least two Constant nodes use the same offset, but "
+                        "different sizes");
+                }
+
                 model->set_rt_info(weightlessCacheAttr.bin_offset, "ws_bin_offset_" + constantIdString);
                 model->set_rt_info(weightlessCacheAttr.original_size, "ws_original_size_" + constantIdString);
                 model->set_rt_info(weightlessCacheAttr.original_dtype, "ws_original_dtype_" + constantIdString);
diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/weightless_graph.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/weightless_graph.cpp
@@ -41,7 +41,17 @@ std::unordered_map<size_t, std::shared_ptr<ov::op::v0::Constant>> get_all_consta
         const auto& weightlessCacheAttrIt = runtimeInfoMap.find(ov::WeightlessCacheAttribute::get_type_info_static());
         if (weightlessCacheAttrIt != runtimeInfoMap.end()) {
             auto& weightlessCacheAttr = weightlessCacheAttrIt->second.as<ov::WeightlessCacheAttribute>();
-            constants[weightlessCacheAttr.bin_offset] = constantNode;
+
+            auto& constant = constants[weightlessCacheAttr.bin_offset];
+            if (constant != nullptr) {
+                // if multiple constants point to the same buffer, ensure that
+                // their binary sizes are the same
+                OPENVINO_ASSERT(constant->get_byte_size() == constantNode->get_byte_size(),
+                                "Found ov::Constant that points to the common buffer but has mismatching byte size. "
+                                "This may indicate a bug in OV model compression.");
+                continue;
+            }
+            constant = constantNode;
         }
     }
 
@@ -381,23 +391,33 @@ WeightlessGraph::InputData WeightlessGraph::allocate_inputs(
         auto currentInputBufferLocation =
             static_cast<unsigned char*>(const_cast<void*>(initInputsAllocatedTensor->data(ov::element::Type_t::u8))) +
             offset;
+        const auto tensorShapeFromCompiler = descriptor.shapeFromCompiler.to_shape();
         const size_t currentInputSize =
-            ov::util::get_memory_size(descriptor.precision, shape_size(descriptor.shapeFromCompiler.to_shape()));
+            ov::util::get_memory_size(descriptor.precision, shape_size(tensorShapeFromCompiler));
 
-        std::shared_ptr<ov::op::v0::Constant> constant;
         const size_t id = std::stoi(descriptor.nameFromCompiler);
-        OPENVINO_ASSERT(constants.count(id) > 0,
+        auto constantIt = constants.find(id);
+        OPENVINO_ASSERT(constantIt != constants.end(),
                         "Weights ID ",
                         id,
                         " not found in the model constants. This may indicate a mismatch between the model and the "
                         "metadata of the compiled model.");
 
-        constant = constants.at(id);
-
+        const auto constant = constantIt->second;
+        OPENVINO_ASSERT(constant->get_byte_size() == currentInputSize,
+                        "Binary size mismatch found for weights ID ",
+                        id,
+                        " between the model and compiled metadata.");
         std::memcpy(currentInputBufferLocation, constant->get_data_ptr(), currentInputSize);
 
+        // Note: Use compiler-provided precision and shape, because duplicates -
+        // constants that point to the same binary data - can in theory have
+        // different shape or even type (OV model compression only guarantees
+        // that the data is the same). In order to avoid any potential issues
+        // due to shape/type mismatches, init tensors should align with
+        // compiler's expectations.
         initInputsViewTensors.push_back(
-            ov::make_tensor(constant->get_element_type(), constant->get_shape(), currentInputBufferLocation));
+            ov::make_tensor(descriptor.precision, tensorShapeFromCompiler, currentInputBufferLocation));
         offset += currentInputSize;
 
         // Note: By construction of the weight schedule, every constant from OV