Avoid inserting Convert operations for irregular ov::Result case

nikita-kud · nikita-kud · commit cc1ac68d7271 · 2026-01-08T12:00:21.000Z
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/compiler.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/compiler.cpp
@@ -29,6 +29,7 @@ void dump_partitioning(const ov::npuw::Ensemble& ens, const std::string& to) {
 
     pugi::xml_node node = doc.append_child("ensemble");
     node.append_attribute("gflops") = std::to_string(ens.gflops).data();
+    node.append_attribute("irregular_results") = std::to_string(ens.irregular_results).data();
 
     pugi::xml_node part = node.append_child("partitioning");
     pugi::xml_node rep;
@@ -83,6 +84,7 @@ void dump_partitioning(const ov::npuw::Ensemble& ens, const std::string& to) {
 
     doc.save_file(to.data());
 }
+
 }  // namespace detail
 
 // Interface to get online partitioning from the model
@@ -308,6 +310,7 @@ class Compiler {
 
         ov::npuw::Ensemble ens;
         ens.gflops = 1.;  // FIXME: calculate proper flops
+        ens.irregular_results = !m_snapshot->isRegularResultCase();
 
         auto graph = m_snapshot->getGraph();
         // Iterate in topological order
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.cpp
@@ -135,6 +135,10 @@ std::shared_ptr<ov::Node> Group::getInitialNode() const {
     return *(m_content.begin());
 }
 
+const std::unordered_set<std::shared_ptr<ov::Node>>& Group::getOutputs() const {
+    return m_output_layers;
+}
+
 void Group::addInput(const std::shared_ptr<ov::Node>& node) {
     m_input_layers.insert(node);
 }
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.hpp
@@ -49,6 +49,7 @@ class Group : public std::enable_shared_from_this<Group> {
     own::ade::NodeHandle getHandle() const;
     // Note: can only be used during initial group initialization
     std::shared_ptr<ov::Node> getInitialNode() const;
+    const std::unordered_set<std::shared_ptr<ov::Node>>& getOutputs() const;
     void addInput(const std::shared_ptr<ov::Node>& node);
     void addOutput(const std::shared_ptr<ov::Node>& node);
     void addContent(const std::shared_ptr<ov::Node>& node);
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.cpp
@@ -1270,6 +1270,101 @@ void Snapshot::stripTag(const std::string& tag) {
     }
 }
 
+bool Snapshot::isRegularResultCase() const {
+    LOG_INFO("Online partitioning: executing isRegularResultCase pass...");
+    LOG_BLOCK();
+
+    // This method works around an issue where the final partitioning fails the sanity check
+    // because the number of output Converts differs across the groups of a repeated block.
+    // The issue was initially observed in a model where only the final block has an additional ov::Result consumer.
+    // For example, Group[0..30] has only external consumers (i.e. consumers that belong to other groups):
+    //   OpA -> OpB(external group)
+    //       -> OpC(external group)
+    // but the very last group, Group[31], has an additional ov::Result consumer:
+    //   OpA -> ov::Result
+    //       -> OpB(external group)
+    //       -> OpC(external group)
+    // Later, if NPUW_F16IC is set, the "Partitioner::identifySubgraphs" method adds output Converts to each of
+    // Group[0..30], but skips Group[31] due to internal implementation details.
+    // "Partitioner::identifySubgraphs" can't:
+    //   - add a Convert to Group[31], because that would require adding an opposite Convert for the ov::Result;
+    //   - skip adding Converts to Group[0..30], because that would break the symmetry of the repeated blocks, i.e.
+    //        in the graph `Convert(group0) -> output -> input -> Convert(group1)` the input `Convert(group1)` would also have to be eliminated.
+    // Therefore, F16IC is disabled early in such cases.
+
+    using NodeSPtr = std::shared_ptr<ov::Node>;
+    std::unordered_map<std::string, NodeSPtr> node_id_cache;
+    for (auto&& node_ptr : m_model->get_ordered_ops()) {
+        node_id_cache[node_ptr->get_friendly_name()] = node_ptr;
+    }
+
+    auto getReadersMask = [](const NodeSPtr& node_ptr) {
+        // Each element of the vector is the number of ov::Result readers
+        // of the corresponding output of the node.
+        std::vector<int> mask;
+        for (auto&& output_desc : node_ptr->outputs()) {
+            auto readers = output_desc.get_target_inputs();
+            int result_count = 0;
+            for (auto&& r : readers) {
+                auto reader_node_ptr = r.get_node()->shared_from_this();
+                if (ov::op::util::is_output(reader_node_ptr)) {
+                    result_count++;
+                }
+            }
+            mask.push_back(result_count);
+        }
+        return mask;
+    };
+
+    auto reptag_to_gset = repeating();
+    if(!reptag_to_gset.empty()) {
+        NPUW_ASSERT(!m_layer_matches.empty());
+    }
+
+    for (const auto& reptag_and_gset : reptag_to_gset) {
+        auto reptag = reptag_and_gset.first;
+        auto gset = reptag_and_gset.second;
+
+        auto matches = m_layer_matches.at(reptag->id());
+
+        if(gset.size() <= 1) {
+            continue;
+        }
+
+        auto firstGroup = *(gset.begin());
+        for(auto output_layer: firstGroup->getOutputs()) {
+            // this is the reference mask expected from all other matched layers
+            // in the remaining groups of the repeated block
+            auto expected_readers_mask = getReadersMask(output_layer);
+
+            auto this_layer_name = output_layer->get_friendly_name();
+            auto layer_bank_iter =
+                std::find_if(matches.begin(), matches.end(), [&](const std::set<std::string>& lrs) {
+                    return lrs.count(this_layer_name) > 0;
+            });
+
+            NPUW_ASSERT(layer_bank_iter != matches.end());
+
+            // match output layers across all groups in the repeated block
+            // and compare their readers mask
+            for(const auto& layer_name: *layer_bank_iter) {
+                auto layer_ptr = node_id_cache.at(layer_name);
+                auto actual_readers_mask = getReadersMask(layer_ptr);
+
+                if (actual_readers_mask != expected_readers_mask) {
+                    LOG_INFO("This is NOT a regular result case. Readers mask mismatch found for " 
+                        << layer_name << " and " << this_layer_name << " output layers.");
+                    return false;
+                }
+            }
+        }
+    }
+
+    LOG_INFO("This is a regular result case");
+    LOG_INFO("DONE");
+    return true;
+}
+
 size_t Snapshot::getNextRepId() {
     return m_current_rep_count++;
 }
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.hpp
@@ -56,6 +56,9 @@ class Snapshot : public std::enable_shared_from_this<Snapshot> {
 
     void stripTag(const std::string& tag);
 
+    // Passes to detect corner cases (e.g. ov::Result readers that differ across the groups of a repeated block)
+    bool isRegularResultCase() const;
+
     // Utility
     std::shared_ptr<own::ade::Graph> getGraph() const;
     const detail::OVPortsMap& getPortsMap() const;
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
@@ -182,7 +182,7 @@ ov::npuw::Ensemble load_groups(const std::shared_ptr<ov::Model>& model, const st
 
     LOG_INFO("Found " << repeated.size() << " different repeated block(s)");
 
-    return ov::npuw::Ensemble{get_float_attr(root, "gflops"), std::move(partitions), std::move(repeated)};
+    return ov::npuw::Ensemble{get_float_attr(root, "gflops"), get_bool_attr(root, "irregular_results", false), std::move(partitions), std::move(repeated)};
 }
 
 class Partitioner {
@@ -376,7 +376,7 @@ void Partitioner::identifySubgraphs() {
     LOG_INFO("Identifying subgraphs for model " << model->get_friendly_name() << "...");
     LOG_BLOCK();
 
-    const bool connect_in_f16 = cfg.get<::intel_npu::NPUW_F16IC>();
+    const bool connect_in_f16 = cfg.get<::intel_npu::NPUW_F16IC>() && !ens.irregular_results;
 
     using namespace ov::npuw;
     std::vector<ov::npuw::Group>& partitions = ens.groups;
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp
@@ -158,6 +158,7 @@ struct RepeatedBlock {
 
 struct Ensemble {
     float gflops;
+    bool irregular_results;
     std::vector<Group> groups;
 
     // Just a map as I don't expect 100s of _different_
diff --git a/src/plugins/intel_npu/tests/unit/npuw/model_generator/model_generator.cpp b/src/plugins/intel_npu/tests/unit/npuw/model_generator/model_generator.cpp
@@ -39,6 +39,16 @@ std::shared_ptr<ov::Model> ModelGenerator::get_model_without_repeated_blocks() {
 }
 
 std::shared_ptr<ov::Model> ModelGenerator::get_model_with_repeated_blocks(std::size_t repetitions) {
+    return get_model_with_repeated_blocks_and_results(repetitions, {});
+}
+
+std::shared_ptr<ov::Model> ModelGenerator::get_model_with_repeated_blocks() {
+    return get_model_with_repeated_blocks(10);
+}
+
+std::shared_ptr<ov::Model> ModelGenerator::get_model_with_repeated_blocks_and_results(
+    std::size_t repetitions,
+    const std::vector<std::size_t>& block_indices) {
     // Generate head
     std::shared_ptr<ov::op::v0::Parameter> input = std::make_shared<ov::op::v0::Parameter>(ov::element::i32, ov::Shape{1, 1, 40});
     m_nodes.push_back(input);
@@ -60,17 +70,17 @@ std::shared_ptr<ov::Model> ModelGenerator::get_model_with_repeated_blocks(std::s
 
     // Generate repeated blocks
     std::shared_ptr<ov::Node> output = get_block(head[6]);
-    std::vector<std::shared_ptr<ov::Node>> outputs;
-    outputs.push_back(output);
+    std::vector<std::shared_ptr<ov::Node>> block_outputs;
+    block_outputs.push_back(output);
 
     for (size_t i = 0; i < repetitions - 1; ++i) {
         output = get_block(output);
-        outputs.push_back(output);
+        block_outputs.push_back(output);
     }
 
     // Generate tail
     std::vector<std::shared_ptr<ov::Node>> tail(6, nullptr);
-    tail[0] = std::make_shared<ov::op::v0::Concat>(outputs, -1);
+    tail[0] = std::make_shared<ov::op::v0::Concat>(block_outputs, -1);
     tail[1] = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{3}, std::vector<int>{1, 40, int(repetitions)});
     tail[2] = std::make_shared<ov::op::v1::Reshape>(tail[0], tail[1], false);
     tail[3] = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{1, 1, 1});
@@ -82,19 +92,119 @@ std::shared_ptr<ov::Model> ModelGenerator::get_model_with_repeated_blocks(std::s
         set_name(t);
     }
 
+    // Create Results
+    ov::ResultVector results;
+    
+    // Add Results for specified blocks
+    for (size_t idx : block_indices) {
+        if (idx < block_outputs.size()) {
+            auto result = std::make_shared<ov::op::v0::Result>(block_outputs[idx]);
+            m_nodes.push_back(result);
+            set_name(result);
+            results.push_back(result);
+        }
+    }
+
     // Create model
-    auto result = std::make_shared<ov::op::v0::Result>(tail[5]);
-    m_nodes.push_back(result);
-    set_name(result);
+    // Always add final tail Result
+    auto final_result = std::make_shared<ov::op::v0::Result>(tail[5]);
+    m_nodes.push_back(final_result);
+    set_name(final_result);
+    results.push_back(final_result);
 
     ov::ParameterVector params = {input};
-    ov::ResultVector results = {result};
 
     return std::make_shared<ov::Model>(results, params);
 }
 
-std::shared_ptr<ov::Model> ModelGenerator::get_model_with_repeated_blocks() {
-    return get_model_with_repeated_blocks(10);
+std::shared_ptr<ov::Model> ModelGenerator::get_model_with_multi_output_repeating_blocks(
+    std::size_t repetitions,
+    bool last_block_has_direct_result) {
+    if (repetitions == 0) {
+        repetitions = 1;  // keep the model non-empty
+    }
+
+    auto input = std::make_shared<ov::opset11::Parameter>(ov::element::f32, ov::Shape{1, 1, 8});
+    m_nodes.push_back(input);
+    set_name(input);
+
+    // Shared constants
+    auto add_const = ov::opset11::Constant::create(ov::element::f32, ov::Shape{1}, {1.f});
+    auto k_const = ov::opset11::Constant::create(ov::element::i64, ov::Shape{}, {8});
+    auto seed_indices = ov::opset11::Constant::create(ov::element::i32,
+                                                      ov::Shape{1, 1, 8},
+                                                      {0, 1, 2, 3, 4, 5, 6, 7});
+    auto tail_scale = ov::opset11::Constant::create(ov::element::f32, ov::Shape{1}, {0.5f});
+    auto tail_bias = ov::opset11::Constant::create(ov::element::f32, ov::Shape{1}, {2.f});
+
+    for (const auto& c : {add_const, k_const, seed_indices, tail_scale, tail_bias}) {
+        m_nodes.push_back(c);
+        set_name(c);
+    }
+
+    ov::Output<ov::Node> current_values = input;
+    ov::Output<ov::Node> current_indices = seed_indices;
+
+    for (std::size_t i = 0; i < repetitions; ++i) {
+        // Build block body; TopK remains the final op of each block to expose multiple outputs
+        auto indices_as_float = std::make_shared<ov::opset11::Convert>(current_indices, ov::element::f32);
+        m_nodes.push_back(indices_as_float);
+        set_name(indices_as_float);
+
+        auto mixed = std::make_shared<ov::opset11::Add>(current_values, indices_as_float);
+        m_nodes.push_back(mixed);
+        set_name(mixed);
+
+        auto shifted = std::make_shared<ov::opset11::Add>(mixed, add_const);
+        m_nodes.push_back(shifted);
+        set_name(shifted);
+
+        auto topk = std::make_shared<ov::opset11::TopK>(shifted,
+                                                         k_const,
+                                                         -1,
+                                                         ov::op::TopKMode::MAX,
+                                                         ov::op::TopKSortType::SORT_VALUES,
+                                                         ov::element::i32);
+        m_nodes.push_back(topk);
+        set_name(topk);
+
+        current_values = topk->output(0);
+        current_indices = topk->output(1);
+    }
+
+    // Tail consumes the final block outputs
+    auto tail_indices_as_float = std::make_shared<ov::opset11::Convert>(current_indices, ov::element::f32);
+    m_nodes.push_back(tail_indices_as_float);
+    set_name(tail_indices_as_float);
+
+    auto tail_mixed = std::make_shared<ov::opset11::Add>(current_values, tail_indices_as_float);
+    m_nodes.push_back(tail_mixed);
+    set_name(tail_mixed);
+
+    auto tail_mul = std::make_shared<ov::opset11::Multiply>(tail_mixed, tail_scale);
+    m_nodes.push_back(tail_mul);
+    set_name(tail_mul);
+
+    auto tail_add = std::make_shared<ov::opset11::Add>(tail_mul, tail_bias);
+    m_nodes.push_back(tail_add);
+    set_name(tail_add);
+
+    ov::ResultVector results;
+    auto tail_result = std::make_shared<ov::opset11::Result>(tail_add);
+    m_nodes.push_back(tail_result);
+    set_name(tail_result);
+    results.push_back(tail_result);
+
+    if (last_block_has_direct_result) {
+        auto direct_result = std::make_shared<ov::opset11::Result>(current_values);
+        m_nodes.push_back(direct_result);
+        set_name(direct_result);
+        results.push_back(direct_result);
+    }
+
+    ov::ParameterVector params = {input};
+
+    return std::make_shared<ov::Model>(results, params);
 }
 
 std::shared_ptr<ov::Node> ModelGenerator::get_block(const std::shared_ptr<ov::Node>& input) {
diff --git a/src/plugins/intel_npu/tests/unit/npuw/model_generator/model_generator.hpp b/src/plugins/intel_npu/tests/unit/npuw/model_generator/model_generator.hpp
@@ -14,6 +14,23 @@ class ModelGenerator {
     std::shared_ptr<ov::Model> get_model_without_repeated_blocks();
     std::shared_ptr<ov::Model> get_model_with_repeated_blocks(std::size_t repetitions);
     std::shared_ptr<ov::Model> get_model_with_repeated_blocks();
+    
+    // Build model with repeating blocks and configurable ov::Result consumers:
+    //   - repetitions: number of repeating blocks
+    //   - block_indices: 0-based indices of the blocks whose output gets an extra ov::Result consumer;
+    //                out-of-range indices are ignored; an empty vector means only the final tail Result
+    std::shared_ptr<ov::Model> get_model_with_repeated_blocks_and_results(
+        std::size_t repetitions,
+        const std::vector<std::size_t>& block_indices);
+
+    // Build model with repeating blocks where the final op in each block has multiple outputs (TopK values + indices).
+    //   - repetitions: number of repeating blocks (0 is treated as 1 to keep the model non-empty)
+    //   - last_block_has_direct_result:
+    //       Option1 (false): for all blocks, multi-output node feeds only the next block; last block feeds only the tail
+    //       Option2 (true): same as above, plus the last block's values output (TopK output 0) also feeds a direct ov::Result
+    std::shared_ptr<ov::Model> get_model_with_multi_output_repeating_blocks(
+        std::size_t repetitions,
+        bool last_block_has_direct_result);
 
 private:
     std::shared_ptr<ov::Node> get_block(const std::shared_ptr<ov::Node>& input);
diff --git a/src/plugins/intel_npu/tests/unit/npuw/online_partitioning.cpp b/src/plugins/intel_npu/tests/unit/npuw/online_partitioning.cpp

Original file line number	Diff line number	Diff line change
`@@ -135,6 +135,10 @@ std::shared_ptr<ov::Node> Group::getInitialNode() const {`
`135`	`135`	`return *(m_content.begin());`
`136`	`136`	`}`
`137`	`137`
	`138`	`+const std::unordered_set<std::shared_ptr<ov::Node>>& Group::getOutputs() const {`
	`139`	`+ return m_output_layers;`
	`140`	`+}`
	`141`	`+`
`138`	`142`	`void Group::addInput(const std::shared_ptr<ov::Node>& node) {`
`139`	`143`	`m_input_layers.insert(node);`
`140`	`144`	`}`