Skip to content

Commit cc1ac68

Browse files
committed
Avoid inserting Convert operations for irregular ov::Result case
1 parent 95300df commit cc1ac68

File tree

10 files changed

+321
-24
lines changed

10 files changed

+321
-24
lines changed

src/plugins/intel_npu/src/plugin/npuw/partitioning/online/compiler.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ void dump_partitioning(const ov::npuw::Ensemble& ens, const std::string& to) {
2929

3030
pugi::xml_node node = doc.append_child("ensemble");
3131
node.append_attribute("gflops") = std::to_string(ens.gflops).data();
32+
node.append_attribute("irregular_results") = std::to_string(ens.irregular_results).data();
3233

3334
pugi::xml_node part = node.append_child("partitioning");
3435
pugi::xml_node rep;
@@ -83,6 +84,7 @@ void dump_partitioning(const ov::npuw::Ensemble& ens, const std::string& to) {
8384

8485
doc.save_file(to.data());
8586
}
87+
8688
} // namespace detail
8789

8890
// Interface to get online partitioning from the model
@@ -308,6 +310,7 @@ class Compiler {
308310

309311
ov::npuw::Ensemble ens;
310312
ens.gflops = 1.; // FIXME: calculate proper flops
313+
ens.irregular_results = !m_snapshot->isRegularResultCase();
311314

312315
auto graph = m_snapshot->getGraph();
313316
// Iterate in topological order

src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,10 @@ std::shared_ptr<ov::Node> Group::getInitialNode() const {
135135
return *(m_content.begin());
136136
}
137137

138+
// Read-only accessor for m_output_layers.
// The container is returned by const reference, so no copy is made.
const std::unordered_set<std::shared_ptr<ov::Node>>& Group::getOutputs() const {
    return m_output_layers;
}
141+
138142
// Adds `node` to m_input_layers.
void Group::addInput(const std::shared_ptr<ov::Node>& node) {
    m_input_layers.insert(node);
}

src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ class Group : public std::enable_shared_from_this<Group> {
4949
own::ade::NodeHandle getHandle() const;
5050
// Note: can only be used during initial group initialization
5151
std::shared_ptr<ov::Node> getInitialNode() const;
52+
const std::unordered_set<std::shared_ptr<ov::Node>>& getOutputs() const;
5253
void addInput(const std::shared_ptr<ov::Node>& node);
5354
void addOutput(const std::shared_ptr<ov::Node>& node);
5455
void addContent(const std::shared_ptr<ov::Node>& node);

src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.cpp

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1270,6 +1270,101 @@ void Snapshot::stripTag(const std::string& tag) {
12701270
}
12711271
}
12721272

1273+
bool Snapshot::isRegularResultCase() const {
1274+
LOG_INFO("Online partitioning: executing isRegularResultCase pass...");
1275+
LOG_BLOCK();
1276+
1277+
// This method works around an issue where the final partitioning fails the sanity check
1278+
// because of a different number of output Convert across repeated block groups.
1279+
// The issue was initially observed in a model where only the final block has an additional ov::Result consumer.
1280+
// For example, Group[0..30] has only external consumers (i.e. consumers that belong to other groups):
1281+
// OpA -> OpB(external group)
1282+
// -> OpC(external group)
1283+
// but very last Group[31] has an additional ov::Result consumer:
1284+
// OpA -> ov::Result
1285+
// -> OpB(external group)
1286+
// -> OpC(external group)
1287+
// Later, if NPUW_F16IC is set, "Partitioner::identifySubgraphs" method adds output Converts to each Group[0..30],
1288+
// but skips Group[31] due to internal implementation details.
1289+
// "Partitioner::identifySubgraphs" can't:
1290+
// - add Convert to the Group[31] because it would require adding opposite Convert for the ov::Result
1291+
// - skip adding Converts to Group[0..30] because it would break symmetry of the repeated blocks, i.e.
1292+
// in the given graph `Convert(group0) -> output -> input -> Convert(group1)` input `Convert(group1)` should be also eliminated
1293+
// Therefore, we disable F16IC early in such cases.
1294+
1295+
using NodeSPtr = std::shared_ptr<ov::Node>;
1296+
std::unordered_map<std::string, NodeSPtr> node_id_cache;
1297+
for (auto&& node_ptr : m_model->get_ordered_ops()) {
1298+
node_id_cache[node_ptr->get_friendly_name()] = node_ptr;
1299+
}
1300+
1301+
auto getReadersMask = [](const NodeSPtr& node_ptr) {
1302+
// each element of the vector is
1303+
// the number of ov::Result readers for the corresponding output
1304+
std::vector<int> mask;
1305+
for (auto&& output_desc : node_ptr->outputs()) {
1306+
auto readers = output_desc.get_target_inputs();
1307+
int result_count = 0;
1308+
for (auto&& r : readers) {
1309+
auto reader_node_ptr = r.get_node()->shared_from_this();
1310+
if (ov::op::util::is_output(reader_node_ptr)) {
1311+
result_count++;
1312+
}
1313+
}
1314+
mask.push_back(result_count);
1315+
}
1316+
return mask;
1317+
};
1318+
1319+
auto reptag_to_gset = repeating();
1320+
if(!reptag_to_gset.empty()) {
1321+
NPUW_ASSERT(!m_layer_matches.empty());
1322+
}
1323+
1324+
for (const auto& reptag_and_gset : reptag_to_gset) {
1325+
auto reptag = reptag_and_gset.first;
1326+
auto gset = reptag_and_gset.second;
1327+
1328+
auto matches = m_layer_matches.at(reptag->id());
1329+
1330+
if(gset.size() <= 1) {
1331+
continue;
1332+
}
1333+
1334+
auto firstGroup = *(gset.begin());
1335+
for(auto output_layer: firstGroup->getOutputs()) {
1336+
// this is the reference mask expected from all other matched layers
1337+
// in the remaining groups of the repeated block
1338+
auto expected_readers_mask = getReadersMask(output_layer);
1339+
1340+
auto this_layer_name = output_layer->get_friendly_name();
1341+
auto layer_bank_iter =
1342+
std::find_if(matches.begin(), matches.end(), [&](const std::set<std::string>& lrs) {
1343+
return lrs.count(this_layer_name) > 0;
1344+
});
1345+
1346+
NPUW_ASSERT(layer_bank_iter != matches.end());
1347+
1348+
// match output layers across all groups in the repeated block
1349+
// and compare their readers mask
1350+
for(const auto& layer_name: *layer_bank_iter) {
1351+
auto layer_ptr = node_id_cache.at(layer_name);
1352+
auto actual_readers_mask = getReadersMask(layer_ptr);
1353+
1354+
if (actual_readers_mask != expected_readers_mask) {
1355+
LOG_INFO("This is NOT a regular result case. Readers mask mismatch found for "
1356+
<< layer_name << " and " << this_layer_name << " output layers.");
1357+
return false;
1358+
}
1359+
}
1360+
}
1361+
}
1362+
1363+
LOG_INFO("This is a regular result case");
1364+
LOG_INFO("DONE");
1365+
return true;
1366+
}
1367+
12731368
size_t Snapshot::getNextRepId() {
12741369
return m_current_rep_count++;
12751370
}

src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,9 @@ class Snapshot : public std::enable_shared_from_this<Snapshot> {
5656

5757
void stripTag(const std::string& tag);
5858

59+
// Passes to detect corner cases
60+
bool isRegularResultCase() const;
61+
5962
// Utility
6063
std::shared_ptr<own::ade::Graph> getGraph() const;
6164
const detail::OVPortsMap& getPortsMap() const;

src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,7 @@ ov::npuw::Ensemble load_groups(const std::shared_ptr<ov::Model>& model, const st
182182

183183
LOG_INFO("Found " << repeated.size() << " different repeated block(s)");
184184

185-
return ov::npuw::Ensemble{get_float_attr(root, "gflops"), std::move(partitions), std::move(repeated)};
185+
return ov::npuw::Ensemble{get_float_attr(root, "gflops"), get_bool_attr(root, "irregular_results", false), std::move(partitions), std::move(repeated)};
186186
}
187187

188188
class Partitioner {
@@ -376,7 +376,7 @@ void Partitioner::identifySubgraphs() {
376376
LOG_INFO("Identifying subgraphs for model " << model->get_friendly_name() << "...");
377377
LOG_BLOCK();
378378

379-
const bool connect_in_f16 = cfg.get<::intel_npu::NPUW_F16IC>();
379+
const bool connect_in_f16 = cfg.get<::intel_npu::NPUW_F16IC>() && !ens.irregular_results;
380380

381381
using namespace ov::npuw;
382382
std::vector<ov::npuw::Group>& partitions = ens.groups;

src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,7 @@ struct RepeatedBlock {
158158

159159
struct Ensemble {
160160
float gflops;
161+
bool irregular_results;
161162
std::vector<Group> groups;
162163

163164
// Just a map as I don't expect 100s of _different_

src/plugins/intel_npu/tests/unit/npuw/model_generator/model_generator.cpp

Lines changed: 120 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,16 @@ std::shared_ptr<ov::Model> ModelGenerator::get_model_without_repeated_blocks() {
3939
}
4040

4141
std::shared_ptr<ov::Model> ModelGenerator::get_model_with_repeated_blocks(std::size_t repetitions) {
42+
return get_model_with_repeated_blocks_and_results(repetitions, {});
43+
}
44+
45+
std::shared_ptr<ov::Model> ModelGenerator::get_model_with_repeated_blocks() {
46+
return get_model_with_repeated_blocks(10);
47+
}
48+
49+
std::shared_ptr<ov::Model> ModelGenerator::get_model_with_repeated_blocks_and_results(
50+
std::size_t repetitions,
51+
const std::vector<std::size_t>& block_indices) {
4252
// Generate head
4353
std::shared_ptr<ov::op::v0::Parameter> input = std::make_shared<ov::op::v0::Parameter>(ov::element::i32, ov::Shape{1, 1, 40});
4454
m_nodes.push_back(input);
@@ -60,17 +70,17 @@ std::shared_ptr<ov::Model> ModelGenerator::get_model_with_repeated_blocks(std::s
6070

6171
// Generate repeated blocks
6272
std::shared_ptr<ov::Node> output = get_block(head[6]);
63-
std::vector<std::shared_ptr<ov::Node>> outputs;
64-
outputs.push_back(output);
73+
std::vector<std::shared_ptr<ov::Node>> block_outputs;
74+
block_outputs.push_back(output);
6575

6676
for (size_t i = 0; i < repetitions - 1; ++i) {
6777
output = get_block(output);
68-
outputs.push_back(output);
78+
block_outputs.push_back(output);
6979
}
7080

7181
// Generate tail
7282
std::vector<std::shared_ptr<ov::Node>> tail(6, nullptr);
73-
tail[0] = std::make_shared<ov::op::v0::Concat>(outputs, -1);
83+
tail[0] = std::make_shared<ov::op::v0::Concat>(block_outputs, -1);
7484
tail[1] = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{3}, std::vector<int>{1, 40, int(repetitions)});
7585
tail[2] = std::make_shared<ov::op::v1::Reshape>(tail[0], tail[1], false);
7686
tail[3] = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{1, 1, 1});
@@ -82,19 +92,119 @@ std::shared_ptr<ov::Model> ModelGenerator::get_model_with_repeated_blocks(std::s
8292
set_name(t);
8393
}
8494

95+
// Create Results
96+
ov::ResultVector results;
97+
98+
// Add Results for specified blocks
99+
for (size_t idx : block_indices) {
100+
if (idx < block_outputs.size()) {
101+
auto result = std::make_shared<ov::op::v0::Result>(block_outputs[idx]);
102+
m_nodes.push_back(result);
103+
set_name(result);
104+
results.push_back(result);
105+
}
106+
}
107+
85108
// Create model
86-
auto result = std::make_shared<ov::op::v0::Result>(tail[5]);
87-
m_nodes.push_back(result);
88-
set_name(result);
109+
// Always add final tail Result
110+
auto final_result = std::make_shared<ov::op::v0::Result>(tail[5]);
111+
m_nodes.push_back(final_result);
112+
set_name(final_result);
113+
results.push_back(final_result);
89114

90115
ov::ParameterVector params = {input};
91-
ov::ResultVector results = {result};
92116

93117
return std::make_shared<ov::Model>(results, params);
94118
}
95119

96-
std::shared_ptr<ov::Model> ModelGenerator::get_model_with_repeated_blocks() {
97-
return get_model_with_repeated_blocks(10);
120+
std::shared_ptr<ov::Model> ModelGenerator::get_model_with_multi_output_repeating_blocks(
121+
std::size_t repetitions,
122+
bool last_block_has_direct_result) {
123+
if (repetitions == 0) {
124+
repetitions = 1; // keep the model non-empty
125+
}
126+
127+
auto input = std::make_shared<ov::opset11::Parameter>(ov::element::f32, ov::Shape{1, 1, 8});
128+
m_nodes.push_back(input);
129+
set_name(input);
130+
131+
// Shared constants
132+
auto add_const = ov::opset11::Constant::create(ov::element::f32, ov::Shape{1}, {1.f});
133+
auto k_const = ov::opset11::Constant::create(ov::element::i64, ov::Shape{}, {8});
134+
auto seed_indices = ov::opset11::Constant::create(ov::element::i32,
135+
ov::Shape{1, 1, 8},
136+
{0, 1, 2, 3, 4, 5, 6, 7});
137+
auto tail_scale = ov::opset11::Constant::create(ov::element::f32, ov::Shape{1}, {0.5f});
138+
auto tail_bias = ov::opset11::Constant::create(ov::element::f32, ov::Shape{1}, {2.f});
139+
140+
for (const auto& c : {add_const, k_const, seed_indices, tail_scale, tail_bias}) {
141+
m_nodes.push_back(c);
142+
set_name(c);
143+
}
144+
145+
ov::Output<ov::Node> current_values = input;
146+
ov::Output<ov::Node> current_indices = seed_indices;
147+
148+
for (std::size_t i = 0; i < repetitions; ++i) {
149+
// Build block body; TopK remains the final op of each block to expose multiple outputs
150+
auto indices_as_float = std::make_shared<ov::opset11::Convert>(current_indices, ov::element::f32);
151+
m_nodes.push_back(indices_as_float);
152+
set_name(indices_as_float);
153+
154+
auto mixed = std::make_shared<ov::opset11::Add>(current_values, indices_as_float);
155+
m_nodes.push_back(mixed);
156+
set_name(mixed);
157+
158+
auto shifted = std::make_shared<ov::opset11::Add>(mixed, add_const);
159+
m_nodes.push_back(shifted);
160+
set_name(shifted);
161+
162+
auto topk = std::make_shared<ov::opset11::TopK>(shifted,
163+
k_const,
164+
-1,
165+
ov::op::TopKMode::MAX,
166+
ov::op::TopKSortType::SORT_VALUES,
167+
ov::element::i32);
168+
m_nodes.push_back(topk);
169+
set_name(topk);
170+
171+
current_values = topk->output(0);
172+
current_indices = topk->output(1);
173+
}
174+
175+
// Tail consumes the final block outputs
176+
auto tail_indices_as_float = std::make_shared<ov::opset11::Convert>(current_indices, ov::element::f32);
177+
m_nodes.push_back(tail_indices_as_float);
178+
set_name(tail_indices_as_float);
179+
180+
auto tail_mixed = std::make_shared<ov::opset11::Add>(current_values, tail_indices_as_float);
181+
m_nodes.push_back(tail_mixed);
182+
set_name(tail_mixed);
183+
184+
auto tail_mul = std::make_shared<ov::opset11::Multiply>(tail_mixed, tail_scale);
185+
m_nodes.push_back(tail_mul);
186+
set_name(tail_mul);
187+
188+
auto tail_add = std::make_shared<ov::opset11::Add>(tail_mul, tail_bias);
189+
m_nodes.push_back(tail_add);
190+
set_name(tail_add);
191+
192+
ov::ResultVector results;
193+
auto tail_result = std::make_shared<ov::opset11::Result>(tail_add);
194+
m_nodes.push_back(tail_result);
195+
set_name(tail_result);
196+
results.push_back(tail_result);
197+
198+
if (last_block_has_direct_result) {
199+
auto direct_result = std::make_shared<ov::opset11::Result>(current_values);
200+
m_nodes.push_back(direct_result);
201+
set_name(direct_result);
202+
results.push_back(direct_result);
203+
}
204+
205+
ov::ParameterVector params = {input};
206+
207+
return std::make_shared<ov::Model>(results, params);
98208
}
99209

100210
std::shared_ptr<ov::Node> ModelGenerator::get_block(const std::shared_ptr<ov::Node>& input) {

src/plugins/intel_npu/tests/unit/npuw/model_generator/model_generator.hpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,23 @@ class ModelGenerator {
1414
std::shared_ptr<ov::Model> get_model_without_repeated_blocks();
1515
std::shared_ptr<ov::Model> get_model_with_repeated_blocks(std::size_t repetitions);
1616
std::shared_ptr<ov::Model> get_model_with_repeated_blocks();
17+
18+
// Build model with repeating blocks and configurable ov::Result consumers:
19+
// - repetitions: number of repeating blocks
20+
// - block_indices: vector of block indices (0-based) that should have ov::Result consumers
21+
// empty vector means no additional Results, only the final tail Result
22+
std::shared_ptr<ov::Model> get_model_with_repeated_blocks_and_results(
23+
std::size_t repetitions,
24+
const std::vector<std::size_t>& block_indices);
25+
26+
// Build model with repeating blocks where the final op in each block has multiple outputs (TopK values + indices).
27+
// - repetitions: number of repeating blocks
28+
// - last_block_has_direct_result:
29+
// Option1 (false): for all blocks, multi-output node feeds only the next block; last block feeds only the tail
30+
// Option2 (true): same as above, plus the last block also feeds a direct ov::Result from one of its outputs
31+
std::shared_ptr<ov::Model> get_model_with_multi_output_repeating_blocks(
32+
std::size_t repetitions,
33+
bool last_block_has_direct_result);
1734

1835
private:
1936
std::shared_ptr<ov::Node> get_block(const std::shared_ptr<ov::Node>& input);

0 commit comments

Comments
 (0)