
Commit 223048f

dmatveev, intelgaoxiong, and smirnov-alexey authored
NPUW: Dynamic attention (#32000)
### Details:
- This PR is currently decomposed into the smaller dependency parts (merged):
  - #32268
  - #32270
  - #32283
  - #32284

### Tickets:
- EISW-182648

Signed-off-by: intelgaoxiong <[email protected]>
Co-authored-by: intelgaoxiong <[email protected]>
Co-authored-by: Alexey Smirnov <[email protected]>
1 parent 7bcd29a commit 223048f

29 files changed: +1040 −178 lines

src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp

Lines changed: 3 additions & 0 deletions
@@ -117,6 +117,8 @@ DEFINE_OPT(NPUW_SPATIAL, bool, false, npuw::partitioning::spatial, RunTime);
 DEFINE_OPT(NPUW_F16IC, bool, true, npuw::partitioning::f16_interconnect, RunTime);
 DEFINE_OPT(NPUW_SPATIAL_NWAY, std::size_t, 128, npuw::partitioning::spatial_nway, RunTime);
 DEFINE_OPT(NPUW_SPATIAL_DYN, bool, true, npuw::partitioning::spatial_dyn, RunTime);
+DEFINE_OPT(NPUW_ATTN_DYN, bool, true, npuw::partitioning::attn_dyn, RunTime);
+DEFINE_OPT(NPUW_ATTN_NO_COPY, bool, false, npuw::partitioning::attn_no_copy, RunTime);
 DEFINE_OPT(NPUW_DCOFF_TYPE, std::string, "", npuw::partitioning::dcoff_type, RunTime);
 DEFINE_OPT(NPUW_DCOFF_SCALE, bool, false, npuw::partitioning::dcoff_with_scale, RunTime);
 DEFINE_OPT(NPUW_FUNCALL_FOR_ALL, bool, false, npuw::partitioning::funcall_for_all, RunTime);
@@ -126,6 +128,7 @@ DEFINE_OPT(NPUW_WEIGHTS_BANK_ALLOC, std::string, "", npuw::weights_bank_alloc, R
 DEFINE_OPT(NPUW_CACHE_DIR, std::string, "", npuw::cache_dir, RunTime);
 DEFINE_OPT(NPUW_FUNCALL_ASYNC, bool, false, npuw::funcall_async, RunTime);
 DEFINE_OPT(NPUW_UNFOLD_IREQS, bool, false, npuw::unfold_ireqs, RunTime);
+DEFINE_OPT(NPUW_FALLBACK_EXEC, bool, true, npuw::fallback_exec, RunTime);
 DEFINE_OPT(NPUW_ACC_CHECK, bool, false, npuw::accuracy::check, RunTime);
 DEFINE_OPT(NPUW_ACC_THRESH, double, 0.01, npuw::accuracy::threshold, RunTime);
 DEFINE_OPT(NPUW_ACC_DEVICE, std::string, "", npuw::accuracy::reference_device, RunTime);

src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp

Lines changed: 24 additions & 0 deletions
@@ -234,6 +234,22 @@ static constexpr ov::Property<std::size_t> spatial_nway{"NPUW_SPATIAL_NWAY"};
  */
 static constexpr ov::Property<bool> spatial_dyn{"NPUW_SPATIAL_DYN"};
 
+/**
+ * @brief
+ * Type: boolean.
+ * Enable dynamic dispatch for the attention block, if detected
+ * Default value: true
+ */
+static constexpr ov::Property<bool> attn_dyn{"NPUW_ATTN_DYN"};
+
+/**
+ * @brief
+ * Type: boolean.
+ * Force no-copy mode for the attention block, if detected
+ * Default value: false
+ */
+static constexpr ov::Property<bool> attn_no_copy{"NPUW_ATTN_NO_COPY"};
+
 /**
  * @brief
  * Type: boolean
@@ -303,6 +319,14 @@ static constexpr ov::Property<bool> funcall_async{"NPUW_FUNCALL_ASYNC"};
  */
 static constexpr ov::Property<bool> unfold_ireqs{"NPUW_UNFOLD_IREQS"};
 
+/**
+ * @brief
+ * Type: boolean
+ * Fall back in case of a runtime failure
+ * Default value: true.
+ */
+static constexpr ov::Property<bool> fallback_exec{"NPUW_FALLBACK_EXEC"};
+
 namespace accuracy {
 /**
  * @brief
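For context, a minimal sketch of how these new knobs could be passed at compile time. This is an illustration, not part of this commit: the "NPU_USE_NPUW" enabling switch and the YES/NO value form are assumptions based on the existing NPUW configuration flow, and the model path is hypothetical.

#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    auto model = core.read_model("model.xml");  // hypothetical model path

    ov::AnyMap config = {
        {"NPU_USE_NPUW", "YES"},        // assumed NPUW enabling switch
        {"NPUW_ATTN_DYN", "YES"},       // dynamic attention dispatch (default: true)
        {"NPUW_ATTN_NO_COPY", "NO"},    // no-copy mode for attention (default: false)
        {"NPUW_FALLBACK_EXEC", "YES"},  // fall back on runtime failure (default: true)
    };
    auto compiled = core.compile_model(model, "NPU", config);
    return 0;
}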

src/plugins/intel_npu/src/al/src/config/npuw.cpp

Lines changed: 3 additions & 0 deletions
@@ -33,6 +33,8 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) {
     desc.add<NPUW_SPATIAL>();
     desc.add<NPUW_SPATIAL_NWAY>();
     desc.add<NPUW_SPATIAL_DYN>();
+    desc.add<NPUW_ATTN_DYN>();
+    desc.add<NPUW_ATTN_NO_COPY>();
     desc.add<NPUW_HOST_GATHER>();
     desc.add<NPUW_F16IC>();
     desc.add<NPUW_DCOFF_TYPE>();
@@ -41,6 +43,7 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) {
     desc.add<NPUW_PARALLEL_COMPILE>();
     desc.add<NPUW_FUNCALL_ASYNC>();
     desc.add<NPUW_UNFOLD_IREQS>();
+    desc.add<NPUW_FALLBACK_EXEC>();
     desc.add<NPUW_WEIGHTS_BANK>();
     desc.add<NPUW_WEIGHTS_BANK_ALLOC>();
     desc.add<NPUW_CACHE_DIR>();

src/plugins/intel_npu/src/plugin/include/properties.hpp

Lines changed: 1 addition & 0 deletions
@@ -107,6 +107,7 @@ class Properties final {
         ov::intel_npu::npuw::partitioning::funcall_for_all.name(),
         ov::intel_npu::npuw::funcall_async.name(),
         ov::intel_npu::npuw::unfold_ireqs.name(),
+        ov::intel_npu::npuw::fallback_exec.name(),
         ov::intel_npu::npuw::llm::enabled.name(),
         ov::intel_npu::npuw::llm::batch_dim.name(),
         ov::intel_npu::npuw::llm::seq_len_dim.name(),
src/plugins/intel_npu/src/plugin/npuw/attention.cpp
Lines changed: 208 additions & 0 deletions
@@ -0,0 +1,208 @@
+// Copyright (C) 2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "attention.hpp"
+
+#include "openvino/op/broadcast.hpp"
+#include "openvino/op/scaled_dot_product_attention.hpp"
+#include "openvino/op/util/op_types.hpp"  // is_parameter
+#include "util.hpp"
+
+namespace {
+enum class SDPA_Inputs : std::size_t { Q = 0, K, V, M, NUM_REQUIRED };
+}
+
+std::optional<ov::npuw::function::Attention> ov::npuw::function::Attention::from(
+    const std::shared_ptr<ov::Model>& model) {
+    ov::npuw::function::Attention dyn;
+
+    // Find the mask input (also sizeable). FIXME: We know too much at this point
+    auto ops = model->get_ordered_ops();
+    auto sdpa_iter = std::find_if(ops.begin(), ops.end(), [](auto&& node_ptr) {
+        return ov::is_type<ov::op::v13::ScaledDotProductAttention>(node_ptr);
+    });
+    if (sdpa_iter == ops.end()) {
+        LOG_WARN("SDPA is not found in the attn subgraph!");
+        return std::nullopt;
+    }
+
+    // Traverse the SDPA's mask input upwards to find the proper Parameter.
+    // Only unary ops are allowed along the way
+    auto sdpa_node = *sdpa_iter;
+    NPUW_ASSERT(sdpa_node->inputs().size() >= util::_v(SDPA_Inputs::NUM_REQUIRED));
+
+    auto mask_in_node = sdpa_node->inputs()[util::_v(SDPA_Inputs::M)].get_source_output().get_node_shared_ptr();
+    while (mask_in_node && !ov::op::util::is_parameter(mask_in_node)) {
+        if (mask_in_node->inputs().size() != 1) {
+            LOG_WARN("Non-unary or disconnected op on the way from SDPA to the input mask");
+            return std::nullopt;
+        }
+        mask_in_node = mask_in_node->inputs()[0].get_source_output().get_node_shared_ptr();
+    }
+    NPUW_ASSERT(ov::op::util::is_parameter(mask_in_node));
+    dyn._mask = std::static_pointer_cast<ov::op::v0::Parameter>(mask_in_node);
+    dyn._mask_shape = dyn._mask->get_shape();
+
+    // Find the attention inputs with a dynamic range
+    const auto& f_params = model->get_parameters();
+    NPUW_ASSERT(f_params.size() > 0);
+
+    auto find_context_dim = [&](const auto& param, auto&& f) {
+        const auto& param_shape = param->get_shape();
+        // Look for the dynamic parameter size - the past size in this case.
+        // With our approach it is context_size - query_size
+        auto past_len = dyn.context_len() - dyn.query_len();
+        auto dim_iter = std::find(param_shape.begin(), param_shape.end(), past_len);
+        if (dim_iter == param_shape.end()) {
+            // No such dim found
+            return false;
+        }
+        if (std::find(dim_iter + 1, param_shape.end(), past_len) != param_shape.end()) {
+            // There must be no other such dim
+            return false;
+        }
+        f(std::distance(param_shape.begin(), dim_iter));
+        return true;
+    };
+
+    for (auto&& param : f_params) {
+        // A bad test but it is what it is
+        if (ov::npuw::util::starts_with(param->get_friendly_name(), "past")) {
+            if (!find_context_dim(param, [&](std::size_t dim_idx) {
+                    dyn._inputs.push_back(ov::npuw::function::Attention::Param{param, dim_idx});
+                })) {
+                LOG_WARN("Couldn't identify the SDPA parameter's dynamic dimension");
+                return std::nullopt;
+            }
+        }
+    }  // for(f_params)
+
+    // There must be exactly two inputs found, for past_k and past_v
+    if (dyn._inputs.size() != 2u || !dyn._mask) {
+        return std::nullopt;
+    }
+
+    // Apply the transformation to the model. Note: only the function body is modified.
+    // Accumulate the reshape map
+    std::map<ov::Output<ov::Node>, ov::PartialShape> new_shapes;
+    for (auto&& p : dyn._inputs) {
+        ov::PartialShape dyn_shape = p.param->get_shape();  // Here it is yet static..
+        dyn_shape[p.dim] = ov::Dimension();                 // ..and now it is dynamic
+        new_shapes[p.param->output(0)] = std::move(dyn_shape);
+    }
+    // Mask
+    {
+        ov::PartialShape dyn_shape = dyn._mask_shape;
+        // Make the mask's innermost dimension dynamic
+        *dyn_shape.rbegin() = ov::Dimension();
+        new_shapes[dyn._mask->output(0)] = std::move(dyn_shape);
+    }
+    model->reshape(new_shapes);
+
+    // Patch Broadcast constants if there are any. If there's a Broadcast in the
+    // attention block, its shape argument is normally a precomputed Const (which
+    // would be an expression/a subgraph in the original dynamic IR). Since we
+    // retrofit dynamism into a static-shape environment here, we need to patch it back.
+    for (auto&& op : model->get_ordered_ops()) {
+        if (!ov::is_type<ov::op::v3::Broadcast>(op)) {
+            continue;
+        }
+        // Inspect the constant
+        auto shape_source = op->input(1).get_source_output().get_node_shared_ptr();
+        if (!ov::is_type<ov::op::v0::Constant>(shape_source)) {
+            LOG_WARN("SDPA Broadcast's 2nd input is not Const: " << shape_source << ", skipping");
+            continue;
+        }
+
+        auto shape_const = std::dynamic_pointer_cast<ov::op::v0::Constant>(shape_source);
+        auto shape_values = shape_const->cast_vector<int32_t>();
+        for (auto&& d : shape_values) {
+            // Assume the context length is the mask's innermost dimension
+            if (static_cast<std::size_t>(d) == dyn.context_len()) {
+                d = 1;
+            }
+        }
+        auto new_const = std::make_shared<ov::op::v0::Constant>(shape_const->get_element_type(),
+                                                                shape_const->get_shape(),
+                                                                shape_values);
+        op->input(1).replace_source_output(new_const);
+    }
+    model->validate_nodes_and_infer_types();
+
+    return {std::move(dyn)};
+}
+
+ov::npuw::runtime::attention::PositionIDs::PositionIDs(std::size_t param_idx,
+                                                       const ov::npuw::compiled::Attention& d,
+                                                       const ov::ISyncInferRequest& rq)
+    : m_position_ids_idx(param_idx),
+      m_d(d),
+      m_rq(rq) {
+    // FIXME: speculative decode is indistinguishable at this point!
+    m_case = m_d.query_size == 1 ? Case::GENERATE : Case::PREFILL;
+}
+
+ov::npuw::runtime::attention::Selector::Ptr ov::npuw::runtime::attention::PositionIDs::find(
+    const ov::npuw::compiled::Attention& d,
+    const ov::ISyncInferRequest& rq) {
+    auto is_position_ids = [](const ov::Output<const ov::Node>& p) {
+        const auto& shape = p.get_shape();
+        // FIXME: 2D/3D position IDs are not supported here YET
+        return p.get_node()->get_friendly_name() == "position_ids" &&
+               (shape.size() == 1 || (shape.size() == 2 && shape[0] == 1));
+    };
+
+    const auto& inputs = rq.get_inputs();
+    auto pos_ids_iter = std::find_if(inputs.begin(), inputs.end(), is_position_ids);
+    if (pos_ids_iter != inputs.end()) {
+        const auto param_idx = std::distance(inputs.begin(), pos_ids_iter);
+        return Selector::Ptr{new PositionIDs(param_idx, d, rq)};
+    }
+    return Selector::Ptr{};
+}
+
+void ov::npuw::runtime::attention::PositionIDs::prepare() {
+    const auto& iport = m_rq.get_compiled_model()->inputs()[m_position_ids_idx];
+    const auto in_tensor = m_rq.get_tensor(iport);
+    const auto in_dims = in_tensor->get_shape();
+
+    // There are several possible cases:
+    // a. Prefill input_ids, including chunk
+    // b. Generate input_ids, 1
+    // c. Generate input_ids, N (speculative)
+    // Prefill (even chunked) is left-padded, so for (a) it's enough to take the last element.
+    // The same works for (b) (there's no choice).
+    // (c) may require traversing the tensor backwards, as Generate with N>1 is right-padded (?)
+
+    auto* pos_data_ptr = in_tensor->data<int64_t>();
+    // Use a signed index: an unsigned one would wrap around below 0 and never terminate
+    for (int64_t idx = static_cast<int64_t>(in_dims.back()) - 1; idx >= 0; idx--) {
+        if (pos_data_ptr[idx] > 0) {
+            // Initialize fields
+            m_current_length = pos_data_ptr[idx];
+            switch (m_case) {
+            case Case::GENERATE:
+                // decode case, we have pos_id-1 past elements to take from kvcache
+                m_past_length = m_current_length;
+                break;
+            case Case::PREFILL:
+                // chunked prefill case: calculate the past_length in full chunks.
+                // FIXME: We know too much about chunking here
+                m_past_length = (m_current_length / m_d.query_size) * m_d.query_size;
+                break;
+            default:
+                NPUW_ASSERT(false && "Reached the unreachable code");
+            }
+            return;
+        }
+    }
+    LOG_WARN("Dynamic selector - no data found in the feature?");
+    m_current_length = -1;
+}
+
+int64_t ov::npuw::runtime::attention::PositionIDs::length() const {
+    return m_current_length;
+}
+
+int64_t ov::npuw::runtime::attention::PositionIDs::past_length() const {
+    return m_past_length;
+}
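The key move in Attention::from above is retrofitting dynamism into a statically-shaped function body: collect the affected Parameter outputs, mark one dimension dynamic, and reshape the model in one call. A standalone sketch of that pattern follows; the function name, the fixed dimension index, and the "past" name filter are illustrative assumptions mirroring the code above, not part of this commit.

#include <map>
#include <memory>
#include <openvino/openvino.hpp>

// Minimal sketch: make the 'past length' dimension of every past_* input
// dynamic, the same way Attention::from does for past_k/past_v.
void make_past_dims_dynamic(const std::shared_ptr<ov::Model>& model, std::size_t past_dim) {
    std::map<ov::Output<ov::Node>, ov::PartialShape> new_shapes;
    for (auto&& param : model->get_parameters()) {
        if (param->get_friendly_name().rfind("past", 0) != 0) {
            continue;  // same name-based filter as above: past_k/past_v only
        }
        ov::PartialShape shape = param->get_shape();  // static at this point
        shape[past_dim] = ov::Dimension();            // now dynamic in that dim
        new_shapes[param->output(0)] = std::move(shape);
    }
    model->reshape(new_shapes);  // one reshape call covers all inputs
    model->validate_nodes_and_infer_types();
}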

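The chunk arithmetic in PositionIDs::prepare is worth a worked example: in PREFILL mode the past length is the current position rounded down to a whole number of query_size-token chunks. A self-contained illustration with made-up values:

#include <cassert>
#include <cstdint>

// Past-length arithmetic as in PositionIDs::prepare (PREFILL branch)
int64_t past_length_prefill(int64_t current_length, int64_t query_size) {
    return (current_length / query_size) * query_size;  // round down to full chunks
}

int main() {
    // With a 128-token chunk, position 300 means two full chunks (256 tokens)
    // already sit in the KV-cache; the current chunk covers the remainder.
    assert(past_length_prefill(300, 128) == 256);
    // GENERATE is effectively query_size == 1: past length equals the position id
    assert(past_length_prefill(42, 1) == 42);
    return 0;
}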