Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions src/plugins/intel_npu/src/plugin/npuw/llm_eagle3_extension.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,72 @@ void Eagle3Extension::update_last_hidden_state(
<< ": Retrieved last_hidden_state output tensor");
}

void Eagle3Extension::accumulate_chunk_last_hidden_state(
    const std::shared_ptr<ov::IAsyncInferRequest>& request,
    const std::unordered_map<std::string, ov::Output<const ov::Node>>& out_ports,
    uint32_t chunk_token_count,
    uint32_t total_seq_len) {
    // No-op when this extension is not attached to an Eagle3 model.
    if (m_role == Eagle3ModelRole::None) {
        return;
    }

    // Locate the model's last_hidden_state output port.
    const auto port_it = out_ports.find(Eagle3LayerNames::last_hidden_state);
    OPENVINO_ASSERT(port_it != out_ports.end(), "Eagle3 model must have last_hidden_state output port");

    auto chunk_tensor = request->get_tensor(port_it->second);
    const auto& out_shape = chunk_tensor->get_shape();
    OPENVINO_ASSERT(out_shape.size() == 3, "last_hidden_state must have 3 dimensions: [batch, seq_len, hidden_size]");

    const auto batch = static_cast<uint32_t>(out_shape[0]);
    const auto padded_seq_len = static_cast<uint32_t>(out_shape[1]);
    const auto hidden = static_cast<uint32_t>(out_shape[2]);

    OPENVINO_ASSERT(batch == 1, "Batch size must be 1 for Eagle3");
    OPENVINO_ASSERT(chunk_token_count <= padded_seq_len, "chunk_token_count must be <= chunk_seq_len");

    // Lazily allocate the accumulator for the full prompt on the first chunk
    // and rewind the write offset.
    if (!m_last_hidden_state) {
        m_last_hidden_state = ov::get_tensor_impl(
            ov::Tensor(chunk_tensor->get_element_type(), ov::Shape{batch, total_seq_len, hidden}));
        m_chunked_seq_offset = 0;

        LOG_VERB("Eagle3: Pre-allocated last_hidden_state tensor with shape=[" << batch << "," << total_seq_len
                                                                               << "," << hidden << "]");
    }

    const auto accumulated_len = static_cast<uint32_t>(m_last_hidden_state->get_shape()[1]);

    // Guard against a stale accumulator (e.g. left over from a previous
    // prompt) and against writing past its end.
    OPENVINO_ASSERT(accumulated_len == total_seq_len,
                    "Pre-allocated tensor size (" + std::to_string(accumulated_len) + ") must match total_seq_len (" +
                        std::to_string(total_seq_len) + ")");
    OPENVINO_ASSERT(m_chunked_seq_offset + chunk_token_count <= accumulated_len,
                    "Can't write chunk by stored chunked sequence offset and requested number of tokens, as it will "
                    "exceed pre-allocated size");

    // The chunk output is right-aligned (padding on the left), so only the
    // last chunk_token_count tokens along the sequence dimension are valid.
    constexpr uint32_t seq_dim = 1;
    const uint32_t src_begin = padded_seq_len - chunk_token_count;
    auto src_view = util::make_tensor_slice(chunk_tensor, seq_dim, src_begin, padded_seq_len);

    const uint32_t dst_begin = m_chunked_seq_offset;
    const uint32_t dst_end = dst_begin + chunk_token_count;
    auto dst_view = util::make_tensor_slice(m_last_hidden_state, seq_dim, dst_begin, dst_end);

    src_view->copy_to(dst_view._ptr);

    LOG_VERB("Eagle3: Copied chunk [" << src_begin << ":" << padded_seq_len << "] to position ["
                                      << dst_begin << ":" << dst_end
                                      << "], " << chunk_token_count << " tokens");

    // Advance the write cursor for the next chunk.
    m_chunked_seq_offset += chunk_token_count;
}

void Eagle3Extension::prepare_inputs_for_chunk(
const std::shared_ptr<ov::IAsyncInferRequest>& request,
const std::unordered_map<std::string, ov::Output<const ov::Node>>& in_ports,
Expand Down
18 changes: 16 additions & 2 deletions src/plugins/intel_npu/src/plugin/npuw/llm_eagle3_extension.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,19 @@ class Eagle3Extension {
void update_last_hidden_state(const std::shared_ptr<ov::IAsyncInferRequest>& request,
const std::unordered_map<std::string, ov::Output<const ov::Node>>& out_ports);

ov::SoPtr<ov::ITensor> get_hidden_states() const {
return m_hidden_states;
// Accumulate last_hidden_state from current chunk during chunked prefill
void accumulate_chunk_last_hidden_state(
const std::shared_ptr<ov::IAsyncInferRequest>& request,
const std::unordered_map<std::string, ov::Output<const ov::Node>>& out_ports,
uint32_t chunk_token_count,
uint32_t total_seq_len);

// Reset chunked prefill state before starting a new chunked prefill session
// NOTE: m_last_hidden_state holds tensors of different sizes in prefill vs generation phases
// Must reset to avoid size mismatch when starting a new prefill after previous generations
void reset_chunked_prefill_state() {
m_last_hidden_state = {};
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why we need to do this?
It means on each prefill stage we are allocating a new tensor. Why?

Copy link
Contributor Author

@GuoliangShiIntel GuoliangShiIntel Feb 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good question. Please consider this scenario:

If we run two prompts consecutively using infer:

For the first prompt: m_last_hidden_state is null -> pre-allocate tensor for the full tensor -> copy each chunk's last_hidden_state into pre-allocated memory

After the first prefill completes, the generate phase also updates m_last_hidden_state. When the generate phase finishes, m_last_hidden_state remains non-null.

For the second prompt: Since m_last_hidden_state is still non-null, prefill will not enter the "Pre-allocate tensor on first chunk" path, causing a memory size mismatch that triggers the assertion.

Given that each prompt inference only prefills once, it's reasonable to reset the tensor here.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So m_last_hidden_state points to different tensors in the prefill and generate phases?
It would be nice to have an explanatory comment here.

Having allocation per prefill is not a big deal I think. But we also can have pre-allocated tensor for prefill phase and not allocate it every time.

m_chunked_seq_offset = 0;
}

ov::SoPtr<ov::ITensor> get_last_hidden_state() const {
Expand All @@ -91,6 +102,9 @@ class Eagle3Extension {

ov::SoPtr<ov::ITensor> m_hidden_states; ///< Draft model input: hidden_states
ov::SoPtr<ov::ITensor> m_last_hidden_state; ///< Draft/Target model output: last_hidden_state

// For chunked prefill: track the write offset in the pre-allocated tensor
uint32_t m_chunked_seq_offset = 0;
};

} // namespace npuw
Expand Down
16 changes: 15 additions & 1 deletion src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -631,6 +631,10 @@ void ov::npuw::LLMInferRequest::infer_chunked_prefill(ov::SoPtr<ov::ITensor> inp
remaining_prompts = cache_context.remaining_prompts;
}

if (m_eagle3_ext.is_eagle3_model()) {
m_eagle3_ext.reset_chunked_prefill_state();
}

while (remaining_prompts > 0) {
// NB: input_ids can be either fp32(VLM) or i64(LLM)
// The last chunk may not be completely filled if the actual length of the prompts is not evenly divisible by
Expand Down Expand Up @@ -704,6 +708,14 @@ void ov::npuw::LLMInferRequest::infer_chunked_prefill(ov::SoPtr<ov::ITensor> inp

m_prefill_request->infer();

// Accumulate Eagle3 last_hidden_state from this chunk
if (m_eagle3_ext.is_eagle3_model()) {
m_eagle3_ext.accumulate_chunk_last_hidden_state(m_prefill_request,
m_prefill_out_ports,
static_cast<uint32_t>(current_prompts_len),
static_cast<uint32_t>(input_prompt_len));
}

if (enable_prefix_caching) {
m_prefix_caching_helper->store_computed_blocks(current_prompts_len,
cache_context.prompt_hashes,
Expand Down Expand Up @@ -817,7 +829,9 @@ void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr<ov::ITensor> input_ids,
m_logits = m_prefill_request->get_tensor(m_prefill_out_ports.at(layer_names::logits));
}

if (m_eagle3_ext.is_eagle3_model()) {
// Update last_hidden_state only for non-chunked prefill
// For chunked prefill, accumulate_chunk_last_hidden_state() already set the tensor
if (m_eagle3_ext.is_eagle3_model() && !use_chunk_prefill) {
m_eagle3_ext.update_last_hidden_state(m_prefill_request, m_prefill_out_ports);
}

Expand Down
Loading