[NPUW] Fix eagle3 with chunk prefill #33975
base: master
```diff
@@ -76,8 +76,19 @@ class Eagle3Extension {
     void update_last_hidden_state(const std::shared_ptr<ov::IAsyncInferRequest>& request,
                                   const std::unordered_map<std::string, ov::Output<const ov::Node>>& out_ports);
 
     ov::SoPtr<ov::ITensor> get_hidden_states() const {
         return m_hidden_states;
     }
 
+    // Accumulate last_hidden_state from current chunk during chunked prefill
+    void accumulate_chunk_last_hidden_state(
+        const std::shared_ptr<ov::IAsyncInferRequest>& request,
+        const std::unordered_map<std::string, ov::Output<const ov::Node>>& out_ports,
+        uint32_t chunk_token_count,
+        uint32_t total_seq_len);
+
+    // Reset chunked prefill state before starting a new chunked prefill session
+    // NOTE: m_last_hidden_state holds tensors of different sizes in prefill vs generation phases
+    // Must reset to avoid size mismatch when starting a new prefill after previous generations
+    void reset_chunked_prefill_state() {
+        m_last_hidden_state = {};
```
Contributor:
Why do we need to do this?
Contributor (Author):
Good question. Please consider this scenario, where we run two prompts consecutively using `infer()`:

For the first prompt: after the first prefill completes, the generate phase also updates `m_last_hidden_state`.

For the second prompt: `m_last_hidden_state` still holds the generation-phase tensor, whose size does not match what the new prefill expects.

Given that each prompt inference only prefills once, it's reasonable to reset the tensor here.
Contributor:
So `m_last_hidden_state` gets reallocated on every prefill. Having an allocation per prefill is not a big deal, I think. But we could also have a pre-allocated tensor for the prefill phase and not allocate it every time.
```diff
+        m_chunked_seq_offset = 0;
+    }
 
     ov::SoPtr<ov::ITensor> get_last_hidden_state() const {
```
```diff
@@ -91,6 +102,9 @@ class Eagle3Extension {
     ov::SoPtr<ov::ITensor> m_hidden_states;      ///< Draft model input: hidden_states
     ov::SoPtr<ov::ITensor> m_last_hidden_state;  ///< Draft/Target model output: last_hidden_state
 
+    // For chunked prefill: track the write offset in the pre-allocated tensor
+    uint32_t m_chunked_seq_offset = 0;
 };
 
 }  // namespace npuw
```
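For context, here is a minimal sketch of how the accumulation path could work. The signature and the members `m_last_hidden_state` / `m_chunked_seq_offset` come from the diff above; the body, the `"last_hidden_state"` port key, and the `[seq_len, hidden_dim]` output layout are illustrative assumptions, not the PR's actual implementation.

```cpp
#include <cstring>

// Sketch only: assumes the model's last_hidden_state output is laid out as
// [seq_len, hidden_dim] and that out_ports is keyed by "last_hidden_state".
void Eagle3Extension::accumulate_chunk_last_hidden_state(
        const std::shared_ptr<ov::IAsyncInferRequest>& request,
        const std::unordered_map<std::string, ov::Output<const ov::Node>>& out_ports,
        uint32_t chunk_token_count,
        uint32_t total_seq_len) {
    // Read this chunk's last_hidden_state from the finished request
    auto chunk = request->get_tensor(out_ports.at("last_hidden_state"));  // assumed key
    const size_t hidden_dim = chunk->get_shape().back();
    const size_t elem_size = chunk->get_element_type().size();

    if (!m_last_hidden_state) {
        // First chunk of a new prefill session: pre-allocate one tensor that
        // can hold the hidden states for the entire prompt
        m_last_hidden_state = ov::make_tensor(chunk->get_element_type(),
                                              ov::Shape{total_seq_len, hidden_dim});
    }

    // Append the chunk at the current write offset, then advance the offset
    // (any padding inside the chunk is ignored in this sketch)
    auto* dst = static_cast<uint8_t*>(m_last_hidden_state->data()) +
                m_chunked_seq_offset * hidden_dim * elem_size;
    std::memcpy(dst, chunk->data(), size_t(chunk_token_count) * hidden_dim * elem_size);
    m_chunked_seq_offset += chunk_token_count;
}
```

A hypothetical call pattern for one chunked prefill session ties the reset discussed in the review thread to the per-chunk accumulation; `eagle3`, `chunk_size`, and the elided prefill call are placeholders:

```cpp
// Before the first chunk of a new prompt: drop any generation-phase tensor
// left over from the previous request and rewind the write offset
eagle3.reset_chunked_prefill_state();

for (uint32_t off = 0; off < total_seq_len; off += chunk_size) {
    const uint32_t n = std::min(chunk_size, total_seq_len - off);
    // ... run the prefill request on tokens [off, off + n) ...
    eagle3.accumulate_chunk_last_hidden_state(request, out_ports, n, total_seq_len);
}
```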