@@ -274,6 +274,27 @@ std::pair<uint32_t, uint32_t> get_lora_dims_by_name(const std::string& state_name)
     return std::make_pair(low_rank_dim, full_rank_dim);
 }
 
+void copy_to_right(const ov::SoPtr<ov::ITensor>& src, const ov::SoPtr<ov::ITensor>& dst) {
+    OPENVINO_ASSERT(src->get_byte_size() <= dst->get_byte_size());
+    std::copy_n(reinterpret_cast<uint8_t*>(src->data()),
+                src->get_byte_size(),
+                reinterpret_cast<uint8_t*>(dst->data()) + dst->get_byte_size() - src->get_byte_size());
+}
+
+void fill_sliding_mask(const ov::SoPtr<ov::ITensor>& mask, int64_t curr_pos, int64_t window_size) {
+    auto start = curr_pos - window_size;
+    auto end = curr_pos;
+
+    auto* mask_data = mask->data<bool>();
+    for (int64_t i = 0; i < static_cast<int64_t>(mask->get_size()); ++i) {
+        // Unlike the original subgraph, which does i <= end, we exclude end here,
+        // as it is the new token and sits in the last position of the mask buffer
+        mask_data[i] = i > start && i < end;
+    }
+
+    mask_data[mask->get_size() - 1] = true;
+}
+
 constexpr uint32_t INPUT_IDS_SEQ_LEN_DIM = 1;
 
 constexpr std::size_t kStartOutputKVCacheLayers = 1;
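Note: copy_to_right() is used throughout the patch to right-align a smaller tensor into the tail of a padded, fixed-shape input. A minimal sketch of the same byte-level move, using plain vectors as stand-ins for ov::ITensor so it builds without OpenVINO:

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    int main() {
        // A 3-byte source lands in the last 3 bytes of an 8-byte destination,
        // mirroring copy_to_right(): the head of dst is left untouched.
        std::vector<uint8_t> src = {7, 8, 9};
        std::vector<uint8_t> dst(8, 0);
        std::copy_n(src.data(), src.size(), dst.data() + dst.size() - src.size());
        // dst == {0, 0, 0, 0, 0, 7, 8, 9}
    }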
@@ -380,6 +401,7 @@ ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCompiledModel>
     }
 
     m_generate_initialized = false;
+    m_gemma_sliding_window_size = compiled_model->m_gemma_sliding_window_size;
 }
 
 void ov::npuw::LLMInferRequest::init_tensor(const ov::Output<const ov::Node>& port) {
@@ -498,6 +520,10 @@ void ov::npuw::LLMInferRequest::apply_lora() {
 
 void ov::npuw::LLMInferRequest::prepare_for_new_conversation() {
     fill_tensor_bytes(m_prefill_request->get_tensor(m_prefill_in_ports.at(m_input_ids_name)), 0u);
+    if (auto type_ids_port = m_prefill_in_ports.find(layer_names::token_type_ids);
+        type_ids_port != m_prefill_in_ports.end()) {
+        fill_tensor_bytes(m_prefill_request->get_tensor(type_ids_port->second), 0u);
+    }
     fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at(layer_names::attention_mask)), 0);
     fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at(layer_names::position_ids)), 0);
     m_npuw_llm_compiled_model->m_kvcache_desc.num_stored_tokens = 0u;
@@ -586,8 +612,8 @@ void ov::npuw::LLMInferRequest::copy_kvcache() {
 
 void ov::npuw::LLMInferRequest::update_kvcache_for(
     std::shared_ptr<ov::IAsyncInferRequest> request,
-    std::unordered_map<std::string, ov::Output<const ov::Node>> in_ports,
-    std::unordered_map<std::string, ov::Output<const ov::Node>> out_ports,
+    const std::unordered_map<std::string, ov::Output<const ov::Node>>& in_ports,
+    const std::unordered_map<std::string, ov::Output<const ov::Node>>& out_ports,
    uint32_t num_tokens,
     bool v_transposed) {
     LOG_DEBUG("Store computed key and values for passed number of tokens in the input kv-cache"
@@ -750,7 +776,8 @@ void ov::npuw::LLMInferRequest::infer_chunked_prefill(ov::SoPtr<ov::ITensor> input_ids,
 
 void ov::npuw::LLMInferRequest::infer_whole_prefill(ov::SoPtr<ov::ITensor> input_ids,
                                                     ov::SoPtr<ov::ITensor> attention_mask,
-                                                    ov::SoPtr<ov::ITensor> position_ids) {
+                                                    ov::SoPtr<ov::ITensor> position_ids,
+                                                    ov::SoPtr<ov::ITensor> token_type_ids) {
     LOG_DEBUG("Calling inference for prefill model in a single launch.");
     LOG_BLOCK();
 
@@ -767,6 +794,13 @@ void ov::npuw::LLMInferRequest::infer_whole_prefill(ov::SoPtr<ov::ITensor> input_ids,
                 attention_mask->get_size(),
                 padded_attention_mask->data<int64_t>() + padded_attention_mask->get_size() - attention_mask->get_size());
 
+    if (token_type_ids) {
+        auto padded_token_type_ids = m_prefill_request->get_tensor(m_prefill_in_ports.at(layer_names::token_type_ids));
+
+        std::fill_n(reinterpret_cast<uint8_t*>(padded_token_type_ids->data()), token_type_ids->get_byte_size(), 0);
+        copy_to_right(token_type_ids, padded_token_type_ids);
+    }
+
     auto padded_position_ids = m_prefill_request->get_tensor(m_prefill_in_ports.at(layer_names::position_ids));
     pad_position_ids(padded_position_ids, position_ids);
 
@@ -779,7 +813,8 @@ void ov::npuw::LLMInferRequest::infer_whole_prefill(ov::SoPtr<ov::ITensor> input_ids,
 
 void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr<ov::ITensor> input_ids,
                                               ov::SoPtr<ov::ITensor> attention_mask,
-                                              ov::SoPtr<ov::ITensor> position_ids) {
+                                              ov::SoPtr<ov::ITensor> position_ids,
+                                              ov::SoPtr<ov::ITensor> token_type_ids) {
     LOG_DEBUG("Calling inference for prefill model...");
     LOG_BLOCK();
 
@@ -795,9 +830,12 @@ void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr<ov::ITensor> input_ids,
 
     const bool use_chunk_prefill = m_npuw_llm_compiled_model->m_use_chunk_prefill;
     if (use_chunk_prefill) {
+        OPENVINO_ASSERT(m_gemma_sliding_window_size == 0,
+                        "Chunking is not implemented for the Gemma model family yet. "
+                        "Please set NPUW_LLM_PREFILL_HINT to 'STATIC'");
         infer_chunked_prefill(input_ids, attention_mask, position_ids);
     } else {
-        infer_whole_prefill(input_ids, attention_mask, position_ids);
+        infer_whole_prefill(input_ids, attention_mask, position_ids, token_type_ids);
     }
 
     if (m_lm_head_request) {
if (m_lm_head_request) {
@@ -815,7 +853,8 @@ void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr<ov::ITensor> input_ids,
 
 void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
                                                ov::SoPtr<ov::ITensor> attention_mask,
-                                               ov::SoPtr<ov::ITensor> position_ids) {
+                                               ov::SoPtr<ov::ITensor> position_ids,
+                                               ov::SoPtr<ov::ITensor> token_type_ids) {
     LOG_DEBUG("Calling inference for generate model...");
     LOG_BLOCK();
     auto& kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc;
@@ -834,6 +873,9 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
         fill_tensor_bytes(m_kvcache_request->get_tensor(m_kvcache_in_ports.at(m_input_ids_name)), 0u);
         fill_tensor<int64_t>(m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::attention_mask)), 0);
         fill_tensor<int64_t>(m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::position_ids)), 0);
+        if (token_type_ids) {
+            fill_tensor<int64_t>(m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::token_type_ids)), 0);
+        }
         m_generate_initialized = true;
     }
 
@@ -842,6 +884,14 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
         OPENVINO_THROW("KV-Cache is full.");
     }
 
+    if (auto sliding_mask_port = m_kvcache_in_ports.find(layer_names::gemma_sliding_mask);
+        sliding_mask_port != m_kvcache_in_ports.end()) {
+        // TODO: Fill once and update on each iteration instead
+        fill_sliding_mask(m_kvcache_request->get_tensor(sliding_mask_port->second),
+                          kvcache_desc.num_stored_tokens + input_tokens_len,
+                          m_gemma_sliding_window_size);
+    }
+
     // FIXME: these tensors should be shared between the parent & child models
     // NB: input_ids can be either fp32(VLM) or i64(LLM)
     auto kv_input_ids = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(m_input_ids_name));
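Note: to illustrate what the fill_sliding_mask() call above produces per decode step, here is a standalone sketch of the same predicate over a plain buffer (the 12-slot mask and window of 4 are illustrative, not the model's real sizes):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Same predicate as fill_sliding_mask(), applied to a plain buffer.
    void sketch_mask(std::vector<bool>& mask, int64_t curr_pos, int64_t window) {
        const int64_t start = curr_pos - window;
        for (int64_t i = 0; i < static_cast<int64_t>(mask.size()); ++i) {
            mask[i] = i > start && i < curr_pos;
        }
        mask.back() = true;  // the freshly generated token, last slot of the buffer
    }

    int main() {
        std::vector<bool> mask(12, false);
        // Three consecutive single-token steps: 9, 10, 11 tokens already stored.
        for (int64_t stored = 9; stored <= 11; ++stored) {
            sketch_mask(mask, stored + 1, /*window=*/4);
            std::printf("stored=%2lld: ", static_cast<long long>(stored));
            for (bool b : mask) std::printf("%d", static_cast<int>(b));
            std::printf("\n");  // the run of true slots slides right each step
        }
    }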
@@ -854,6 +904,11 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
                 input_ids->get_byte_size(),
                 reinterpret_cast<uint8_t*>(kv_input_ids->data()) + kv_input_ids->get_byte_size() - input_ids->get_byte_size());
 
+    if (token_type_ids) {
+        auto kv_token_type_ids = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::token_type_ids));
+        copy_to_right(token_type_ids, kv_token_type_ids);
+    }
+
     // NOTE: Attention mask pattern for generate model requires the set of "1"
     // units of length of the current prompt on the right (for present
     // kv layers) and the set of "1" units of number of previously calculated
@@ -912,12 +967,28 @@ void ov::npuw::LLMInferRequest::infer() {
     // FIXME: position_ids might be optional for some models!
     auto position_ids = get_tensor(find_port_by_name(inputs, layer_names::position_ids).value());
 
+    auto token_type_ids = ov::npuw::util::TensorPtr();
+
+    if (auto type_ids_port = find_port_by_name(inputs, layer_names::token_type_ids); type_ids_port.has_value()) {
+        token_type_ids = get_tensor(type_ids_port.value());
+    }
+
     // NB: For VLM, the "inputs_embeds" contains float values (embeddings)
     OPENVINO_ASSERT(ov::element::f32 == input_ids->get_element_type() ||
                     ov::element::i64 == input_ids->get_element_type());
     OPENVINO_ASSERT(ov::element::i64 == attention_mask->get_element_type());
     OPENVINO_ASSERT(ov::element::i64 == position_ids->get_element_type());
 
+    if (m_first_run) {
+        // Most models have position_ids->data<int64_t>()[0] == 0 on the first infer,
+        // but gemma3 has it == 1.
+        // We need to store the original first position id in order to distinguish
+        // between the prefill and generate stages. While in most cases prefill is
+        // done only once, that is not true for chat mode, where prefill runs on
+        // each user input.
+        m_first_position_id = position_ids->data<int64_t>()[0];
+        m_first_run = false;
+    }
+
     // NB: Check the sequence length provided for input_ids
     // and start position idx in order to distinguish prefill
     // and generate stages.
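Note: with m_first_position_id captured above, the prefill/generate dispatch below boils down to a simple predicate. A hypothetical restatement for illustration (is_prefill_stage is not part of the patch):

    #include <cstdint>

    // seq_len         -- input_ids length along INPUT_IDS_SEQ_LEN_DIM
    // first_pos_id    -- position_ids[0] of the current request
    // model_first_pos -- position_ids[0] captured on the very first infer()
    //                    (0 for most models, 1 for gemma3)
    bool is_prefill_stage(int64_t seq_len, int64_t first_pos_id, int64_t model_first_pos) {
        return seq_len > 1 && first_pos_id == model_first_pos;
    }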
@@ -940,11 +1011,11 @@ void ov::npuw::LLMInferRequest::infer() {
     // The outcome of two items is that prefill and generate stages
     // can be safely differentiated by start position id for
     // both main and draft models.
-    if (input_ids->get_shape()[INPUT_IDS_SEQ_LEN_DIM] > 1 && position_ids->data<int64_t>()[0] == 0) {
-        infer_prefill(input_ids, attention_mask, position_ids);
+    if (input_ids->get_shape()[INPUT_IDS_SEQ_LEN_DIM] > 1 && position_ids->data<int64_t>()[0] == m_first_position_id) {
+        infer_prefill(input_ids, attention_mask, position_ids, token_type_ids);
     } else {
         trim_kvcache_for_speculative_decoding(position_ids);
-        infer_generate(input_ids, attention_mask, position_ids);
+        infer_generate(input_ids, attention_mask, position_ids, token_type_ids);
     }
 }