Skip to content

Commit 81b1843

Browse files
[NPUW] Initial support for Gemma3 on NPU (#32102)
* Fixed the logic of switching between the prefill and generate stages for gemma3 * Added support for a new input - `token_type_ids` * Added GemmaSlidingMask, which searches for the sliding mask pattern, replaces it with an input, and extracts the sliding window size from it. Gemma3 explanation: https://developers.googleblog.com/en/gemma-explained-whats-new-in-gemma-3/ Targeted sliding mask subgraph: <img width="325" height="456" alt="sliding_mask_subgraph" src="https://github.com/user-attachments/assets/86c6fb7c-7bab-407e-b277-73ecdc07fcf2" /> It basically implements the following check: y - sliding_window_size < x <= y Related OpenVINO GenAI PR: [[NPUW] Disable chunking and F16IC for gemma3 as they are not supported currently](openvinotoolkit/openvino.genai#2800)
1 parent f0e1e64 commit 81b1843

File tree

5 files changed

+229
-15
lines changed

5 files changed

+229
-15
lines changed

src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,95 @@ class GroupQueryAttentionDecomposition : public ov::pass::MatcherPass {
303303
}
304304
};
305305

306+
class GemmaSlidingMask : public ov::pass::MatcherPass {
307+
public:
308+
OPENVINO_MATCHER_PASS_RTTI("npuw::LLMCompiledModel::GemmaSlidingMask");
309+
310+
struct Result {
311+
Result() = default;
312+
313+
bool found = false;
314+
int32_t window_size = 0;
315+
std::shared_ptr<ov::op::v0::Parameter> mask_input;
316+
};
317+
318+
explicit GemmaSlidingMask(Result* result) {
319+
// Searching for gemma sliding mask pattern and replace it's output
320+
// with Paramater of the same size and type.
321+
/* -\
322+
range_w -> unsqueeze -> unsqueeze -> unsqueeze1 -> convert -\/ => LessEqual -\
323+
\ /\-/ \
324+
/-----\----/ =>BWAnd_res
325+
/ \ /
326+
range_h -> unsqueeze -> unsqueeze -> unsqueeze2 -> add -=> Greater -> BWAnd --/
327+
const (-window_size) ----/
328+
*/
329+
// Basically this subgrapgh is doing the following:
330+
// range_w is range (0, ..., width - 1) (probably + something)
331+
// renge_h is range (0, ..., height - 1) (probably + something)
332+
// And then doing the following check:
333+
// y - window_size < x <= y
334+
// Producing squared sliding mask:
335+
// 1 0 0 0 0 0
336+
// 1 1 0 0 0 0
337+
// 1 1 1 0 0 0
338+
// 0 1 1 1 0 0
339+
// 0 0 1 1 1 0
340+
// 0 0 0 1 1 1
341+
//
342+
// Please also note, that sliding windows size is stored as negative value and the
343+
// subgraph is actually doing:
344+
// y + (negative)window_size < x <= y
345+
346+
auto range_sequence = [&]() {
347+
auto range = opp::wrap_type<ov::op::v4::Range>({opp::any_input(), opp::any_input(), opp::any_input()});
348+
auto unsqueeze1 = opp::wrap_type<ov::op::v0::Unsqueeze>({range, opp::any_input()});
349+
auto unsqueeze2 = opp::wrap_type<ov::op::v0::Unsqueeze>({unsqueeze1, opp::any_input()});
350+
auto unsqueeze3 = opp::wrap_type<ov::op::v0::Unsqueeze>({unsqueeze2, opp::any_input()});
351+
352+
return unsqueeze3;
353+
};
354+
355+
auto unsqueeze1 = range_sequence();
356+
auto convert = opp::wrap_type<ov::op::v0::Convert>({unsqueeze1});
357+
auto unsqueeze2 = range_sequence();
358+
auto window_size = opp::wrap_type<ov::op::v0::Constant>();
359+
auto add = opp::wrap_type<ov::op::v1::Add>({unsqueeze2, window_size});
360+
auto greater = opp::wrap_type<ov::op::v1::Greater>({convert, add});
361+
auto bwand = opp::wrap_type<ov::op::v13::BitwiseAnd>({opp::any_input(), greater});
362+
auto less_equal = opp::wrap_type<ov::op::v1::LessEqual>({convert, unsqueeze2});
363+
auto bwand_res = opp::wrap_type<ov::op::v13::BitwiseAnd>({bwand, less_equal});
364+
365+
auto callback = [=](ov::pass::pattern::Matcher& m) {
366+
auto& node_to_output = m.get_pattern_value_map();
367+
auto* bwand_matched_node = node_to_output.at(bwand_res).get_node();
368+
auto* window_size_node = node_to_output.at(window_size).get_node();
369+
auto output = bwand_matched_node->output(0);
370+
auto target_inputs = output.get_target_inputs();
371+
372+
auto* window_size_constant = static_cast<ov::op::v0::Constant*>(window_size_node);
373+
OPENVINO_ASSERT(window_size_constant->get_output_size() == 1,
374+
"Sliding window size constant must be of size 1, but got " +
375+
std::to_string(window_size_constant->get_output_size()));
376+
OPENVINO_ASSERT(!result->found, "Second gemma sliding mask pattern found, what is unexpected!");
377+
378+
auto input = std::make_shared<ov::op::v0::Parameter>(output.get_element_type(), output.get_partial_shape());
379+
input->set_friendly_name(ov::npuw::LLMInferRequest::layer_names::gemma_sliding_mask);
380+
output.replace(input->output(0));
381+
382+
auto window_size_vec = window_size_constant->cast_vector<int32_t>(1);
383+
384+
result->found = true;
385+
// since we are doing Add and need to do subtract window size is stored as negative value
386+
result->window_size = -window_size_vec[0];
387+
result->mask_input = input;
388+
389+
return true;
390+
};
391+
register_matcher(std::make_shared<opp::Matcher>(bwand_res, "GemmaSlidingMask"), std::move(callback));
392+
}
393+
};
394+
306395
namespace {
307396
uint32_t align_to(uint32_t value, uint32_t alignment) {
308397
return (value + alignment - 1) & ~(alignment - 1);
@@ -468,6 +557,8 @@ void reshape_to_static(std::shared_ptr<ov::Model> model,
468557
ov::PartialShape new_shape;
469558
if (input_name.find("input_ids") != std::string::npos) {
470559
new_shape = ov::PartialShape({1, input_size});
560+
} else if (input_name.find("token_type_ids") != std::string::npos) {
561+
new_shape = ov::PartialShape({1, input_size});
471562
} else if (input_name.find("inputs_embeds") != std::string::npos) {
472563
// NB: VLMs case, model accepts inputs_embeds[BATCH, SEQ_LEN, EMB_SIZE]
473564
NPUW_ASSERT(input.get_partial_shape().size() == 3u);
@@ -785,6 +876,41 @@ void ov::npuw::LLMCompiledModel::convert_stateful_lora_to_stateless(std::shared_
785876
model->add_parameters(new_parameters);
786877
}
787878

879+
// Applies Gemma3-specific model transformations: detects the sliding-mask
// subgraph, replaces it with a dedicated Parameter input and records the
// extracted sliding window size in m_gemma_sliding_window_size.
void ov::npuw::LLMCompiledModel::gemma_transformations(const std::shared_ptr<ov::Model>& model) {
    // For now only do transformations for gemma3 which has token_type_ids input.
    bool token_type_ids_found = false;
    for (const auto& input : model->inputs()) {
        const auto& input_name = input.get_any_name();
        if (input_name.find("token_type_ids") != std::string::npos) {
            token_type_ids_found = true;
            break;
        }
    }

    if (!token_type_ids_found) {
        return;
    }

    // FIX: no need to heap-allocate the small Result struct — keep it on the
    // stack; also renamed to follow the file's snake_case local naming.
    GemmaSlidingMask::Result rewrite_result;
    ov::pass::GraphRewrite rewr;
    rewr.add_matcher<GemmaSlidingMask>(&rewrite_result);
    rewr.run_on_model(model);

    if (!rewrite_result.found) {
        return;
    }

    OPENVINO_ASSERT(rewrite_result.window_size > 0,
                    "Gemma sliding window size must be strictly positive, but got " +
                        std::to_string(rewrite_result.window_size));

    m_gemma_sliding_window_size = rewrite_result.window_size;
    auto mask_input = rewrite_result.mask_input;
    model->add_parameters({mask_input});
    // The freshly added Parameter has no tensor names yet — give it one so the
    // infer request can address it by layer name.
    for (auto&& input : model->inputs()) {
        if (input.get_node() == mask_input.get()) {
            input.set_names({mask_input->get_friendly_name()});
        }
    }
    model->validate_nodes_and_infer_types();
}
913+
788914
ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& model,
789915
const std::shared_ptr<const ov::IPlugin>& plugin,
790916
const ov::AnyMap& properties)
@@ -910,6 +1036,8 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
9101036
m_kvcache_desc.total_size,
9111037
axes,
9121038
m_max_lora_rank);
1039+
gemma_transformations(kvcache_model);
1040+
9131041
if (lm_head_model) {
9141042
LOG_DEBUG("Shared LM head: slice the prefill output");
9151043
// KVCache model is already reshaped to [1, max_generation_token_len, embed size],
@@ -1147,6 +1275,7 @@ void ov::npuw::LLMCompiledModel::serialize(std::ostream& stream, const ov::npuw:
11471275
write(model_stream, m_prefill_chunk_size);
11481276
write(model_stream, m_use_chunk_prefill);
11491277
write(model_stream, m_max_lora_rank);
1278+
write(model_stream, m_gemma_sliding_window_size);
11501279

11511280
// Write config
11521281
write(model_stream, m_cfg);
@@ -1357,6 +1486,7 @@ std::shared_ptr<ov::npuw::LLMCompiledModel> ov::npuw::LLMCompiledModel::deserial
13571486
read(model_stream, compiled->m_prefill_chunk_size);
13581487
read(model_stream, compiled->m_use_chunk_prefill);
13591488
read(model_stream, compiled->m_max_lora_rank);
1489+
read(model_stream, compiled->m_gemma_sliding_window_size);
13601490

13611491
// Deserialize config
13621492
read(model_stream, compiled->m_cfg);

src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,9 @@ class LLMCompiledModel : public ov::npuw::ICompiledModel {
8080
// Support LoRA
8181
void convert_stateful_lora_to_stateless(std::shared_ptr<ov::Model>& model);
8282
uint32_t m_max_lora_rank = 32;
83+
84+
void gemma_transformations(const std::shared_ptr<ov::Model>& model);
85+
int32_t m_gemma_sliding_window_size = 0;
8386
};
8487

8588
} // namespace npuw

src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp

Lines changed: 80 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,27 @@ std::pair<uint32_t, uint32_t> get_lora_dims_by_name(const std::string& state_nam
274274
return std::make_pair(low_rank_dim, full_rank_dim);
275275
}
276276

277+
// Copies the whole content of `src` into the tail (rightmost bytes) of `dst`,
// leaving the leading bytes of `dst` untouched. `dst` must be at least as
// large as `src`.
void copy_to_right(const ov::SoPtr<ov::ITensor>& src, const ov::SoPtr<ov::ITensor>& dst) {
    const auto src_bytes = src->get_byte_size();
    const auto dst_bytes = dst->get_byte_size();
    OPENVINO_ASSERT(src_bytes <= dst_bytes);

    auto* src_begin = reinterpret_cast<uint8_t*>(src->data());
    auto* dst_tail = reinterpret_cast<uint8_t*>(dst->data()) + (dst_bytes - src_bytes);
    std::copy_n(src_begin, src_bytes, dst_tail);
}
283+
284+
// Fills `mask` with the sliding-window visibility pattern for the token at
// `curr_pos`: positions i with curr_pos - window_size < i < curr_pos are set
// to true, everything else to false. The last element of the buffer holds the
// freshly generated token and is always set to true.
void fill_sliding_mask(const ov::SoPtr<ov::ITensor>& mask, int64_t curr_pos, int64_t window_size) {
    const auto mask_size = static_cast<int64_t>(mask->get_size());
    // FIX: guard against an empty tensor — the unconditional write to the
    // last element below would otherwise index out of bounds.
    OPENVINO_ASSERT(mask_size > 0, "Sliding mask tensor must not be empty");

    const auto start = curr_pos - window_size;
    const auto end = curr_pos;

    auto* mask_data = mask->data<bool>();
    for (int64_t i = 0; i < mask_size; ++i) {
        // Unlike the original subgraph which does i <= end, we are excluding end
        // as it is a new token and is located in the last position of the mask buffer.
        mask_data[i] = i > start && i < end;
    }

    // The token currently being generated always attends to itself.
    mask_data[mask_size - 1] = true;
}
297+
277298
constexpr uint32_t INPUT_IDS_SEQ_LEN_DIM = 1;
278299

279300
constexpr std::size_t kStartOutputKVCacheLayers = 1;
@@ -380,6 +401,7 @@ ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCo
380401
}
381402

382403
m_generate_initialized = false;
404+
m_gemma_sliding_window_size = compiled_model->m_gemma_sliding_window_size;
383405
}
384406

385407
void ov::npuw::LLMInferRequest::init_tensor(const ov::Output<const ov::Node>& port) {
@@ -498,6 +520,10 @@ void ov::npuw::LLMInferRequest::apply_lora() {
498520

499521
void ov::npuw::LLMInferRequest::prepare_for_new_conversation() {
500522
fill_tensor_bytes(m_prefill_request->get_tensor(m_prefill_in_ports.at(m_input_ids_name)), 0u);
523+
if (auto type_ids_port = m_prefill_in_ports.find(layer_names::token_type_ids);
524+
type_ids_port != m_prefill_in_ports.end()) {
525+
fill_tensor_bytes(m_prefill_request->get_tensor(type_ids_port->second), 0u);
526+
}
501527
fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at(layer_names::attention_mask)), 0);
502528
fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at(layer_names::position_ids)), 0);
503529
m_npuw_llm_compiled_model->m_kvcache_desc.num_stored_tokens = 0u;
@@ -586,8 +612,8 @@ void ov::npuw::LLMInferRequest::copy_kvcache() {
586612

587613
void ov::npuw::LLMInferRequest::update_kvcache_for(
588614
std::shared_ptr<ov::IAsyncInferRequest> request,
589-
std::unordered_map<std::string, ov::Output<const ov::Node>> in_ports,
590-
std::unordered_map<std::string, ov::Output<const ov::Node>> out_ports,
615+
const std::unordered_map<std::string, ov::Output<const ov::Node>>& in_ports,
616+
const std::unordered_map<std::string, ov::Output<const ov::Node>>& out_ports,
591617
uint32_t num_tokens,
592618
bool v_transposed) {
593619
LOG_DEBUG("Store computed key and values for passed number of tokens in the input kv-cache"
@@ -750,7 +776,8 @@ void ov::npuw::LLMInferRequest::infer_chunked_prefill(ov::SoPtr<ov::ITensor> inp
750776

751777
void ov::npuw::LLMInferRequest::infer_whole_prefill(ov::SoPtr<ov::ITensor> input_ids,
752778
ov::SoPtr<ov::ITensor> attention_mask,
753-
ov::SoPtr<ov::ITensor> position_ids) {
779+
ov::SoPtr<ov::ITensor> position_ids,
780+
ov::SoPtr<ov::ITensor> token_type_ids) {
754781
LOG_DEBUG("Calling inference for prefill model in a single launch.");
755782
LOG_BLOCK();
756783

@@ -767,6 +794,13 @@ void ov::npuw::LLMInferRequest::infer_whole_prefill(ov::SoPtr<ov::ITensor> input
767794
attention_mask->get_size(),
768795
padded_attention_mask->data<int64_t>() + padded_attention_mask->get_size() - attention_mask->get_size());
769796

797+
if (token_type_ids) {
798+
auto padded_token_type_ids = m_prefill_request->get_tensor(m_prefill_in_ports.at(layer_names::token_type_ids));
799+
800+
std::fill_n(reinterpret_cast<uint8_t*>(padded_token_type_ids->data()), token_type_ids->get_byte_size(), 0);
801+
copy_to_right(token_type_ids, padded_token_type_ids);
802+
}
803+
770804
auto padded_position_ids = m_prefill_request->get_tensor(m_prefill_in_ports.at(layer_names::position_ids));
771805
pad_position_ids(padded_position_ids, position_ids);
772806

@@ -779,7 +813,8 @@ void ov::npuw::LLMInferRequest::infer_whole_prefill(ov::SoPtr<ov::ITensor> input
779813

780814
void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr<ov::ITensor> input_ids,
781815
ov::SoPtr<ov::ITensor> attention_mask,
782-
ov::SoPtr<ov::ITensor> position_ids) {
816+
ov::SoPtr<ov::ITensor> position_ids,
817+
ov::SoPtr<ov::ITensor> token_type_ids) {
783818
LOG_DEBUG("Calling inference for prefill model...");
784819
LOG_BLOCK();
785820

@@ -795,9 +830,12 @@ void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr<ov::ITensor> input_ids,
795830

796831
const bool use_chunk_prefill = m_npuw_llm_compiled_model->m_use_chunk_prefill;
797832
if (use_chunk_prefill) {
833+
OPENVINO_ASSERT(m_gemma_sliding_window_size == 0,
834+
"Chunking is not implemented for Gemma model family yet. "
835+
"Please use set NPUW_LLM_PREFILL_HINT to 'STATIC'");
798836
infer_chunked_prefill(input_ids, attention_mask, position_ids);
799837
} else {
800-
infer_whole_prefill(input_ids, attention_mask, position_ids);
838+
infer_whole_prefill(input_ids, attention_mask, position_ids, token_type_ids);
801839
}
802840

803841
if (m_lm_head_request) {
@@ -815,7 +853,8 @@ void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr<ov::ITensor> input_ids,
815853

816854
void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
817855
ov::SoPtr<ov::ITensor> attention_mask,
818-
ov::SoPtr<ov::ITensor> position_ids) {
856+
ov::SoPtr<ov::ITensor> position_ids,
857+
ov::SoPtr<ov::ITensor> token_type_ids) {
819858
LOG_DEBUG("Calling inference for generate model...");
820859
LOG_BLOCK();
821860
auto& kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc;
@@ -834,6 +873,9 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
834873
fill_tensor_bytes(m_kvcache_request->get_tensor(m_kvcache_in_ports.at(m_input_ids_name)), 0u);
835874
fill_tensor<int64_t>(m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::attention_mask)), 0);
836875
fill_tensor<int64_t>(m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::position_ids)), 0);
876+
if (token_type_ids) {
877+
fill_tensor<int64_t>(m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::token_type_ids)), 0);
878+
}
837879
m_generate_initialized = true;
838880
}
839881

@@ -842,6 +884,14 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
842884
OPENVINO_THROW("KV-Cache is full.");
843885
}
844886

887+
if (auto sliding_mask_port = m_kvcache_in_ports.find(layer_names::gemma_sliding_mask);
888+
sliding_mask_port != m_kvcache_in_ports.end()) {
889+
// TODO: Fill once and update on each iteration instead
890+
fill_sliding_mask(m_kvcache_request->get_tensor(sliding_mask_port->second),
891+
kvcache_desc.num_stored_tokens + input_tokens_len,
892+
m_gemma_sliding_window_size);
893+
}
894+
845895
// FIXME: these tensors should be shared between the parent & child models
846896
// NB: input_ids can be either fp32(VLM) or i64(LLM)
847897
auto kv_input_ids = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(m_input_ids_name));
@@ -854,6 +904,11 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
854904
input_ids->get_byte_size(),
855905
reinterpret_cast<uint8_t*>(kv_input_ids->data()) + kv_input_ids->get_byte_size() - input_ids->get_byte_size());
856906

907+
if (token_type_ids) {
908+
auto kv_token_type_ids = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::token_type_ids));
909+
copy_to_right(token_type_ids, kv_token_type_ids);
910+
}
911+
857912
// NOTE: Attention mask pattern for generate model requires the set of "1"
858913
// units of length of the current prompt on the right (for present
859914
// kv layers) and the set of "1" units of number of previously calculated
@@ -912,12 +967,28 @@ void ov::npuw::LLMInferRequest::infer() {
912967
// FIXME: position_ids might be optional for some models!
913968
auto position_ids = get_tensor(find_port_by_name(inputs, layer_names::position_ids).value());
914969

970+
auto token_type_ids = ov::npuw::util::TensorPtr();
971+
972+
if (auto type_ids_port = find_port_by_name(inputs, layer_names::token_type_ids); type_ids_port.has_value()) {
973+
token_type_ids = get_tensor(type_ids_port.value());
974+
}
975+
915976
// NB: For VLM, the "inputs_embeds" contains float values (embeddings)
916977
OPENVINO_ASSERT(ov::element::f32 == input_ids->get_element_type() ||
917978
ov::element::i64 == input_ids->get_element_type());
918979
OPENVINO_ASSERT(ov::element::i64 == attention_mask->get_element_type());
919980
OPENVINO_ASSERT(ov::element::i64 == position_ids->get_element_type());
920981

982+
if (m_first_run) {
983+
// Most of the models have position_ids->data<int64_t>()[0] == 0 for the first infer
984+
// But gemma3 has it == 1
985+
// We need to store original first position id in order to distinguish between prefill and generate stage
986+
// While in most of the cases we need to do prefill only once, it is not true for chat mode
987+
// where we need to do prefill on each user input.
988+
m_first_position_id = position_ids->data<int64_t>()[0];
989+
m_first_run = false;
990+
}
991+
921992
// NB: Check the sequence length provided for input_ids
922993
// and start position idx in order to distinguish prefill
923994
// and generate stages.
@@ -940,11 +1011,11 @@ void ov::npuw::LLMInferRequest::infer() {
9401011
// The outcome of two items is that prefill and generate stages
9411012
// can be safely differentiated by start position id for
9421013
// both main and draft models.
943-
if (input_ids->get_shape()[INPUT_IDS_SEQ_LEN_DIM] > 1 && position_ids->data<int64_t>()[0] == 0) {
944-
infer_prefill(input_ids, attention_mask, position_ids);
1014+
if (input_ids->get_shape()[INPUT_IDS_SEQ_LEN_DIM] > 1 && position_ids->data<int64_t>()[0] == m_first_position_id) {
1015+
infer_prefill(input_ids, attention_mask, position_ids, token_type_ids);
9451016
} else {
9461017
trim_kvcache_for_speculative_decoding(position_ids);
947-
infer_generate(input_ids, attention_mask, position_ids);
1018+
infer_generate(input_ids, attention_mask, position_ids, token_type_ids);
9481019
}
9491020
}
9501021

0 commit comments

Comments
 (0)