
Commit 4eadb95

[NPUW] Enable MoE (GPT-OSS-20B) on NPU - HOST_ROUTED. (#33372)
### Details:
Enable GPT-OSS-20B on NPU.

**Prefill:**
- Subgraph isolation: 3 graphs per REPEAT (Router - Experts - Downstream)
- Transformation for experts: E_NUM x SEQ_LEN -> 1 x chunk_size (chunk_size is one of 16/32/64/128/256)
- Execution: process multiple tokens by iterating through the experts sequentially.
  - For each expert: process all tokens assigned to it (potentially in chunks)
  - Use dynamic chunk sizing (256/128/64/32/16) based on the remaining token count (sketched below)
  - Accumulate expert outputs into a global buffer

**Decoding:**
- Subgraph isolation: 3 graphs per REPEAT (Router - Experts - Downstream)
- Transformation for experts: E_NUM x 1 -> E_ACT_NUM x 1 -> unroll active experts for better performance
- Execution: process one token with K active experts in a single batched inference.
  - Set all K expert weights at once (batch unrolling)
  - Set K router scores (one per expert)
  - Execute a single inference that processes all K experts in one submission

**File structure:**
```
src/plugins/intel_npu/src/plugin/npuw/
├── moe/                                # MoE module (NEW)
│   ├── moe_config.hpp                  # Configuration data structures
│   ├── moe_types.hpp                   # Type definitions (MoEIO)
│   ├── moe_infer_utils.hpp/cpp         # Utility functions & RequestCache
│   ├── moe_resources.hpp/cpp           # Resource management
│   └── moe_executor.hpp/cpp            # Core execution logic
├── just_sync_infer_request.hpp/cpp     # Integration point
├── compiled_model.hpp                  # Model metadata (CompiledModelDesc)
└── moe_transformations/                # MoE compilation and transformations
    ├── moe_transformation.hpp/cpp
    └── moe_unroll_patterns.hpp/cpp
```

The model can be validated with the config below:
```
{
    "NPUW_DEVICES" : "NPU",
    "MAX_PROMPT_LEN" : 1024,
    "NPUW_MOE_TOKEN_CHUNK_SIZE" : 0,
    "NPUW_LLM_GENERATE_MOE_HINT" : "HOST_ROUTED",
    "NPUW_F16IC" : "YES",
    "NPUW_LLM_OPTIMIZE_V_TENSORS" : "YES",
    "NPU_TURBO" : "YES",
    "NPUW_DUMP_SUBS" : "YES",
    "NPUW_DUMP_IO" : "NO",
    "NPUW_MOE_POOL_SIZE" : 8
}
```

### Tickets:
- *[EISW-190615](https://jira.devtools.intel.com/browse/EISW-190615)*

---------

Signed-off-by: intelgaoxiong <xiong.gao@intel.com>
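The dynamic chunk sizing described for prefill can be illustrated with a minimal standalone sketch. This is not code from the commit (the actual logic lives in `moe/moe_executor.hpp/cpp`); `kChunkSizes`, `select_chunk`, and `run_expert_prefill` are hypothetical names used only here.

```cpp
#include <algorithm>
#include <array>
#include <cstddef>

// Compiled expert-model chunk sizes, largest first (the 256/128/64/32/16 set
// compiled when NPUW_MOE_TOKEN_CHUNK_SIZE == 0).
constexpr std::array<std::size_t, 5> kChunkSizes{256, 128, 64, 32, 16};

// Pick the largest compiled chunk that fits the remaining tokens; when fewer
// than 16 tokens remain, fall back to the smallest chunk and pad the tail.
std::size_t select_chunk(std::size_t tokens_left) {
    for (std::size_t chunk : kChunkSizes) {
        if (chunk <= tokens_left) {
            return chunk;
        }
    }
    return kChunkSizes.back();
}

// Per-expert prefill loop: consume all tokens routed to one expert in chunks,
// accumulating each chunk's output into the global buffer.
void run_expert_prefill(std::size_t tokens_for_expert) {
    std::size_t done = 0;
    while (done < tokens_for_expert) {
        const std::size_t chunk = select_chunk(tokens_for_expert - done);
        // infer_and_accumulate(done, chunk);  // hypothetical inference call
        done += std::min(chunk, tokens_for_expert - done);
    }
}
```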
1 parent 367ad56 commit 4eadb95

34 files changed (+6488, -38 lines)

src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp

Lines changed: 59 additions & 0 deletions
```diff
@@ -118,6 +118,8 @@ DEFINE_OPT(NPUW_SPATIAL, bool, false, npuw::partitioning::spatial, RunTime);
 DEFINE_OPT(NPUW_F16IC, bool, true, npuw::partitioning::f16_interconnect, RunTime);
 DEFINE_OPT(NPUW_SPATIAL_NWAY, std::size_t, 128, npuw::partitioning::spatial_nway, RunTime);
 DEFINE_OPT(NPUW_SPATIAL_DYN, bool, true, npuw::partitioning::spatial_dyn, RunTime);
+DEFINE_OPT(NPUW_MOE_TOKEN_CHUNK_SIZE, uint64_t, 0, npuw::partitioning::moe_token_chunk_size, RunTime);
+DEFINE_OPT(NPUW_MOE_POOL_SIZE, std::size_t, 8, npuw::partitioning::moe_pool_size, RunTime);
 DEFINE_OPT(NPUW_ATTN, std::string, "STATIC", npuw::partitioning::attn, RunTime);
 DEFINE_OPT(NPUW_ATTN_DYN, bool, true, npuw::partitioning::attn_dyn, RunTime);
 DEFINE_OPT(NPUW_ATTN_NO_COPY, bool, false, npuw::partitioning::attn_no_copy, RunTime);
@@ -172,6 +174,7 @@ namespace llm {
 enum class PrefillHint { DYNAMIC, STATIC };
 enum class GenerateHint { FAST_COMPILE, BEST_PERF };
 enum class AttentionHint { DYNAMIC, STATIC, PYRAMID, HFA };
+enum class MoEHint { DENSE, HOST_ROUTED, DEVICE_ROUTED };
 }  // namespace llm
 }  // namespace npuw
@@ -279,6 +282,62 @@ struct NPUW_LLM_PREFILL_ATTENTION_HINT final : ATTN_HINT_BASE {
     }
 };
 
+struct MOE_HINT_BASE : OptionBase<MOE_HINT_BASE, ::intel_npu::npuw::llm::MoEHint> {
+    static constexpr std::string_view getTypeName() {
+        return "::intel_npu::npuw::llm::MoEHint";
+    }
+
+    static ::intel_npu::npuw::llm::MoEHint defaultValue() {
+        return ::intel_npu::npuw::llm::MoEHint::HOST_ROUTED;
+    }
+
+    static ::intel_npu::npuw::llm::MoEHint parse(std::string_view val) {
+        if (val == "DENSE") {
+            return ::intel_npu::npuw::llm::MoEHint::DENSE;
+        } else if (val == "HOST_ROUTED") {
+            return ::intel_npu::npuw::llm::MoEHint::HOST_ROUTED;
+        } else if (val == "DEVICE_ROUTED") {
+            return ::intel_npu::npuw::llm::MoEHint::DEVICE_ROUTED;
+        }
+        OPENVINO_THROW("Unsupported MoE hint provided: ", val);
+        return {};
+    }
+
+    static std::string toString(const ::intel_npu::npuw::llm::MoEHint& val) {
+        switch (val) {
+        case ::intel_npu::npuw::llm::MoEHint::DENSE:
+            return "DENSE";
+        case ::intel_npu::npuw::llm::MoEHint::HOST_ROUTED:
+            return "HOST_ROUTED";
+        case ::intel_npu::npuw::llm::MoEHint::DEVICE_ROUTED:
+            return "DEVICE_ROUTED";
+        default:
+            OPENVINO_THROW("Can't convert provided MoE hint : ", int(val), " to string.");
+        }
+        return {};
+    }
+
+    static OptionMode mode() {
+        return OptionMode::RunTime;
+    }
+
+    static bool isPublic() {
+        return false;
+    }
+};
+
+struct NPUW_LLM_PREFILL_MOE_HINT final : MOE_HINT_BASE {
+    static std::string_view key() {
+        return ov::intel_npu::npuw::llm::prefill_moe_hint.name();
+    }
+};
+
+struct NPUW_LLM_GENERATE_MOE_HINT final : MOE_HINT_BASE {
+    static std::string_view key() {
+        return ov::intel_npu::npuw::llm::generate_moe_hint.name();
+    }
+};
+
 struct NPUW_LLM_GENERATE_HINT final : OptionBase<NPUW_LLM_GENERATE_HINT, ::intel_npu::npuw::llm::GenerateHint> {
     static std::string_view key() {
         return ov::intel_npu::npuw::llm::generate_hint.name();
```
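For reference, the parse/toString contract added above round-trips the three string values and throws on anything else. Below is a self-contained mirror of that behavior, not the commit's code: the enum is duplicated locally so the snippet compiles on its own, and a plain exception stands in for OPENVINO_THROW.

```cpp
#include <iostream>
#include <stdexcept>
#include <string>
#include <string_view>

// Local copy of ::intel_npu::npuw::llm::MoEHint for a standalone example.
enum class MoEHint { DENSE, HOST_ROUTED, DEVICE_ROUTED };

MoEHint parse(std::string_view val) {
    if (val == "DENSE") return MoEHint::DENSE;
    if (val == "HOST_ROUTED") return MoEHint::HOST_ROUTED;
    if (val == "DEVICE_ROUTED") return MoEHint::DEVICE_ROUTED;
    throw std::invalid_argument("Unsupported MoE hint: " + std::string(val));
}

std::string to_string(MoEHint val) {
    switch (val) {
    case MoEHint::DENSE:         return "DENSE";
    case MoEHint::HOST_ROUTED:   return "HOST_ROUTED";
    case MoEHint::DEVICE_ROUTED: return "DEVICE_ROUTED";
    }
    throw std::invalid_argument("Unknown MoE hint value");
}

int main() {
    // Round trip: string -> enum -> string; unknown strings throw, matching
    // the OPENVINO_THROW path in the option above.
    std::cout << to_string(parse("HOST_ROUTED")) << "\n";  // prints HOST_ROUTED
}
```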

src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp

Lines changed: 52 additions & 0 deletions
```diff
@@ -248,6 +248,39 @@ static constexpr ov::Property<std::size_t> spatial_nway{"NPUW_SPATIAL_NWAY"};
  */
 static constexpr ov::Property<bool> spatial_dyn{"NPUW_SPATIAL_DYN"};
 
+/**
+ * @brief
+ * Type: uint64_t.
+ * MoE expert model compilation strategy for prefill-stage token chunking.
+ *
+ * When set to 0 (default): compiles multiple expert models with different chunk sizes
+ * {16, 32, 64, 128, 256} for dynamic chunk selection at runtime. This provides optimal
+ * performance by selecting the best chunk size based on the remaining token count.
+ *
+ * When set to a specific value (e.g., 128 or 256): compiles only a single expert model
+ * with the specified fixed chunk size. This reduces compilation time and memory usage but
+ * may not be optimal for all token counts.
+ *
+ * The chunk size should be a power of two for best hardware utilization.
+ * Note: this only affects the prefill stage (multi-token inference). The decoding stage
+ * (single token) always uses a dedicated model regardless of this setting.
+ *
+ * Default value: 0 (dynamic multi-model compilation).
+ */
+static constexpr ov::Property<uint64_t> moe_token_chunk_size{"NPUW_MOE_TOKEN_CHUNK_SIZE"};
+
+/**
+ * @brief Configure the MoE request pool size per layer for caching expert configurations.
+ *
+ * Controls the number of pre-allocated inference requests per MoE layer used to cache
+ * different expert combinations; LRU eviction is applied when the pool is full.
+ * Setting it to 0 disables the MoE request cache entirely.
+ *
+ * Type: std::size_t.
+ * Default value: 8 (cache up to 8 expert configurations per layer).
+ */
+static constexpr ov::Property<std::size_t> moe_pool_size{"NPUW_MOE_POOL_SIZE"};
+
 /**
  * @brief
  * Type: std::string.
@@ -433,6 +466,7 @@ static constexpr ov::Property<std::string> inputs_outputs{"NPUW_DUMP_IO"};
  * Default value: false.
  */
 static constexpr ov::Property<std::string> io_iters{"NPUW_DUMP_IO_ITERS"};
+
 }  // namespace dump
 
 namespace llm {
@@ -511,6 +545,24 @@ static constexpr ov::Property<bool> optimize_v_tensors{"NPUW_LLM_OPTIMIZE_V_TENS
  */
 static constexpr ov::Property<bool> cache_rope{"NPUW_LLM_CACHE_ROPE"};
 
+/**
+ * @brief
+ * Type: ::intel_npu::npuw::llm::MoEHint
+ * Specify the MoE (Mixture of Experts) implementation strategy for the prefill stage.
+ * Possible values: DENSE, HOST_ROUTED, DEVICE_ROUTED
+ * Default value: HOST_ROUTED (recommended for prefill to avoid NPU-unfriendly operations)
+ */
+static constexpr ov::Property<std::string> prefill_moe_hint{"NPUW_LLM_PREFILL_MOE_HINT"};
+
+/**
+ * @brief
+ * Type: ::intel_npu::npuw::llm::MoEHint
+ * Specify the MoE (Mixture of Experts) implementation strategy for the generate/decoding stage.
+ * Possible values: DENSE, HOST_ROUTED, DEVICE_ROUTED
+ * Default value: HOST_ROUTED (DEVICE_ROUTED is recommended for better decoding performance)
+ */
+static constexpr ov::Property<std::string> generate_moe_hint{"NPUW_LLM_GENERATE_MOE_HINT"};
+
 /**
  * @brief
  * Type: boolean
```
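To show how these header-declared properties are consumed, here is a hedged usage sketch: the property names come from this diff, while the model path and surrounding setup are illustrative and not part of the commit.

```cpp
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    // Illustrative model path; any MoE LLM prepared for NPUW would go here.
    auto model = core.read_model("gpt-oss-20b.xml");

    // MoE-related NPUW options are passed like any other OpenVINO property.
    ov::AnyMap config{
        {"NPUW_MOE_TOKEN_CHUNK_SIZE", "0"},            // 0 = compile all chunk sizes
        {"NPUW_MOE_POOL_SIZE", "8"},                   // per-layer request cache, LRU-evicted
        {"NPUW_LLM_PREFILL_MOE_HINT", "HOST_ROUTED"},
        {"NPUW_LLM_GENERATE_MOE_HINT", "HOST_ROUTED"},
    };
    auto compiled = core.compile_model(model, "NPU", config);
    return 0;
}
```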

src/plugins/intel_npu/src/al/src/config/npuw.cpp

Lines changed: 4 additions & 0 deletions
```diff
@@ -33,6 +33,8 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) {
     desc.add<NPUW_SPATIAL>();
     desc.add<NPUW_SPATIAL_NWAY>();
     desc.add<NPUW_SPATIAL_DYN>();
+    desc.add<NPUW_MOE_TOKEN_CHUNK_SIZE>();
+    desc.add<NPUW_MOE_POOL_SIZE>();
     desc.add<NPUW_ATTN>();
     desc.add<NPUW_ATTN_DYN>();
     desc.add<NPUW_ATTN_NO_COPY>();
@@ -69,6 +71,8 @@ void intel_npu::registerNPUWLLMOptions(OptionsDesc& desc) {
     desc.add<NPUW_LLM_MAX_LORA_RANK>();
     desc.add<NPUW_LLM_OPTIMIZE_V_TENSORS>();
    desc.add<NPUW_LLM_CACHE_ROPE>();
+    desc.add<NPUW_LLM_PREFILL_MOE_HINT>();
+    desc.add<NPUW_LLM_GENERATE_MOE_HINT>();
     desc.add<NPUW_LLM_GENERATE_PYRAMID>();
     desc.add<NPUW_LLM_PREFILL_CHUNK_SIZE>();
     desc.add<NPUW_LLM_ENABLE_PREFIX_CACHING>();
```

src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp

Lines changed: 12 additions & 0 deletions
```diff
@@ -420,6 +420,12 @@ void ov::npuw::IBaseInferRequest::unpack_closure(std::size_t idx, RqPtr request)
     const auto real_idx = comp_model_desc.replaced_by.value();
     auto& func_desc = m_npuw_model->m_compiled_submodels[real_idx];
 
+    // Skip MoE expert submodels - MoE experts require special unpacking logic according to the
+    // expert selection, which is handled later in the inference flow.
+    if (func_desc.moe_experts.has_value()) {
+        return;
+    }
+
     // Bind extra parameters from the function's closure
     // First, do easy things & delay heavy stuff
     std::vector<std::size_t> closure_unpack_required;
@@ -506,6 +512,12 @@ void ov::npuw::IBaseInferRequest::bind_global_params(std::size_t idx, RqPtr requ
     const auto& iodesc = m_subrequests_gio.at(idx);
 
     const auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx];
+
+    if (proto_comp_model_desc.moe_experts.has_value()) {
+        // Expert submodel does not have global parameters to bind
+        return;
+    }
+
     const bool is_spatial = proto_comp_model_desc.spatial.has_value();
     const bool is_attention = proto_comp_model_desc.attention.has_value();
     const bool is_pyramid_attention = proto_comp_model_desc.pyramid_attention.has_value();
```
