
Commit 4eadb95

[NPUW] Enable MoE (GPT-OSS-20B) on NPU - HOST_ROUTED. (#33372)
### Details:
Enable GPT-OSS-20B on NPU.

**Prefill:**
- Subgraph isolation: 3 graphs per REPEAT (Router - Experts - Downstream)
- Transformation for experts: E_NUM x SEQ_LEN -> 1 x chunk_size (chunk_size is one of 16/32/64/128/256)
- Execution: process multiple tokens by iterating through the experts sequentially.
  - For each expert: process all tokens assigned to it (potentially in chunks)
  - Use dynamic chunk sizing (256/128/64/32/16) based on the remaining token count (sketched below)
  - Accumulate expert outputs into a global buffer

**Decoding:**
- Subgraph isolation: 3 graphs per REPEAT (Router - Experts - Downstream)
- Transformation for experts: E_NUM x 1 -> E_ACT_NUM x 1 -> unroll active experts for better performance
- Execution: process one token with K active experts in a single batched inference.
  - Set all K expert weights at once (batch unrolling)
  - Set K router scores (one per expert)
  - Execute a single inference that processes all K experts in one submission

**File structure:**
```
src/plugins/intel_npu/src/plugin/npuw/
├── moe/                                # MoE module (NEW)
│   ├── moe_config.hpp                  # Configuration data structures
│   ├── moe_types.hpp                   # Type definitions (MoEIO)
│   ├── moe_infer_utils.hpp/cpp         # Utility functions & RequestCache
│   ├── moe_resources.hpp/cpp           # Resource management
│   └── moe_executor.hpp/cpp            # Core execution logic
├── just_sync_infer_request.hpp/cpp     # Integration point
├── compiled_model.hpp                  # Model metadata (CompiledModelDesc)
└── moe_transformations/                # MoE compilation and transformations
    ├── moe_transformation.hpp/cpp
    └── moe_unroll_patterns.hpp/cpp
```

The model can be validated with the config below:
```
{
    "NPUW_DEVICES" : "NPU",
    "MAX_PROMPT_LEN" : 1024,
    "NPUW_MOE_TOKEN_CHUNK_SIZE" : 0,
    "NPUW_LLM_GENERATE_MOE_HINT" : "HOST_ROUTED",
    "NPUW_F16IC" : "YES",
    "NPUW_LLM_OPTIMIZE_V_TENSORS" : "YES",
    "NPU_TURBO" : "YES",
    "NPUW_DUMP_SUBS" : "YES",
    "NPUW_DUMP_IO" : "NO",
    "NPUW_MOE_POOL_SIZE" : 8
}
```

### Tickets:
- *[EISW-190615](https://jira.devtools.intel.com/browse/EISW-190615)*

---------

Signed-off-by: intelgaoxiong <xiong.gao@intel.com>
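The dynamic chunk sizing described for prefill can be illustrated with a minimal standalone sketch. This is not code from the commit (the actual logic lives in `moe/moe_executor.hpp/cpp`); `kChunkSizes`, `select_chunk`, and `run_expert_prefill` are hypothetical names used only here.

```cpp
#include <algorithm>
#include <array>
#include <cstddef>

// Compiled expert-model chunk sizes, largest first (the 256/128/64/32/16 set
// compiled when NPUW_MOE_TOKEN_CHUNK_SIZE == 0).
constexpr std::array<std::size_t, 5> kChunkSizes{256, 128, 64, 32, 16};

// Pick the largest compiled chunk that fits the remaining tokens; when fewer
// than 16 tokens remain, fall back to the smallest chunk and pad the tail.
std::size_t select_chunk(std::size_t tokens_left) {
    for (std::size_t chunk : kChunkSizes) {
        if (chunk <= tokens_left) {
            return chunk;
        }
    }
    return kChunkSizes.back();
}

// Per-expert prefill loop: consume all tokens routed to one expert in chunks,
// accumulating each chunk's output into the global buffer.
void run_expert_prefill(std::size_t tokens_for_expert) {
    std::size_t done = 0;
    while (done < tokens_for_expert) {
        const std::size_t chunk = select_chunk(tokens_for_expert - done);
        // infer_and_accumulate(done, chunk);  // hypothetical inference call
        done += std::min(chunk, tokens_for_expert - done);
    }
}
```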
1 parent 367ad56 commit 4eadb95

34 files changed (+6488, -38 lines)

src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp

Lines changed: 59 additions & 0 deletions
```diff
@@ -118,6 +118,8 @@ DEFINE_OPT(NPUW_SPATIAL, bool, false, npuw::partitioning::spatial, RunTime);
 DEFINE_OPT(NPUW_F16IC, bool, true, npuw::partitioning::f16_interconnect, RunTime);
 DEFINE_OPT(NPUW_SPATIAL_NWAY, std::size_t, 128, npuw::partitioning::spatial_nway, RunTime);
 DEFINE_OPT(NPUW_SPATIAL_DYN, bool, true, npuw::partitioning::spatial_dyn, RunTime);
+DEFINE_OPT(NPUW_MOE_TOKEN_CHUNK_SIZE, uint64_t, 0, npuw::partitioning::moe_token_chunk_size, RunTime);
+DEFINE_OPT(NPUW_MOE_POOL_SIZE, std::size_t, 8, npuw::partitioning::moe_pool_size, RunTime);
 DEFINE_OPT(NPUW_ATTN, std::string, "STATIC", npuw::partitioning::attn, RunTime);
 DEFINE_OPT(NPUW_ATTN_DYN, bool, true, npuw::partitioning::attn_dyn, RunTime);
 DEFINE_OPT(NPUW_ATTN_NO_COPY, bool, false, npuw::partitioning::attn_no_copy, RunTime);
@@ -172,6 +174,7 @@ namespace llm {
 enum class PrefillHint { DYNAMIC, STATIC };
 enum class GenerateHint { FAST_COMPILE, BEST_PERF };
 enum class AttentionHint { DYNAMIC, STATIC, PYRAMID, HFA };
+enum class MoEHint { DENSE, HOST_ROUTED, DEVICE_ROUTED };
 }  // namespace llm
 }  // namespace npuw
@@ -279,6 +282,62 @@ struct NPUW_LLM_PREFILL_ATTENTION_HINT final : ATTN_HINT_BASE {
     }
 };
 
+struct MOE_HINT_BASE : OptionBase<MOE_HINT_BASE, ::intel_npu::npuw::llm::MoEHint> {
+    static constexpr std::string_view getTypeName() {
+        return "::intel_npu::npuw::llm::MoEHint";
+    }
+
+    static ::intel_npu::npuw::llm::MoEHint defaultValue() {
+        return ::intel_npu::npuw::llm::MoEHint::HOST_ROUTED;
+    }
+
+    static ::intel_npu::npuw::llm::MoEHint parse(std::string_view val) {
+        if (val == "DENSE") {
+            return ::intel_npu::npuw::llm::MoEHint::DENSE;
+        } else if (val == "HOST_ROUTED") {
+            return ::intel_npu::npuw::llm::MoEHint::HOST_ROUTED;
+        } else if (val == "DEVICE_ROUTED") {
+            return ::intel_npu::npuw::llm::MoEHint::DEVICE_ROUTED;
+        }
+        OPENVINO_THROW("Unsupported MoE hint provided: ", val);
+        return {};
+    }
+
+    static std::string toString(const ::intel_npu::npuw::llm::MoEHint& val) {
+        switch (val) {
+        case ::intel_npu::npuw::llm::MoEHint::DENSE:
+            return "DENSE";
+        case ::intel_npu::npuw::llm::MoEHint::HOST_ROUTED:
+            return "HOST_ROUTED";
+        case ::intel_npu::npuw::llm::MoEHint::DEVICE_ROUTED:
+            return "DEVICE_ROUTED";
+        default:
+            OPENVINO_THROW("Can't convert provided MoE hint : ", int(val), " to string.");
+        }
+        return {};
+    }
+
+    static OptionMode mode() {
+        return OptionMode::RunTime;
+    }
+
+    static bool isPublic() {
+        return false;
+    }
+};
+
+struct NPUW_LLM_PREFILL_MOE_HINT final : MOE_HINT_BASE {
+    static std::string_view key() {
+        return ov::intel_npu::npuw::llm::prefill_moe_hint.name();
+    }
+};
+
+struct NPUW_LLM_GENERATE_MOE_HINT final : MOE_HINT_BASE {
+    static std::string_view key() {
+        return ov::intel_npu::npuw::llm::generate_moe_hint.name();
+    }
+};
+
 struct NPUW_LLM_GENERATE_HINT final : OptionBase<NPUW_LLM_GENERATE_HINT, ::intel_npu::npuw::llm::GenerateHint> {
     static std::string_view key() {
         return ov::intel_npu::npuw::llm::generate_hint.name();
```
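For reference, the parse/toString contract added above round-trips the three string values and throws on anything else. Below is a self-contained mirror of that behavior, not the commit's code: the enum is duplicated locally so the snippet compiles on its own, and a plain exception stands in for OPENVINO_THROW.

```cpp
#include <iostream>
#include <stdexcept>
#include <string>
#include <string_view>

// Local copy of ::intel_npu::npuw::llm::MoEHint for a standalone example.
enum class MoEHint { DENSE, HOST_ROUTED, DEVICE_ROUTED };

MoEHint parse(std::string_view val) {
    if (val == "DENSE") return MoEHint::DENSE;
    if (val == "HOST_ROUTED") return MoEHint::HOST_ROUTED;
    if (val == "DEVICE_ROUTED") return MoEHint::DEVICE_ROUTED;
    throw std::invalid_argument("Unsupported MoE hint: " + std::string(val));
}

std::string to_string(MoEHint val) {
    switch (val) {
    case MoEHint::DENSE:         return "DENSE";
    case MoEHint::HOST_ROUTED:   return "HOST_ROUTED";
    case MoEHint::DEVICE_ROUTED: return "DEVICE_ROUTED";
    }
    throw std::invalid_argument("Unknown MoE hint value");
}

int main() {
    // Round trip: string -> enum -> string; unknown strings throw, matching
    // the OPENVINO_THROW path in the option above.
    std::cout << to_string(parse("HOST_ROUTED")) << "\n";  // prints HOST_ROUTED
}
```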

src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp

Lines changed: 52 additions & 0 deletions
```diff
@@ -248,6 +248,39 @@ static constexpr ov::Property<std::size_t> spatial_nway{"NPUW_SPATIAL_NWAY"};
  */
 static constexpr ov::Property<bool> spatial_dyn{"NPUW_SPATIAL_DYN"};
 
+/**
+ * @brief
+ * Type: uint64_t.
+ * MoE expert model compilation strategy for prefill-stage token chunking.
+ *
+ * When set to 0 (default): compiles multiple expert models with different chunk sizes
+ * {16, 32, 64, 128, 256} for dynamic chunk selection at runtime. This provides optimal
+ * performance by selecting the best chunk size based on the remaining token count.
+ *
+ * When set to a specific value (e.g., 128 or 256): compiles only a single expert model
+ * with the specified fixed chunk size. This reduces compilation time and memory usage but
+ * may not be optimal for all token counts.
+ *
+ * The chunk size should be a power of two for best hardware utilization.
+ * Note: this only affects the prefill stage (multi-token inference). The decoding stage
+ * (single token) always uses a dedicated model regardless of this setting.
+ *
+ * Default value: 0 (dynamic multi-model compilation).
+ */
+static constexpr ov::Property<uint64_t> moe_token_chunk_size{"NPUW_MOE_TOKEN_CHUNK_SIZE"};
+
+/**
+ * @brief Configure the MoE request pool size per layer for caching expert configurations.
+ *
+ * Controls the number of pre-allocated inference requests per MoE layer used to cache
+ * different expert combinations; LRU eviction is applied when the pool is full.
+ * Setting it to 0 disables the MoE request cache entirely.
+ *
+ * Type: std::size_t.
+ * Default value: 8 (cache up to 8 expert configurations per layer).
+ */
+static constexpr ov::Property<std::size_t> moe_pool_size{"NPUW_MOE_POOL_SIZE"};
+
 /**
  * @brief
  * Type: std::string.
@@ -433,6 +466,7 @@ static constexpr ov::Property<std::string> inputs_outputs{"NPUW_DUMP_IO"};
  * Default value: false.
  */
 static constexpr ov::Property<std::string> io_iters{"NPUW_DUMP_IO_ITERS"};
+
 }  // namespace dump
 
 namespace llm {
@@ -511,6 +545,24 @@ static constexpr ov::Property<bool> optimize_v_tensors{"NPUW_LLM_OPTIMIZE_V_TENS
  */
 static constexpr ov::Property<bool> cache_rope{"NPUW_LLM_CACHE_ROPE"};
 
+/**
+ * @brief
+ * Type: ::intel_npu::npuw::llm::MoEHint
+ * Specify the MoE (Mixture of Experts) implementation strategy for the prefill stage.
+ * Possible values: DENSE, HOST_ROUTED, DEVICE_ROUTED
+ * Default value: HOST_ROUTED (recommended for prefill to avoid NPU-unfriendly operations)
+ */
+static constexpr ov::Property<std::string> prefill_moe_hint{"NPUW_LLM_PREFILL_MOE_HINT"};
+
+/**
+ * @brief
+ * Type: ::intel_npu::npuw::llm::MoEHint
+ * Specify the MoE (Mixture of Experts) implementation strategy for the generate/decoding stage.
+ * Possible values: DENSE, HOST_ROUTED, DEVICE_ROUTED
+ * Default value: HOST_ROUTED (DEVICE_ROUTED is recommended for better decoding performance)
+ */
+static constexpr ov::Property<std::string> generate_moe_hint{"NPUW_LLM_GENERATE_MOE_HINT"};
+
 /**
  * @brief
  * Type: boolean
```
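To show how these header-declared properties are consumed, here is a hedged usage sketch: the property names come from this diff, while the model path and surrounding setup are illustrative and not part of the commit.

```cpp
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    // Illustrative model path; any MoE LLM prepared for NPUW would go here.
    auto model = core.read_model("gpt-oss-20b.xml");

    // MoE-related NPUW options are passed like any other OpenVINO property.
    ov::AnyMap config{
        {"NPUW_MOE_TOKEN_CHUNK_SIZE", "0"},            // 0 = compile all chunk sizes
        {"NPUW_MOE_POOL_SIZE", "8"},                   // per-layer request cache, LRU-evicted
        {"NPUW_LLM_PREFILL_MOE_HINT", "HOST_ROUTED"},
        {"NPUW_LLM_GENERATE_MOE_HINT", "HOST_ROUTED"},
    };
    auto compiled = core.compile_model(model, "NPU", config);
    return 0;
}
```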

src/plugins/intel_npu/src/al/src/config/npuw.cpp

Lines changed: 4 additions & 0 deletions
```diff
@@ -33,6 +33,8 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) {
     desc.add<NPUW_SPATIAL>();
     desc.add<NPUW_SPATIAL_NWAY>();
     desc.add<NPUW_SPATIAL_DYN>();
+    desc.add<NPUW_MOE_TOKEN_CHUNK_SIZE>();
+    desc.add<NPUW_MOE_POOL_SIZE>();
     desc.add<NPUW_ATTN>();
     desc.add<NPUW_ATTN_DYN>();
     desc.add<NPUW_ATTN_NO_COPY>();
@@ -69,6 +71,8 @@ void intel_npu::registerNPUWLLMOptions(OptionsDesc& desc) {
     desc.add<NPUW_LLM_MAX_LORA_RANK>();
     desc.add<NPUW_LLM_OPTIMIZE_V_TENSORS>();
    desc.add<NPUW_LLM_CACHE_ROPE>();
+    desc.add<NPUW_LLM_PREFILL_MOE_HINT>();
+    desc.add<NPUW_LLM_GENERATE_MOE_HINT>();
     desc.add<NPUW_LLM_GENERATE_PYRAMID>();
     desc.add<NPUW_LLM_PREFILL_CHUNK_SIZE>();
     desc.add<NPUW_LLM_ENABLE_PREFIX_CACHING>();
```

src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp

Lines changed: 12 additions & 0 deletions
```diff
@@ -420,6 +420,12 @@ void ov::npuw::IBaseInferRequest::unpack_closure(std::size_t idx, RqPtr request)
     const auto real_idx = comp_model_desc.replaced_by.value();
     auto& func_desc = m_npuw_model->m_compiled_submodels[real_idx];
 
+    // Skip MoE expert submodels - MoE experts require special unpacking logic according to the
+    // expert selection, which is handled later in the inference flow.
+    if (func_desc.moe_experts.has_value()) {
+        return;
+    }
+
     // Bind extra parameters from the function's closure
     // First, do easy things & delay heavy stuff
     std::vector<std::size_t> closure_unpack_required;
@@ -506,6 +512,12 @@ void ov::npuw::IBaseInferRequest::bind_global_params(std::size_t idx, RqPtr requ
     const auto& iodesc = m_subrequests_gio.at(idx);
 
     const auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx];
+
+    if (proto_comp_model_desc.moe_experts.has_value()) {
+        // Expert submodel does not have global parameters to bind
+        return;
+    }
+
     const bool is_spatial = proto_comp_model_desc.spatial.has_value();
     const bool is_attention = proto_comp_model_desc.attention.has_value();
     const bool is_pyramid_attention = proto_comp_model_desc.pyramid_attention.has_value();
```
