modelscope · Jintao-Huang · Feb 23, 2026 · Feb 23, 2026 · Feb 23, 2026 · Feb 23, 2026
diff --git a/docs/source/Instruction/Command-line-parameters.md b/docs/source/Instruction/Command-line-parameters.md
@@ -39,7 +39,7 @@
 - attn_impl: attention类型，可选项为'sdpa', 'eager', 'flash_attn', 'flash_attention_2', 'flash_attention_3'等。默认使用None，读取'config.json'。
   - 注意：这几种attention实现并不一定都支持，这取决于对应模型transformers实现的支持情况。
   - 若设置为'flash_attn'（兼容旧版本），则使用'flash_attention_2'。
-- experts_impl: 专家实现类型，可选项为'grouped_mm', 'batched_mm', 'eager'。默认为None。该特性需要"transformers>=5.0.0"。
+- 🔥experts_impl: 专家实现类型，可选项为'grouped_mm', 'batched_mm', 'eager'。默认为None。该特性需要"transformers>=5.0.0"。
 - new_special_tokens: 需要新增的特殊tokens。默认为`[]`。例子参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/new_special_tokens)。
   - 注意：你也可以传入以`.txt`结尾的文件路径，每行为一个special token。
 - num_labels: 分类模型（即`--task_type seq_cls`）需要指定该参数。代表标签数量，默认为None。

diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md
@@ -39,7 +39,7 @@ The command-line arguments will be introduced in four categories: basic argument
 - attn_impl: Attention implementation. Options include `'sdpa'`, `'eager'`, `'flash_attn'`, `'flash_attention_2'`, `'flash_attention_3'`, etc. Default is `None`, reading from config.json.
   - Note: Not all attention implementations may be supported, depending on the underlying Transformers library's support for the specific model.
   - If set to `'flash_attn'` (for backward compatibility), `'flash_attention_2'` will be used.
-- experts_impl: Expert implementation type, options are 'grouped_mm', 'batched_mm', 'eager'. Defaults to None. This feature requires "transformers>=5.0.0".
+- 🔥experts_impl: Expert implementation type. Options are `'grouped_mm'`, `'batched_mm'`, and `'eager'`. Default is `None`. This feature requires "transformers>=5.0.0".
 - new_special_tokens: List of additional special tokens to be added. Default is `[]`. Example usage can be found [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/new_special_tokens).
   - Note: You can also pass a `.txt` file path where each line contains one special token.
 - num_labels: Required for classification models (`--task_type seq_cls`). Indicates the number of labels. Default is `None`.

diff --git a/swift/arguments/base_args/base_args.py b/swift/arguments/base_args/base_args.py
@@ -308,7 +308,7 @@ def get_model_processor(self,
                             *,
                             model=None,
                             model_type=None,
-                            model_revision=None,
+                            revision=None,
                             task_type=None,
                             num_labels=None,
                             **kwargs):
@@ -319,7 +319,7 @@ def get_model_processor(self,
         # compat rlhf
         res['model_id_or_path'] = model or self.model
         res['model_type'] = model_type or self.model_type
-        res['model_revision'] = model_revision or self.model_revision
+        res['revision'] = revision or self.model_revision
         res['task_type'] = task_type or self.task_type
         res['num_labels'] = num_labels or self.num_labels
 

diff --git a/swift/megatron/model/gpt_bridge.py b/swift/megatron/model/gpt_bridge.py
@@ -1,6 +1,7 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
 import math
 import megatron.core
+import re
 import torch
 import torch.distributed as dist
 import torch.nn.functional as F
@@ -736,11 +737,10 @@ def _set_moe_state(
     def _get_hf_grouped(self, is_mtp_layer: bool = False):
         if self.model_type in {
                 'qwen2_moe', 'qwen3_moe', 'deepseek_v2', 'deepseek_v3', 'dots1', 'ernie4_5_moe', 'glm4_moe',
-                'glm4_moe_lite', 'glm4v_moe', 'minimax_m2', 'olmoe', 'qwen3_next', 'kimi_vl', 'qwen3_omni_moe'
+                'glm4_moe_lite', 'glm4v_moe', 'minimax_m2', 'olmoe', 'qwen3_next', 'kimi_vl', 'qwen3_omni_moe',
+                'qwen3_5_moe'
         }:
             return False, False
-        elif self.model_type == 'qwen3_5_moe' and is_mtp_layer:
-            return False, False
         return None, None
 
     def _get_transpose(self):
@@ -760,32 +760,46 @@ def _set_mlp_state(
         hf_mlp=None,
         is_mtp_layer: bool = False,
     ):
+        if to_mcore:
+            hf_state_dict = self._remove_prefix(hf_state_dict, hf_prefix)
         if hf_mlp is None:
             hf_mlp = self._get_hf_mlp(layer_idx)
         is_expert = ep_rank is not None
         num_local_experts = 1
         hf_grouped = False
         config = self.config
         if is_expert:
-            hf_grouped = not hasattr(hf_mlp.experts, '__len__')
-            hf_mlp = hf_mlp.experts if hf_grouped else hf_mlp.experts[0]
+            hf_mlp = hf_mlp.experts
+            # to_mcore: infer hf_grouped from the hf_state_dict keys (per-expert keys start with `<idx>.down_proj`).
+            # to_hf: default from the experts module (`__len__` present => per-expert); may be overridden below.
+            if to_mcore:
+                pattern = r'\d+\.down_proj'
+                hf_grouped = not any(re.match(pattern, k) is not None for k in hf_state_dict.keys())
+            else:
+                hf_grouped = not hasattr(hf_mlp, '__len__')
+            if hasattr(hf_mlp, '__len__'):
+                hf_mlp = hf_mlp[0]
             num_local_experts = config.num_moe_experts // self.ep_size
-        is_gate_up = hasattr(hf_mlp, 'gate_up_proj')
+        if to_mcore:
+            is_gate_up = any('gate_up_proj' in k for k in hf_state_dict.keys())
+        else:
+            is_gate_up = hasattr(hf_mlp, 'gate_up_proj')
         # transformers 5.0 compatibility
-        if self.is_transformers_5:
+        if self.is_transformers_5 and not to_mcore and is_expert:
             _hf_grouped, _is_gate_up = self._get_hf_grouped(is_mtp_layer)
             if _hf_grouped is not None:
                 hf_grouped = _hf_grouped
             if _is_gate_up is not None:
                 is_gate_up = _is_gate_up
         need_transpose = True
-        if self.is_transformers_5:
+        if self.is_transformers_5 and hf_grouped:
             need_transpose = self._get_transpose()
 
-        if to_mcore or hf_grouped:
+        if hf_grouped and not to_mcore:
             hf_state_dict = self._remove_prefix(hf_state_dict, hf_prefix)
-        else:
+        elif not to_mcore:
             hf_state_dict = {}
+
         # linear_fc1
         if to_mcore:
             has_scale_inv = any('_scale_inv' in k for k in hf_state_dict.keys())
@@ -1623,7 +1637,7 @@ def save_weights(self,
                 config = self.config
                 if config.mtp_num_layers:
                     hf_config.num_nextn_predict_layers = config.mtp_num_layers
-                if config.fp8 is not None and config.fp8_recipe == 'blockwise' and config.fp8_param_gather:
+                if config.fp8 is not None and config.fp8_recipe == 'blockwise' and config.fp8_param:
                     if getattr(hf_config, 'quantization_config', None) is None:
                         from transformers.utils.quantization_config import FineGrainedFP8Config
                         modules_to_not_convert = get_modules_to_not_convert(self.hf_model)

diff --git a/swift/pipelines/train/rlhf.py b/swift/pipelines/train/rlhf.py
@@ -87,7 +87,7 @@ def _prepare_single_model(self, key, origin_key, model_type, model_revision):
             model, processor = args.get_model_processor(
                 model=model_id_or_path,
                 model_type=model_type,
-                model_revision=model_revision,
+                revision=model_revision,
                 task_type=task_type,
                 num_labels=num_labels)