Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/source/Instruction/Command-line-parameters.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
- attn_impl: attention类型,可选项为'sdpa', 'eager', 'flash_attn', 'flash_attention_2', 'flash_attention_3'等。默认使用None,读取'config.json'。
- 注意:这几种attention实现并不一定都支持,这取决于对应模型transformers实现的支持情况。
- 若设置为'flash_attn'(兼容旧版本),则使用'flash_attention_2'。
- experts_impl: 专家实现类型,可选项为'grouped_mm', 'batched_mm', 'eager'。默认为None。该特性需要"transformers>=5.0.0"。
- 🔥experts_impl: 专家实现类型,可选项为'grouped_mm', 'batched_mm', 'eager'。默认为None。该特性需要"transformers>=5.0.0"。
- new_special_tokens: 需要新增的特殊tokens。默认为`[]`。例子参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/new_special_tokens)
- 注意:你也可以传入以`.txt`结尾的文件路径,每行为一个special token。
- num_labels: 分类模型(即`--task_type seq_cls`)需要指定该参数。代表标签数量,默认为None。
Expand Down
2 changes: 1 addition & 1 deletion docs/source_en/Instruction/Command-line-parameters.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ The command-line arguments will be introduced in four categories: basic argument
- attn_impl: Attention implementation. Options include `'sdpa'`, `'eager'`, `'flash_attn'`, `'flash_attention_2'`, `'flash_attention_3'`, etc. Default is `None`, in which case the value is read from `config.json`.
- Note: Not all attention implementations may be supported, depending on the underlying Transformers library's support for the specific model.
- If set to `'flash_attn'` (for backward compatibility), `'flash_attention_2'` will be used.
- experts_impl: Expert implementation type, options are 'grouped_mm', 'batched_mm', 'eager'. Defaults to None. This feature requires "transformers>=5.0.0".
- 🔥experts_impl: Expert implementation type. Options are `'grouped_mm'`, `'batched_mm'`, and `'eager'`. Default is `None`. This feature requires "transformers>=5.0.0".
- new_special_tokens: List of additional special tokens to be added. Default is `[]`. Example usage can be found [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/new_special_tokens).
- Note: You can also pass a `.txt` file path where each line contains one special token.
- num_labels: Required for classification models (`--task_type seq_cls`). Indicates the number of labels. Default is `None`.
Expand Down
4 changes: 2 additions & 2 deletions swift/arguments/base_args/base_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,7 +308,7 @@ def get_model_processor(self,
*,
model=None,
model_type=None,
model_revision=None,
revision=None,
task_type=None,
num_labels=None,
**kwargs):
Expand All @@ -319,7 +319,7 @@ def get_model_processor(self,
# compat rlhf
res['model_id_or_path'] = model or self.model
res['model_type'] = model_type or self.model_type
res['model_revision'] = model_revision or self.model_revision
res['revision'] = revision or self.model_revision
res['task_type'] = task_type or self.task_type
res['num_labels'] = num_labels or self.num_labels

Expand Down
36 changes: 25 additions & 11 deletions swift/megatron/model/gpt_bridge.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Copyright (c) ModelScope Contributors. All rights reserved.
import math
import megatron.core
import re
import torch
import torch.distributed as dist
import torch.nn.functional as F
Expand Down Expand Up @@ -736,11 +737,10 @@ def _set_moe_state(
def _get_hf_grouped(self, is_mtp_layer: bool = False):
if self.model_type in {
'qwen2_moe', 'qwen3_moe', 'deepseek_v2', 'deepseek_v3', 'dots1', 'ernie4_5_moe', 'glm4_moe',
'glm4_moe_lite', 'glm4v_moe', 'minimax_m2', 'olmoe', 'qwen3_next', 'kimi_vl', 'qwen3_omni_moe'
'glm4_moe_lite', 'glm4v_moe', 'minimax_m2', 'olmoe', 'qwen3_next', 'kimi_vl', 'qwen3_omni_moe',
'qwen3_5_moe'
}:
return False, False
elif self.model_type == 'qwen3_5_moe' and is_mtp_layer:
return False, False
return None, None

def _get_transpose(self):
Expand All @@ -760,32 +760,46 @@ def _set_mlp_state(
hf_mlp=None,
is_mtp_layer: bool = False,
):
if to_mcore:
hf_state_dict = self._remove_prefix(hf_state_dict, hf_prefix)
if hf_mlp is None:
hf_mlp = self._get_hf_mlp(layer_idx)
is_expert = ep_rank is not None
num_local_experts = 1
hf_grouped = False
config = self.config
if is_expert:
hf_grouped = not hasattr(hf_mlp.experts, '__len__')
hf_mlp = hf_mlp.experts if hf_grouped else hf_mlp.experts[0]
hf_mlp = hf_mlp.experts
# When converting to_mcore, hf_grouped is determined by default from the hf_state_dict condition.
# When converting to_hf, it is determined by default from the hf_mlp condition.
if to_mcore:
pattern = r'\d+\.down_proj'
hf_grouped = not any(re.match(pattern, k) is not None for k in hf_state_dict.keys())
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

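Using regex to detect separate experts in the state dict is a robust way to determine hf_grouped when loading weights. This is more reliable than relying on hardcoded model lists. You can simplify the generator expression slightly for better readability, since iterating a dict yields its keys and a match object is truthy, e.g. `hf_grouped = not any(re.match(pattern, k) for k in hf_state_dict)`.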

References
  1. Iterating directly over a dictionary iterates over its keys, and match objects are truthy in Python.

else:
hf_grouped = not hasattr(hf_mlp, '__len__')
if hasattr(hf_mlp, '__len__'):
hf_mlp = hf_mlp[0]
num_local_experts = config.num_moe_experts // self.ep_size
is_gate_up = hasattr(hf_mlp, 'gate_up_proj')
if to_mcore:
is_gate_up = any('gate_up_proj' in k for k in hf_state_dict.keys())
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Detecting is_gate_up by checking for the presence of the string in the state dict keys is a good improvement for robustness when to_mcore is True. The .keys() call can be omitted for brevity, e.g. `is_gate_up = any('gate_up_proj' in k for k in hf_state_dict)`.

References
  1. Iterating directly over a dictionary iterates over its keys.

else:
is_gate_up = hasattr(hf_mlp, 'gate_up_proj')
# transformers 5.0 compatibility
if self.is_transformers_5:
if self.is_transformers_5 and not to_mcore and is_expert:
_hf_grouped, _is_gate_up = self._get_hf_grouped(is_mtp_layer)
if _hf_grouped is not None:
hf_grouped = _hf_grouped
if _is_gate_up is not None:
is_gate_up = _is_gate_up
need_transpose = True
if self.is_transformers_5:
if self.is_transformers_5 and hf_grouped:
need_transpose = self._get_transpose()

if to_mcore or hf_grouped:
if hf_grouped and not to_mcore:
hf_state_dict = self._remove_prefix(hf_state_dict, hf_prefix)
else:
elif not to_mcore:
hf_state_dict = {}

# linear_fc1
if to_mcore:
has_scale_inv = any('_scale_inv' in k for k in hf_state_dict.keys())
Expand Down Expand Up @@ -1623,7 +1637,7 @@ def save_weights(self,
config = self.config
if config.mtp_num_layers:
hf_config.num_nextn_predict_layers = config.mtp_num_layers
if config.fp8 is not None and config.fp8_recipe == 'blockwise' and config.fp8_param_gather:
if config.fp8 is not None and config.fp8_recipe == 'blockwise' and config.fp8_param:
if getattr(hf_config, 'quantization_config', None) is None:
from transformers.utils.quantization_config import FineGrainedFP8Config
modules_to_not_convert = get_modules_to_not_convert(self.hf_model)
Expand Down
2 changes: 1 addition & 1 deletion swift/pipelines/train/rlhf.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def _prepare_single_model(self, key, origin_key, model_type, model_revision):
model, processor = args.get_model_processor(
model=model_id_or_path,
model_type=model_type,
model_revision=model_revision,
revision=model_revision,
task_type=task_type,
num_labels=num_labels)

Expand Down
Loading