Skip to content

Commit 7af5b61

Browse files
committed
Fix bug (#5121)
1 parent 00fc1a6 commit 7af5b61

File tree

11 files changed

+39
-31
lines changed

11 files changed

+39
-31
lines changed

docs/source/Instruction/命令行参数.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,8 @@
177177
- 🔥warmup_ratio: 默认为0.。
178178
- save_on_each_node: 默认为False。在多机训练时需要被考虑。
179179
- save_only_model: 是否只保存模型权重而不包含优化器状态,随机种子状态等内容。默认为False。
180-
- 🔥resume_from_checkpoint: 断点续训参数,传入checkpoint路径。默认为None。断点续训请保持其他参数不变,额外增加`--resume_from_checkpoint checkpoint_dir`
180+
- 🔥resume_from_checkpoint: 断点续训参数,传入checkpoint路径。默认为None。
181+
- 贴士:断点续训请保持其他参数不变,额外增加`--resume_from_checkpoint checkpoint_dir`。权重等信息将在trainer中读取。
181182
- 注意: resume_from_checkpoint会读取模型权重,优化器权重,随机种子,并从上次训练的steps继续开始训练。你可以指定`--resume_only_model`只读取模型权重。
182183
- 🔥ddp_find_unused_parameters: 默认为None。
183184
- 🔥dataloader_num_workers: 默认为None,若是windows平台,则设置为0,否则设置为1。

docs/source/Instruction/支持的模型和数据集.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,7 @@
219219
|[Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8](https://modelscope.cn/models/Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8)|qwen3_moe|qwen3|transformers>=4.51|✘|coding|[Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8](https://huggingface.co/Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8)|
220220
|[Qwen/Qwen3-235B-A22B-Thinking-2507](https://modelscope.cn/models/Qwen/Qwen3-235B-A22B-Thinking-2507)|qwen3_moe_thinking|qwen3_thinking|transformers>=4.51|✔|-|[Qwen/Qwen3-235B-A22B-Thinking-2507](https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507)|
221221
|[Qwen/Qwen3-235B-A22B-Thinking-2507-FP8](https://modelscope.cn/models/Qwen/Qwen3-235B-A22B-Thinking-2507-FP8)|qwen3_moe_thinking|qwen3_thinking|transformers>=4.51|✘|-|[Qwen/Qwen3-235B-A22B-Thinking-2507-FP8](https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507-FP8)|
222+
|[swift/Qwen3-235B-A22B-Thinking-2507-AWQ](https://modelscope.cn/models/swift/Qwen3-235B-A22B-Thinking-2507-AWQ)|qwen3_moe_thinking|qwen3_thinking|transformers>=4.51|✘|-|-|
222223
|[Qwen/Qwen3-Embedding-0.6B](https://modelscope.cn/models/Qwen/Qwen3-Embedding-0.6B)|qwen3_emb|qwen3_emb|-|✘|-|[Qwen/Qwen3-Embedding-0.6B](https://huggingface.co/Qwen/Qwen3-Embedding-0.6B)|
223224
|[Qwen/Qwen3-Embedding-4B](https://modelscope.cn/models/Qwen/Qwen3-Embedding-4B)|qwen3_emb|qwen3_emb|-|✘|-|[Qwen/Qwen3-Embedding-4B](https://huggingface.co/Qwen/Qwen3-Embedding-4B)|
224225
|[Qwen/Qwen3-Embedding-8B](https://modelscope.cn/models/Qwen/Qwen3-Embedding-8B)|qwen3_emb|qwen3_emb|-|✘|-|[Qwen/Qwen3-Embedding-8B](https://huggingface.co/Qwen/Qwen3-Embedding-8B)|

docs/source_en/Instruction/Command-line-parameters.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,8 @@ Other important parameters:
180180
- 🔥warmup_ratio: Default is 0.
181181
- save_on_each_node: Default is False. Should be considered in multi-node training.
182182
- save_only_model: Whether to save only the model weights without including optimizer state, random seed state, etc. Default is False.
183-
- 🔥resume_from_checkpoint: Parameter for resuming training from a checkpoint, pass the checkpoint path. Default is None. For resuming training from a checkpoint, keep other parameters unchanged and add `--resume_from_checkpoint checkpoint_dir` additionally.
183+
- 🔥resume_from_checkpoint: Parameter for resuming training from a checkpoint; pass the checkpoint path. Default is None.
184+
- Tip: For resuming training from a checkpoint, keep all other parameters unchanged and additionally include `--resume_from_checkpoint checkpoint_dir`. The weights and related information will be loaded in the trainer.
184185
- Note: `resume_from_checkpoint` will load the model weights, optimizer weights, and random seed, and continue training from the last trained steps. You can specify `--resume_only_model` to load only the model weights.
185186
- 🔥ddp_find_unused_parameters: Default is None.
186187
- 🔥dataloader_num_workers: Defaults to None. If the platform is Windows, it is set to 0; otherwise, it is set to 1.

docs/source_en/Instruction/Supported-models-and-datasets.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,7 @@ The table below introduces the models integrated with ms-swift:
219219
|[Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8](https://modelscope.cn/models/Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8)|qwen3_moe|qwen3|transformers>=4.51|✘|coding|[Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8](https://huggingface.co/Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8)|
220220
|[Qwen/Qwen3-235B-A22B-Thinking-2507](https://modelscope.cn/models/Qwen/Qwen3-235B-A22B-Thinking-2507)|qwen3_moe_thinking|qwen3_thinking|transformers>=4.51|✔|-|[Qwen/Qwen3-235B-A22B-Thinking-2507](https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507)|
221221
|[Qwen/Qwen3-235B-A22B-Thinking-2507-FP8](https://modelscope.cn/models/Qwen/Qwen3-235B-A22B-Thinking-2507-FP8)|qwen3_moe_thinking|qwen3_thinking|transformers>=4.51|✘|-|[Qwen/Qwen3-235B-A22B-Thinking-2507-FP8](https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507-FP8)|
222+
|[swift/Qwen3-235B-A22B-Thinking-2507-AWQ](https://modelscope.cn/models/swift/Qwen3-235B-A22B-Thinking-2507-AWQ)|qwen3_moe_thinking|qwen3_thinking|transformers>=4.51|✘|-|-|
222223
|[Qwen/Qwen3-Embedding-0.6B](https://modelscope.cn/models/Qwen/Qwen3-Embedding-0.6B)|qwen3_emb|qwen3_emb|-|✘|-|[Qwen/Qwen3-Embedding-0.6B](https://huggingface.co/Qwen/Qwen3-Embedding-0.6B)|
223224
|[Qwen/Qwen3-Embedding-4B](https://modelscope.cn/models/Qwen/Qwen3-Embedding-4B)|qwen3_emb|qwen3_emb|-|✘|-|[Qwen/Qwen3-Embedding-4B](https://huggingface.co/Qwen/Qwen3-Embedding-4B)|
224225
|[Qwen/Qwen3-Embedding-8B](https://modelscope.cn/models/Qwen/Qwen3-Embedding-8B)|qwen3_emb|qwen3_emb|-|✘|-|[Qwen/Qwen3-Embedding-8B](https://huggingface.co/Qwen/Qwen3-Embedding-8B)|

swift/llm/argument/train_args.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,7 @@ def __post_init__(self) -> None:
155155
'Please specify `--attn_impl flash_attn`.')
156156
if self.resume_from_checkpoint:
157157
self.resume_from_checkpoint = to_abspath(self.resume_from_checkpoint, True)
158+
# When `resume_only_model` is not set, the checkpoint weights are loaded later, in the trainer.
158159
if self.resume_only_model:
159160
if self.train_type == 'full':
160161
self.model = self.resume_from_checkpoint

swift/llm/infer/infer_engine/infer_engine.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,25 @@ def _get_stop_words(self, stop_words: List[Union[str, List[int], None]]) -> List
5858
stop.append(stop_word)
5959
return stop
6060

61+
def _get_stop_token_ids(self, stop_words: List[Union[str, List[int], None]]) -> List[int]:
62+
stop_token_ids: List[int] = []
63+
for stop_word in stop_words:
64+
if stop_word is None:
65+
continue
66+
if isinstance(stop_word, str):
67+
stop_word = self.tokenizer.encode(stop_word, add_special_tokens=False)
68+
if isinstance(stop_word, list):
69+
if len(stop_word) != 1:
70+
continue
71+
else:
72+
stop_token = stop_word[0]
73+
elif isinstance(stop_word, int):
74+
stop_token = stop_word
75+
assert isinstance(stop_token, int)
76+
if stop_token not in stop_token_ids:
77+
stop_token_ids.append(stop_token)
78+
return stop_token_ids
79+
6180
def async_iter_to_iter(self, async_iter, prog_bar, metrics) -> Iterator:
6281
queue = Queue()
6382

swift/llm/infer/infer_engine/lmdeploy_engine.py

Lines changed: 0 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -139,25 +139,6 @@ def _load_generation_config(self):
139139
else:
140140
self.generation_config = LmdeployGenerationConfig()
141141

142-
def _get_stop_token_ids(self, stop_words: List[Union[str, List[int], None]]) -> List[int]:
143-
stop_token_ids: List[int] = []
144-
for stop_word in stop_words:
145-
if stop_word is None:
146-
continue
147-
if isinstance(stop_word, str):
148-
stop_word = self.tokenizer.encode(stop_word, add_special_tokens=False)
149-
if isinstance(stop_word, list):
150-
if len(stop_word) != 1:
151-
continue
152-
else:
153-
stop_token = stop_word[0]
154-
elif isinstance(stop_word, int):
155-
stop_token = stop_word
156-
assert isinstance(stop_token, int)
157-
if stop_token not in stop_token_ids:
158-
stop_token_ids.append(stop_token)
159-
return stop_token_ids
160-
161142
def _add_stop_words(self, generation_config: LmdeployGenerationConfig, request_config: RequestConfig,
162143
template_meta: TemplateMeta) -> None:
163144
stop_words = (request_config.stop or []) + (self.generation_config.stop_words or []) + template_meta.stop_words

swift/llm/infer/infer_engine/vllm_engine.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,8 @@ def _add_stop_words(self, generation_config: SamplingParams, request_config: Req
232232
template_meta: TemplateMeta) -> None:
233233
stop_words = (request_config.stop or []) + (self.generation_config.stop or []) + template_meta.stop_words
234234
generation_config.stop = self._get_stop_words(stop_words)
235+
# The `stop` parameter is not effective in the v1 engine (tested with vllm 0.8.5.post), so single-token stop words are also passed as `stop_token_ids`.
236+
generation_config.stop_token_ids = self._get_stop_token_ids(stop_words)
235237

236238
@staticmethod
237239
def _version_ge(base_version: str):

swift/llm/model/model/qwen.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -577,6 +577,8 @@ def _get_cast_dtype(self) -> torch.dtype:
577577
ModelGroup([
578578
Model('Qwen/Qwen3-235B-A22B-Thinking-2507', 'Qwen/Qwen3-235B-A22B-Thinking-2507'),
579579
Model('Qwen/Qwen3-235B-A22B-Thinking-2507-FP8', 'Qwen/Qwen3-235B-A22B-Thinking-2507-FP8'),
580+
# awq
581+
Model('swift/Qwen3-235B-A22B-Thinking-2507-AWQ')
580582
]),
581583
],
582584
TemplateType.qwen3_thinking,

swift/megatron/utils/convert.py

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from megatron.training.initialize import initialize_megatron
1212
from megatron.training.utils import get_ltor_masks_and_position_ids
1313

14-
from swift.llm import ExportArguments, HfConfigFactory, get_model_tokenizer, get_template, save_checkpoint
14+
from swift.llm import ExportArguments, HfConfigFactory, prepare_model_template, save_checkpoint, to_device
1515
from swift.utils import get_logger, get_n_params_grads
1616
from ..argument import MegatronArguments
1717
from ..model import get_megatron_model_meta
@@ -82,11 +82,10 @@ def _to_cpu_hook(module, args, output):
8282
hook.remove()
8383

8484

85-
def test_convert_precision(hf_model, mg_model, processor, torch_dtype=torch.float32):
85+
def test_convert_precision(hf_model, mg_model, template, torch_dtype=torch.float32):
8686
_test_params_sum(hf_model)
8787
_test_params_sum(mg_model)
8888

89-
template = get_template(hf_model.model_meta.template, processor)
9089
input_ids = template.encode({'messages': [{'role': 'user', 'content': 'who are you?'}]})['input_ids']
9190
input_ids = torch.tensor(input_ids)[None].to('cuda')
9291

@@ -145,8 +144,8 @@ def _check_megatron_kwargs(kwargs):
145144

146145

147146
def convert_hf2mcore(args: ExportArguments) -> None:
148-
kwargs = args.get_model_kwargs()
149-
hf_model, processor = get_model_tokenizer(**kwargs)
147+
hf_model, template = prepare_model_template(args)
148+
processor = template.processor
150149
if args.thread_count is None:
151150
checkpoint_size = sum(get_n_params_grads(hf_model)[0]) * torch.finfo(args.torch_dtype).bits // 8e9
152151
args.thread_count = max(math.ceil(checkpoint_size / 10), 2) # 10GB
@@ -167,16 +166,16 @@ def convert_hf2mcore(args: ExportArguments) -> None:
167166
logger.info('Megatron model created successfully.')
168167
megatron_model_meta.convert_hf2mcore(hf_model, mg_model)
169168
if args.test_convert_precision:
170-
test_convert_precision(hf_model, mg_model, processor)
169+
test_convert_precision(hf_model, mg_model, template)
171170
logger.info('Successfully transferred HF model weights to MG model.')
172171
mg_save_checkpoint(1, [mg_model], None, None, 0)
173172
args.save_args()
174173
logger.info(f'Successfully saved Megatron model weights in `{args.output_dir}`.')
175174

176175

177176
def convert_mcore2hf(args: ExportArguments) -> None:
178-
kwargs = args.get_model_kwargs()
179-
hf_model, processor = get_model_tokenizer(**kwargs)
177+
hf_model, template = prepare_model_template(args)
178+
processor = template.processor
180179
if args.thread_count is None:
181180
checkpoint_size = sum(get_n_params_grads(hf_model)[0]) * torch.finfo(args.torch_dtype).bits // 8e9
182181
args.thread_count = max(math.ceil(checkpoint_size / 10), 2) # 10GB
@@ -198,7 +197,7 @@ def convert_mcore2hf(args: ExportArguments) -> None:
198197
logger.info('Megatron model created successfully.')
199198
megatron_model_meta.convert_mcore2hf(hf_model, mg_model)
200199
if args.test_convert_precision:
201-
test_convert_precision(hf_model, mg_model, processor)
200+
test_convert_precision(hf_model, mg_model, template)
202201
logger.info('Successfully transferred MG model weights to HF model.')
203202
save_checkpoint(
204203
hf_model,

0 commit comments

Comments
 (0)