Fix bug (#5121)

Jintao-Huang · Jintao-Huang · commit 7af5b614bbe5 · 2025-07-28T15:25:11.000+08:00
diff --git a/docs/source/Instruction/命令行参数.md b/docs/source/Instruction/命令行参数.md
@@ -177,7 +177,8 @@
 - 🔥warmup_ratio: 默认为0.。
 - save_on_each_node: 默认为False。在多机训练时需要被考虑。
 - save_only_model: 是否只保存模型权重而不包含优化器状态，随机种子状态等内容。默认为False。
-- 🔥resume_from_checkpoint: 断点续训参数，传入checkpoint路径。默认为None。断点续训请保持其他参数不变，额外增加`--resume_from_checkpoint checkpoint_dir`。
+- 🔥resume_from_checkpoint: 断点续训参数，传入checkpoint路径。默认为None。
+  - 贴士：断点续训请保持其他参数不变，额外增加`--resume_from_checkpoint checkpoint_dir`。权重等信息将在trainer中读取。
   - 注意: resume_from_checkpoint会读取模型权重，优化器权重，随机种子，并从上次训练的steps继续开始训练。你可以指定`--resume_only_model`只读取模型权重。
 - 🔥ddp_find_unused_parameters: 默认为None。
 - 🔥dataloader_num_workers: 默认为None，若是windows平台，则设置为0，否则设置为1。
diff --git a/docs/source/Instruction/支持的模型和数据集.md b/docs/source/Instruction/支持的模型和数据集.md
@@ -219,6 +219,7 @@
 |[Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8](https://modelscope.cn/models/Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8)|qwen3_moe|qwen3|transformers>=4.51|&#x2718;|coding|[Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8](https://huggingface.co/Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8)|
 |[Qwen/Qwen3-235B-A22B-Thinking-2507](https://modelscope.cn/models/Qwen/Qwen3-235B-A22B-Thinking-2507)|qwen3_moe_thinking|qwen3_thinking|transformers>=4.51|&#x2714;|-|[Qwen/Qwen3-235B-A22B-Thinking-2507](https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507)|
 |[Qwen/Qwen3-235B-A22B-Thinking-2507-FP8](https://modelscope.cn/models/Qwen/Qwen3-235B-A22B-Thinking-2507-FP8)|qwen3_moe_thinking|qwen3_thinking|transformers>=4.51|&#x2718;|-|[Qwen/Qwen3-235B-A22B-Thinking-2507-FP8](https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507-FP8)|
+|[swift/Qwen3-235B-A22B-Thinking-2507-AWQ](https://modelscope.cn/models/swift/Qwen3-235B-A22B-Thinking-2507-AWQ)|qwen3_moe_thinking|qwen3_thinking|transformers>=4.51|&#x2718;|-|-|
 |[Qwen/Qwen3-Embedding-0.6B](https://modelscope.cn/models/Qwen/Qwen3-Embedding-0.6B)|qwen3_emb|qwen3_emb|-|&#x2718;|-|[Qwen/Qwen3-Embedding-0.6B](https://huggingface.co/Qwen/Qwen3-Embedding-0.6B)|
 |[Qwen/Qwen3-Embedding-4B](https://modelscope.cn/models/Qwen/Qwen3-Embedding-4B)|qwen3_emb|qwen3_emb|-|&#x2718;|-|[Qwen/Qwen3-Embedding-4B](https://huggingface.co/Qwen/Qwen3-Embedding-4B)|
 |[Qwen/Qwen3-Embedding-8B](https://modelscope.cn/models/Qwen/Qwen3-Embedding-8B)|qwen3_emb|qwen3_emb|-|&#x2718;|-|[Qwen/Qwen3-Embedding-8B](https://huggingface.co/Qwen/Qwen3-Embedding-8B)|
diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md
@@ -180,7 +180,8 @@ Other important parameters:
 - 🔥warmup_ratio: Default is 0.
 - save_on_each_node: Default is False. Should be considered in multi-node training.
 - save_only_model: Whether to save only the model weights without including optimizer state, random seed state, etc. Default is False.
-- 🔥resume_from_checkpoint: Parameter for resuming training from a checkpoint, pass the checkpoint path. Default is None. For resuming training from a checkpoint, keep other parameters unchanged and add `--resume_from_checkpoint checkpoint_dir` additionally.
+- 🔥resume_from_checkpoint: Parameter for resuming training from a checkpoint, pass the checkpoint path. Default is None.
+  - Tip: For resuming training from a checkpoint, keep all other parameters unchanged and additionally include `--resume_from_checkpoint checkpoint_dir`. The weights and related information will be loaded in the trainer.
   - Note: `resume_from_checkpoint` will load the model weights, optimizer weights, and random seed, and continue training from the last trained steps. You can specify `--resume_only_model` to load only the model weights.
 - 🔥ddp_find_unused_parameters: Default is None.
 - 🔥dataloader_num_workers: Defaults to None. If the platform is Windows, it is set to 0; otherwise, it is set to 1.
diff --git a/docs/source_en/Instruction/Supported-models-and-datasets.md b/docs/source_en/Instruction/Supported-models-and-datasets.md
@@ -219,6 +219,7 @@ The table below introduces the models integrated with ms-swift:
 |[Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8](https://modelscope.cn/models/Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8)|qwen3_moe|qwen3|transformers>=4.51|&#x2718;|coding|[Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8](https://huggingface.co/Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8)|
 |[Qwen/Qwen3-235B-A22B-Thinking-2507](https://modelscope.cn/models/Qwen/Qwen3-235B-A22B-Thinking-2507)|qwen3_moe_thinking|qwen3_thinking|transformers>=4.51|&#x2714;|-|[Qwen/Qwen3-235B-A22B-Thinking-2507](https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507)|
 |[Qwen/Qwen3-235B-A22B-Thinking-2507-FP8](https://modelscope.cn/models/Qwen/Qwen3-235B-A22B-Thinking-2507-FP8)|qwen3_moe_thinking|qwen3_thinking|transformers>=4.51|&#x2718;|-|[Qwen/Qwen3-235B-A22B-Thinking-2507-FP8](https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507-FP8)|
+|[swift/Qwen3-235B-A22B-Thinking-2507-AWQ](https://modelscope.cn/models/swift/Qwen3-235B-A22B-Thinking-2507-AWQ)|qwen3_moe_thinking|qwen3_thinking|transformers>=4.51|&#x2718;|-|-|
 |[Qwen/Qwen3-Embedding-0.6B](https://modelscope.cn/models/Qwen/Qwen3-Embedding-0.6B)|qwen3_emb|qwen3_emb|-|&#x2718;|-|[Qwen/Qwen3-Embedding-0.6B](https://huggingface.co/Qwen/Qwen3-Embedding-0.6B)|
 |[Qwen/Qwen3-Embedding-4B](https://modelscope.cn/models/Qwen/Qwen3-Embedding-4B)|qwen3_emb|qwen3_emb|-|&#x2718;|-|[Qwen/Qwen3-Embedding-4B](https://huggingface.co/Qwen/Qwen3-Embedding-4B)|
 |[Qwen/Qwen3-Embedding-8B](https://modelscope.cn/models/Qwen/Qwen3-Embedding-8B)|qwen3_emb|qwen3_emb|-|&#x2718;|-|[Qwen/Qwen3-Embedding-8B](https://huggingface.co/Qwen/Qwen3-Embedding-8B)|
diff --git a/swift/llm/argument/train_args.py b/swift/llm/argument/train_args.py
@@ -155,6 +155,7 @@ def __post_init__(self) -> None:
                                  'Please specify `--attn_impl flash_attn`.')
         if self.resume_from_checkpoint:
             self.resume_from_checkpoint = to_abspath(self.resume_from_checkpoint, True)
+            # The non-resume_only_model will have its weights loaded in the trainer.
             if self.resume_only_model:
                 if self.train_type == 'full':
                     self.model = self.resume_from_checkpoint
diff --git a/swift/llm/infer/infer_engine/infer_engine.py b/swift/llm/infer/infer_engine/infer_engine.py
@@ -58,6 +58,25 @@ def _get_stop_words(self, stop_words: List[Union[str, List[int], None]]) -> List
                 stop.append(stop_word)
         return stop
 
+    def _get_stop_token_ids(self, stop_words: List[Union[str, List[int], None]]) -> List[int]:
+        stop_token_ids: List[int] = []
+        for stop_word in stop_words:
+            if stop_word is None:
+                continue
+            if isinstance(stop_word, str):
+                stop_word = self.tokenizer.encode(stop_word, add_special_tokens=False)
+            if isinstance(stop_word, list):
+                if len(stop_word) != 1:
+                    continue
+                else:
+                    stop_token = stop_word[0]
+            elif isinstance(stop_word, int):
+                stop_token = stop_word
+            assert isinstance(stop_token, int)
+            if stop_token not in stop_token_ids:
+                stop_token_ids.append(stop_token)
+        return stop_token_ids
+
     def async_iter_to_iter(self, async_iter, prog_bar, metrics) -> Iterator:
         queue = Queue()
 
diff --git a/swift/llm/infer/infer_engine/lmdeploy_engine.py b/swift/llm/infer/infer_engine/lmdeploy_engine.py
@@ -139,25 +139,6 @@ def _load_generation_config(self):
         else:
             self.generation_config = LmdeployGenerationConfig()
 
-    def _get_stop_token_ids(self, stop_words: List[Union[str, List[int], None]]) -> List[int]:
-        stop_token_ids: List[int] = []
-        for stop_word in stop_words:
-            if stop_word is None:
-                continue
-            if isinstance(stop_word, str):
-                stop_word = self.tokenizer.encode(stop_word, add_special_tokens=False)
-            if isinstance(stop_word, list):
-                if len(stop_word) != 1:
-                    continue
-                else:
-                    stop_token = stop_word[0]
-            elif isinstance(stop_word, int):
-                stop_token = stop_word
-            assert isinstance(stop_token, int)
-            if stop_token not in stop_token_ids:
-                stop_token_ids.append(stop_token)
-        return stop_token_ids
-
     def _add_stop_words(self, generation_config: LmdeployGenerationConfig, request_config: RequestConfig,
                         template_meta: TemplateMeta) -> None:
         stop_words = (request_config.stop or []) + (self.generation_config.stop_words or []) + template_meta.stop_words
diff --git a/swift/llm/infer/infer_engine/vllm_engine.py b/swift/llm/infer/infer_engine/vllm_engine.py
@@ -232,6 +232,8 @@ def _add_stop_words(self, generation_config: SamplingParams, request_config: Req
                         template_meta: TemplateMeta) -> None:
         stop_words = (request_config.stop or []) + (self.generation_config.stop or []) + template_meta.stop_words
         generation_config.stop = self._get_stop_words(stop_words)
+        # stop parameter is not effective in v1 engine (test version: vllm 0.8.5.post)
+        generation_config.stop_token_ids = self._get_stop_token_ids(stop_words)
 
     @staticmethod
     def _version_ge(base_version: str):
diff --git a/swift/llm/model/model/qwen.py b/swift/llm/model/model/qwen.py
@@ -577,6 +577,8 @@ def _get_cast_dtype(self) -> torch.dtype:
             ModelGroup([
                 Model('Qwen/Qwen3-235B-A22B-Thinking-2507', 'Qwen/Qwen3-235B-A22B-Thinking-2507'),
                 Model('Qwen/Qwen3-235B-A22B-Thinking-2507-FP8', 'Qwen/Qwen3-235B-A22B-Thinking-2507-FP8'),
+                # awq
+                Model('swift/Qwen3-235B-A22B-Thinking-2507-AWQ')
             ]),
         ],
         TemplateType.qwen3_thinking,
diff --git a/swift/megatron/utils/convert.py b/swift/megatron/utils/convert.py
@@ -11,7 +11,7 @@
 from megatron.training.initialize import initialize_megatron
 from megatron.training.utils import get_ltor_masks_and_position_ids
 
-from swift.llm import ExportArguments, HfConfigFactory, get_model_tokenizer, get_template, save_checkpoint
+from swift.llm import ExportArguments, HfConfigFactory, prepare_model_template, save_checkpoint, to_device
 from swift.utils import get_logger, get_n_params_grads
 from ..argument import MegatronArguments
 from ..model import get_megatron_model_meta
@@ -82,11 +82,10 @@ def _to_cpu_hook(module, args, output):
             hook.remove()
 
 
-def test_convert_precision(hf_model, mg_model, processor, torch_dtype=torch.float32):
+def test_convert_precision(hf_model, mg_model, template, torch_dtype=torch.float32):
     _test_params_sum(hf_model)
     _test_params_sum(mg_model)
 
-    template = get_template(hf_model.model_meta.template, processor)
     input_ids = template.encode({'messages': [{'role': 'user', 'content': 'who are you?'}]})['input_ids']
     input_ids = torch.tensor(input_ids)[None].to('cuda')
 
@@ -145,8 +144,8 @@ def _check_megatron_kwargs(kwargs):
 
 
 def convert_hf2mcore(args: ExportArguments) -> None:
-    kwargs = args.get_model_kwargs()
-    hf_model, processor = get_model_tokenizer(**kwargs)
+    hf_model, template = prepare_model_template(args)
+    processor = template.processor
     if args.thread_count is None:
         checkpoint_size = sum(get_n_params_grads(hf_model)[0]) * torch.finfo(args.torch_dtype).bits // 8e9
         args.thread_count = max(math.ceil(checkpoint_size / 10), 2)  # 10GB
@@ -167,16 +166,16 @@ def convert_hf2mcore(args: ExportArguments) -> None:
     logger.info('Megatron model created successfully.')
     megatron_model_meta.convert_hf2mcore(hf_model, mg_model)
     if args.test_convert_precision:
-        test_convert_precision(hf_model, mg_model, processor)
+        test_convert_precision(hf_model, mg_model, template)
     logger.info('Successfully transferred HF model weights to MG model.')
     mg_save_checkpoint(1, [mg_model], None, None, 0)
     args.save_args()
     logger.info(f'Successfully saved Megatron model weights in `{args.output_dir}`.')
 
 
 def convert_mcore2hf(args: ExportArguments) -> None:
-    kwargs = args.get_model_kwargs()
-    hf_model, processor = get_model_tokenizer(**kwargs)
+    hf_model, template = prepare_model_template(args)
+    processor = template.processor
     if args.thread_count is None:
         checkpoint_size = sum(get_n_params_grads(hf_model)[0]) * torch.finfo(args.torch_dtype).bits // 8e9
         args.thread_count = max(math.ceil(checkpoint_size / 10), 2)  # 10GB
@@ -198,7 +197,7 @@ def convert_mcore2hf(args: ExportArguments) -> None:
     logger.info('Megatron model created successfully.')
     megatron_model_meta.convert_mcore2hf(hf_model, mg_model)
     if args.test_convert_precision:
-        test_convert_precision(hf_model, mg_model, processor)
+        test_convert_precision(hf_model, mg_model, template)
     logger.info('Successfully transferred MG model weights to HF model.')
     save_checkpoint(
         hf_model,
diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py
@@ -123,7 +123,7 @@ def __init__(self,
                         reward_func_kwargs['tokenizer'] = self.processing_class
                     reward_funcs[i] = reward_func_class(**reward_func_kwargs)
                 elif not callable(reward_func):
-                    raise ValueError(f'reward_function {reward_func} is not implemented in swift.llm.plugin')
+                    raise ValueError(f'reward_function {reward_func} is not implemented in swift.plugin')
 
         self.reward_funcs = reward_funcs
         self.reward_func_names = []