Skip to content

Commit 4cb480e

Browse files
committed
update
1 parent 55a75c4 commit 4cb480e

File tree

2 files changed

+4
-2
lines changed

2 files changed

+4
-2
lines changed

swift/megatron/arguments/megatron_args.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -731,6 +731,8 @@ def __post_init__(self):
731731
os.environ.setdefault('CUDA_DEVICE_MAX_CONNECTIONS', '1')
732732
if self.recompute_granularity == 'none':
733733
self.recompute_granularity = None
734+
if self.apply_wd_to_qk_layernorm and self.hf_model_type != 'qwen3_next':
735+
raise ValueError('apply_wd_to_qk_layernorm is only supported for qwen3_next')
734736
self._set_default()
735737
self.model_info, self.model_meta = get_model_info_meta(
736738
self.model, model_type=self.model_type, use_hf=self.use_hf, hub_token=self.hub_token)

swift/megatron/trainers/base.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -337,8 +337,8 @@ def _get_param_groups(
337337

338338
if no_weight_decay_cond is not None:
339339
no_wd: bool = no_weight_decay_cond(name, param)
340-
elif args.apply_wd_to_qk_layernorm and any(name.endswith(k) for k in ['q_layernorm.weight', 'k_layernorm.weight']):
341-
# assert args.hf_model_type == 'qwen3_next', 'currently only support qwen3_next'
340+
elif args.apply_wd_to_qk_layernorm and any(
341+
name.endswith(k) for k in ['q_layernorm.weight', 'k_layernorm.weight']):
342342
no_wd = False
343343
else:
344344
# Do not regularize biases and norm parameters.

0 commit comments

Comments
 (0)