Checklist / 检查清单
Bug Description / Bug 描述
[rank1]: Traceback (most recent call last):
[rank1]: File "/data/ms-swift/swift/cli/_megatron/sft.py", line 7, in <module>
[rank1]: megatron_sft_main()
[rank1]: File "/data/ms-swift/swift/megatron/pipelines/train/sft.py", line 91, in megatron_sft_main
[rank1]: return MegatronSft(args).main()
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/data/ms-swift/swift/pipelines/base.py", line 52, in main
[rank1]: result = self.run()
[rank1]: ^^^^^^^^^^
[rank1]: File "/data/ms-swift/swift/megatron/pipelines/train/sft.py", line 66, in run
[rank1]: trainer = self.prepare_trainer()
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/data/ms-swift/swift/megatron/pipelines/train/sft.py", line 34, in prepare_trainer
[rank1]: return MegatronTrainer(self.args, self.template)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/data/ms-swift/swift/megatron/trainers/base.py", line 68, in __init__
[rank1]: self.prepare_model()
[rank1]: File "/data/ms-swift/swift/megatron/trainers/base.py", line 191, in prepare_model
[rank1]: self.peft_models = self._prepare_peft_model(self.unwrapped_models)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/data/ms-swift/swift/megatron/trainers/base.py", line 197, in _prepare_peft_model
[rank1]: self.bridge.load_weights(models, args.model_dir)
[rank1]: File "/data/ms-swift/swift/megatron/model/gpt_bridge.py", line 1688, in load_weights
[rank1]: list(self._convert([mg_model], state_dict, hf_prefix, True, 'Loading: '))
[rank1]: File "/data/ms-swift/swift/megatron/model/gpt_bridge.py", line 1593, in _convert
[rank1]: res = self._set_layer_state(mg_layer, hf_state_dict, f'{self.hf_layers_prefix}.', layer_idx, to_mcore)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/data/ms-swift/swift/megatron/model/gpt_bridge.py", line 1487, in _set_layer_state
[rank1]: hf_state_dict.update(self._set_layer_attn(mg_layer, hf_state_dict, layer_idx, to_mcore))
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/data/ms-swift/swift/megatron/model/mm_gpts/qwen3_5_gdn.py", line 22, in _set_layer_attn
[rank1]: self._set_linear_attn_state(mg_attn, hf_state_dict, 'linear_attn.', layer_idx, to_mcore))
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/data/ms-swift/swift/megatron/model/gpt_bridge.py", line 1316, in _set_linear_attn_state
[rank1]: in_proj_weight = torch.cat([
[rank1]: ^^^^^^^^^^^
[rank1]: RuntimeError: Promotion for Float8 Types is not supported, attempted to promote Float8_e4m3fn and BFloat16
How to Reproduce / 如何复现
swift版本
swift.version: 4.1.0.dev0
训练shell脚本
OMP_NUM_THREADS=14 \
IMAGE_MAX_TOKEN_NUM=1024 \
SWIFT_USE_MCORE_GDN=1 \
SKIP_MULTIMODAL_MTP_VALIDATION=1 \
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
NNODES=$nnodes \
NODE_RANK=$INDEX \
MASTER_ADDR=$CHIEF_IP \
MASTER_PORT=29500 \
NPROC_PER_NODE=$nproc_per_node \
megatron sft \
--model Qwen/Qwen3___5-397B-A17B-FP8 \
--save_safetensors true \
--dataset $DATA \
--load_from_cache_file true \
--fp8_recipe blockwise \
--fp8_format e4m3 \
--fp8_param_gather false \
--split_dataset_ratio 0 \
--moe_permute_fusion true \
--tensor_model_parallel_size 8 \
--pipeline-model-parallel-size 4 \
--expert_model_parallel_size 8 \
--moe_grouped_gemm true \
--moe_shared_expert_overlap true \
--moe_aux_loss_coeff 1e-6 \
--micro_batch_size 8 \
--global_batch_size 256 \
--recompute_granularity full \
--recompute_method uniform \
--recompute_num_layers 1 \
--num_train_epochs 3 \
--finetune true \
--cross_entropy_loss_fusion true \
--add_non_thinking_prefix true \
--loss_scale ignore_empty_think \
--tuner_type full \
--lr 5e-6 \
--lr_warmup_fraction 0.05 \
--output_dir $OUTPUT_PATH \
--save_strategy epoch \
--max_length 8192 \
--dataset_num_proc 64 \
--no_save_optim true \
--no_save_rng true \
--sequence_parallel true \
--attention_backend flash \
--freeze_llm false \
--freeze_vit true \
--freeze_aligner true \
--use-distributed-optimizer \
--optimizer_cpu_offload true \
--use_precision_aware_optimizer true \
--packing true \
--padding_free true
Additional Information / 补充信息
cuda 12.9.1
transformers 5.3.0
transformer_engine 2.12.0
flash_attn 2.8.3
megatron 0.16.1
Checklist / 检查清单
Bug Description / Bug 描述
[rank1]: Traceback (most recent call last):
[rank1]: File "/data/ms-swift/swift/cli/_megatron/sft.py", line 7, in <module>
[rank1]: megatron_sft_main()
[rank1]: File "/data/ms-swift/swift/megatron/pipelines/train/sft.py", line 91, in megatron_sft_main
[rank1]: return MegatronSft(args).main()
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/data/ms-swift/swift/pipelines/base.py", line 52, in main
[rank1]: result = self.run()
[rank1]: ^^^^^^^^^^
[rank1]: File "/data/ms-swift/swift/megatron/pipelines/train/sft.py", line 66, in run
[rank1]: trainer = self.prepare_trainer()
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/data/ms-swift/swift/megatron/pipelines/train/sft.py", line 34, in prepare_trainer
[rank1]: return MegatronTrainer(self.args, self.template)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/data/ms-swift/swift/megatron/trainers/base.py", line 68, in __init__
[rank1]: self.prepare_model()
[rank1]: File "/data/ms-swift/swift/megatron/trainers/base.py", line 191, in prepare_model
[rank1]: self.peft_models = self._prepare_peft_model(self.unwrapped_models)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/data/ms-swift/swift/megatron/trainers/base.py", line 197, in _prepare_peft_model
[rank1]: self.bridge.load_weights(models, args.model_dir)
[rank1]: File "/data/ms-swift/swift/megatron/model/gpt_bridge.py", line 1688, in load_weights
[rank1]: list(self._convert([mg_model], state_dict, hf_prefix, True, 'Loading: '))
[rank1]: File "/data/ms-swift/swift/megatron/model/gpt_bridge.py", line 1593, in _convert
[rank1]: res = self._set_layer_state(mg_layer, hf_state_dict, f'{self.hf_layers_prefix}.', layer_idx, to_mcore)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/data/ms-swift/swift/megatron/model/gpt_bridge.py", line 1487, in _set_layer_state
[rank1]: hf_state_dict.update(self._set_layer_attn(mg_layer, hf_state_dict, layer_idx, to_mcore))
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/data/ms-swift/swift/megatron/model/mm_gpts/qwen3_5_gdn.py", line 22, in _set_layer_attn
[rank1]: self._set_linear_attn_state(mg_attn, hf_state_dict, 'linear_attn.', layer_idx, to_mcore))
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/data/ms-swift/swift/megatron/model/gpt_bridge.py", line 1316, in _set_linear_attn_state
[rank1]: in_proj_weight = torch.cat([
[rank1]: ^^^^^^^^^^^
[rank1]: RuntimeError: Promotion for Float8 Types is not supported, attempted to promote Float8_e4m3fn and BFloat16
How to Reproduce / 如何复现
swift版本
swift.version: 4.1.0.dev0
训练shell脚本
Additional Information / 补充信息
cuda 12.9.1
transformers 5.3.0
transformer_engine 2.12.0
flash_attn 2.8.3
megatron 0.16.1