Checklist / 检查清单
Bug Description / Bug 描述
[rank0]: Traceback (most recent call last):
[rank0]: File "/data/agent/ms-swift-v4/ms-swift/swift/cli/_megatron/sft.py", line 7, in <module>
[rank0]: megatron_sft_main()
[rank0]: File "/data/agent/ms-swift-v4/ms-swift/swift/megatron/pipelines/train/sft.py", line 91, in megatron_sft_main
[rank0]: return MegatronSft(args).main()
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/agent/ms-swift-v4/ms-swift/swift/pipelines/base.py", line 52, in main
[rank0]: result = self.run()
[rank0]: ^^^^^^^^^^
[rank0]: File "/data/agent/ms-swift-v4/ms-swift/swift/megatron/pipelines/train/sft.py", line 68, in run
[rank0]: trainer.train(train_dataset, val_dataset)
[rank0]: File "/data/agent/ms-swift-v4/ms-swift/swift/megatron/trainers/base.py", line 637, in train
[rank0]: metrics, grad_norm, update_successful = self.train_step(train_data_iterator)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/agent/ms-swift-v4/ms-swift/swift/megatron/trainers/base.py", line 858, in train_step
[rank0]: metrics = forward_backward_func(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/pipeline_parallel/schedules.py", line 636, in forward_backward_no_pipelining
[rank0]: output_tensor, num_tokens = forward_step(
[rank0]: ^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/pipeline_parallel/schedules.py", line 423, in forward_step
[rank0]: output_tensor, loss_func = forward_step_func(data_iterator, model)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/agent/ms-swift-v4/ms-swift/swift/megatron/trainers/trainer.py", line 124, in forward_step
[rank0]: output_tensor = model(**data)
[rank0]: ^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/distributed/data_parallel_base.py", line 22, in forward
[rank0]: return self.module(*inputs, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/transformer/module.py", line 489, in forward
[rank0]: outputs = self.module(*inputs, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/mcore_bridge/model/gpt_model.py", line 326, in forward
[rank0]: hidden_states = self.decoder(
[rank0]: ^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/transformer/transformer_block.py", line 619, in __call__
[rank0]: return super().__call__(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/transformer/module.py", line 352, in __call__
[rank0]: return super().__call__(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/transformer/transformer_block.py", line 736, in forward
[rank0]: hidden_states = self._checkpointed_forward(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/transformer/transformer_block.py", line 538, in _checkpointed_forward
[rank0]: hidden_states, context = checkpoint_handler(
[rank0]: ^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/transformer/transformer_block.py", line 521, in checkpoint_handler
[rank0]: return tensor_parallel.checkpoint(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/tensor_parallel/random.py", line 576, in checkpoint
[rank0]: return CheckpointFunction.apply(function, distribute_saved_activations, *args)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/autograd/function.py", line 583, in apply
[rank0]: return super().apply(*args, **kwargs) # type: ignore[misc]
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/tensor_parallel/random.py", line 517, in forward
[rank0]: outputs = run_function(*args)
[rank0]: ^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/transformer/transformer_block.py", line 489, in custom_forward
[rank0]: hidden_states, context = layer(
[rank0]: ^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/transformer/transformer_layer.py", line 1217, in __call__
[rank0]: return super().__call__(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/transformer/module.py", line 352, in __call__
[rank0]: return super().__call__(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/mcore_bridge/patcher.py", line 524, in forward
[rank0]: hidden_states, context = self._forward_attention(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/transformer/transformer_layer.py", line 597, in _forward_attention
[rank0]: attention_output_with_bias = self.self_attention(
[rank0]: ^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/transformer/attention.py", line 1150, in forward
[rank0]: core_attn_out = apply_module(self.core_attention)(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/extensions/transformer_engine.py", line 1411, in forward
[rank0]: core_attn_out = super().forward(query, key, value, attention_mask, **_fa_kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/transformer_engine/pytorch/jit.py", line 67, in wrapper
[rank0]: return disabled_f(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/_dynamo/external_utils.py", line 203, in nonrecursive_disable_wrapper
[rank0]: return fn(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py", line 1449, in forward
[rank0]: return self.flash_attention(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/transformer_engine/pytorch/attention/dot_product_attention/backends.py", line 1004, in forward
[rank0]: output = func(
[rank0]: ^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/flash_attn/flash_attn_interface.py", line 1443, in flash_attn_varlen_func
[rank0]: return FlashAttnVarlenFunc.apply(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/autograd/function.py", line 583, in apply
[rank0]: return super().apply(*args, **kwargs) # type: ignore[misc]
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/flash_attn/flash_attn_interface.py", line 925, in forward
[rank0]: out_padded, softmax_lse, S_dmask, rng_state = _wrapped_flash_attn_varlen_forward(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/_ops.py", line 1209, in __call__
[rank0]: return self._op(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: ValueError: vector::reserve
How to Reproduce / 如何复现
PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True'
NPROC_PER_NODE=2
CUDA_VISIBLE_DEVICES=4,5
megatron sft
--model Qwen/Qwen3-4B-Instruct-2507
--use_hf true
--save_safetensors true
--packing true
--dataset "dev_nemotron_if_samples.jsonl"
--tuner_type lora
--lora_rank 8
--lora_alpha 32
--target_modules all-linear
--tensor_model_parallel_size 2
--sequence_parallel true
--micro_batch_size 1
--global_batch_size 16
--recompute_granularity full
--recompute_method uniform
--recompute_num_layers 1
--lora_rank 8
--lora_alpha 32
--tuner_type 'lora'
--cross_entropy_loss_fusion true
--lr 1e-5
--lr_warmup_fraction 0.05
--min_lr 1e-6
--num_train_epochs 1
--output_dir megatron_output/Qwen3-4B-Instruct
--save_steps 100
--max_length 2048
--system 'You are a helpful assistant.'
--dataloader_num_workers 4
--no_save_optim true
--no_save_rng true
--dataset_num_proc 4
--model_author swift
--model_name swift-robot
Additional Information / 补充信息
No response
Checklist / 检查清单
Bug Description / Bug 描述
[rank0]: Traceback (most recent call last):
[rank0]: File "/data/agent/ms-swift-v4/ms-swift/swift/cli/_megatron/sft.py", line 7, in <module>
[rank0]: megatron_sft_main()
[rank0]: File "/data/agent/ms-swift-v4/ms-swift/swift/megatron/pipelines/train/sft.py", line 91, in megatron_sft_main
[rank0]: return MegatronSft(args).main()
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/agent/ms-swift-v4/ms-swift/swift/pipelines/base.py", line 52, in main
[rank0]: result = self.run()
[rank0]: ^^^^^^^^^^
[rank0]: File "/data/agent/ms-swift-v4/ms-swift/swift/megatron/pipelines/train/sft.py", line 68, in run
[rank0]: trainer.train(train_dataset, val_dataset)
[rank0]: File "/data/agent/ms-swift-v4/ms-swift/swift/megatron/trainers/base.py", line 637, in train
[rank0]: metrics, grad_norm, update_successful = self.train_step(train_data_iterator)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/agent/ms-swift-v4/ms-swift/swift/megatron/trainers/base.py", line 858, in train_step
[rank0]: metrics = forward_backward_func(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/pipeline_parallel/schedules.py", line 636, in forward_backward_no_pipelining
[rank0]: output_tensor, num_tokens = forward_step(
[rank0]: ^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/pipeline_parallel/schedules.py", line 423, in forward_step
[rank0]: output_tensor, loss_func = forward_step_func(data_iterator, model)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/agent/ms-swift-v4/ms-swift/swift/megatron/trainers/trainer.py", line 124, in forward_step
[rank0]: output_tensor = model(**data)
[rank0]: ^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/distributed/data_parallel_base.py", line 22, in forward
[rank0]: return self.module(*inputs, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/transformer/module.py", line 489, in forward
[rank0]: outputs = self.module(*inputs, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/mcore_bridge/model/gpt_model.py", line 326, in forward
[rank0]: hidden_states = self.decoder(
[rank0]: ^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/transformer/transformer_block.py", line 619, in __call__
[rank0]: return super().__call__(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/transformer/module.py", line 352, in __call__
[rank0]: return super().__call__(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/transformer/transformer_block.py", line 736, in forward
[rank0]: hidden_states = self._checkpointed_forward(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/transformer/transformer_block.py", line 538, in _checkpointed_forward
[rank0]: hidden_states, context = checkpoint_handler(
[rank0]: ^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/transformer/transformer_block.py", line 521, in checkpoint_handler
[rank0]: return tensor_parallel.checkpoint(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/tensor_parallel/random.py", line 576, in checkpoint
[rank0]: return CheckpointFunction.apply(function, distribute_saved_activations, *args)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/autograd/function.py", line 583, in apply
[rank0]: return super().apply(*args, **kwargs) # type: ignore[misc]
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/tensor_parallel/random.py", line 517, in forward
[rank0]: outputs = run_function(*args)
[rank0]: ^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/transformer/transformer_block.py", line 489, in custom_forward
[rank0]: hidden_states, context = layer(
[rank0]: ^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/transformer/transformer_layer.py", line 1217, in __call__
[rank0]: return super().__call__(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/transformer/module.py", line 352, in __call__
[rank0]: return super().__call__(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/mcore_bridge/patcher.py", line 524, in forward
[rank0]: hidden_states, context = self._forward_attention(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/transformer/transformer_layer.py", line 597, in _forward_attention
[rank0]: attention_output_with_bias = self.self_attention(
[rank0]: ^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/transformer/attention.py", line 1150, in forward
[rank0]: core_attn_out = apply_module(self.core_attention)(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/extensions/transformer_engine.py", line 1411, in forward
[rank0]: core_attn_out = super().forward(query, key, value, attention_mask, **_fa_kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/transformer_engine/pytorch/jit.py", line 67, in wrapper
[rank0]: return disabled_f(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/_dynamo/external_utils.py", line 203, in nonrecursive_disable_wrapper
[rank0]: return fn(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py", line 1449, in forward
[rank0]: return self.flash_attention(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/transformer_engine/pytorch/attention/dot_product_attention/backends.py", line 1004, in forward
[rank0]: output = func(
[rank0]: ^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/flash_attn/flash_attn_interface.py", line 1443, in flash_attn_varlen_func
[rank0]: return FlashAttnVarlenFunc.apply(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/autograd/function.py", line 583, in apply
[rank0]: return super().apply(*args, **kwargs) # type: ignore[misc]
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/flash_attn/flash_attn_interface.py", line 925, in forward
[rank0]: out_padded, softmax_lse, S_dmask, rng_state = _wrapped_flash_attn_varlen_forward(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/_ops.py", line 1209, in __call__
[rank0]: return self._op(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: ValueError: vector::reserve
How to Reproduce / 如何复现
PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True'
NPROC_PER_NODE=2
CUDA_VISIBLE_DEVICES=4,5
megatron sft
--model Qwen/Qwen3-4B-Instruct-2507
--use_hf true
--save_safetensors true
--packing true
--dataset "dev_nemotron_if_samples.jsonl"
--tuner_type lora
--lora_rank 8
--lora_alpha 32
--target_modules all-linear
--tensor_model_parallel_size 2
--sequence_parallel true
--micro_batch_size 1
--global_batch_size 16
--recompute_granularity full
--recompute_method uniform
--recompute_num_layers 1
--lora_rank 8
--lora_alpha 32
--tuner_type 'lora'
--cross_entropy_loss_fusion true
--lr 1e-5
--lr_warmup_fraction 0.05
--min_lr 1e-6
--num_train_epochs 1
--output_dir megatron_output/Qwen3-4B-Instruct
--save_steps 100
--max_length 2048
--system 'You are a helpful assistant.'
--dataloader_num_workers 4
--no_save_optim true
--no_save_rng true
--dataset_num_proc 4
--model_author swift
--model_name swift-robot
Additional Information / 补充信息
No response