
Megatron SFT: ValueError: vector::reserve #9047

@chapter544

Description

Checklist

  • I have searched existing issues, and this is a new bug report.

Bug Description

[rank0]: Traceback (most recent call last):
[rank0]: File "/data/agent/ms-swift-v4/ms-swift/swift/cli/_megatron/sft.py", line 7, in
[rank0]: megatron_sft_main()
[rank0]: File "/data/agent/ms-swift-v4/ms-swift/swift/megatron/pipelines/train/sft.py", line 91, in megatron_sft_main
[rank0]: return MegatronSft(args).main()
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/agent/ms-swift-v4/ms-swift/swift/pipelines/base.py", line 52, in main
[rank0]: result = self.run()
[rank0]: ^^^^^^^^^^
[rank0]: File "/data/agent/ms-swift-v4/ms-swift/swift/megatron/pipelines/train/sft.py", line 68, in run
[rank0]: trainer.train(train_dataset, val_dataset)
[rank0]: File "/data/agent/ms-swift-v4/ms-swift/swift/megatron/trainers/base.py", line 637, in train
[rank0]: metrics, grad_norm, update_successful = self.train_step(train_data_iterator)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/agent/ms-swift-v4/ms-swift/swift/megatron/trainers/base.py", line 858, in train_step
[rank0]: metrics = forward_backward_func(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/pipeline_parallel/schedules.py", line 636, in forward_backward_no_pipelining
[rank0]: output_tensor, num_tokens = forward_step(
[rank0]: ^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/pipeline_parallel/schedules.py", line 423, in forward_step
[rank0]: output_tensor, loss_func = forward_step_func(data_iterator, model)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/agent/ms-swift-v4/ms-swift/swift/megatron/trainers/trainer.py", line 124, in forward_step
[rank0]: output_tensor = model(**data)
[rank0]: ^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/distributed/data_parallel_base.py", line 22, in forward
[rank0]: return self.module(*inputs, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/transformer/module.py", line 489, in forward
[rank0]: outputs = self.module(*inputs, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/mcore_bridge/model/gpt_model.py", line 326, in forward
[rank0]: hidden_states = self.decoder(
[rank0]: ^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/transformer/transformer_block.py", line 619, in call
[rank0]: return super().call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/transformer/module.py", line 352, in call
[rank0]: return super().call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/transformer/transformer_block.py", line 736, in forward
[rank0]: hidden_states = self._checkpointed_forward(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/transformer/transformer_block.py", line 538, in _checkpointed_forward
[rank0]: hidden_states, context = checkpoint_handler(
[rank0]: ^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/transformer/transformer_block.py", line 521, in checkpoint_handler
[rank0]: return tensor_parallel.checkpoint(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/tensor_parallel/random.py", line 576, in checkpoint
[rank0]: return CheckpointFunction.apply(function, distribute_saved_activations, *args)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/autograd/function.py", line 583, in apply
[rank0]: return super().apply(*args, **kwargs) # type: ignore[misc]
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/tensor_parallel/random.py", line 517, in forward
[rank0]: outputs = run_function(*args)
[rank0]: ^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/transformer/transformer_block.py", line 489, in custom_forward
[rank0]: hidden_states, context = layer(
[rank0]: ^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/transformer/transformer_layer.py", line 1217, in call
[rank0]: return super().call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/transformer/module.py", line 352, in call
[rank0]: return super().call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/mcore_bridge/patcher.py", line 524, in forward
[rank0]: hidden_states, context = self._forward_attention(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/transformer/transformer_layer.py", line 597, in _forward_attention
[rank0]: attention_output_with_bias = self.self_attention(
[rank0]: ^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/transformer/attention.py", line 1150, in forward
[rank0]: core_attn_out = apply_module(self.core_attention)(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/megatron/core/extensions/transformer_engine.py", line 1411, in forward
[rank0]: core_attn_out = super().forward(query, key, value, attention_mask, **_fa_kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/transformer_engine/pytorch/jit.py", line 67, in wrapper
[rank0]: return disabled_f(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/_dynamo/external_utils.py", line 203, in nonrecursive_disable_wrapper
[rank0]: return fn(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py", line 1449, in forward
[rank0]: return self.flash_attention(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/transformer_engine/pytorch/attention/dot_product_attention/backends.py", line 1004, in forward
[rank0]: output = func(
[rank0]: ^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/flash_attn/flash_attn_interface.py", line 1443, in flash_attn_varlen_func
[rank0]: return FlashAttnVarlenFunc.apply(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/autograd/function.py", line 583, in apply
[rank0]: return super().apply(*args, **kwargs) # type: ignore[misc]
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/flash_attn/flash_attn_interface.py", line 925, in forward
[rank0]: out_padded, softmax_lse, S_dmask, rng_state = _wrapped_flash_attn_varlen_forward(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/ms-swift/lib/python3.12/site-packages/torch/_ops.py", line 1209, in call
[rank0]: return self._op(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: ValueError: vector::reserve
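
The exception is raised from inside the flash_attn custom op (_wrapped_flash_attn_varlen_forward), so a useful first step is to check whether flash_attn_varlen_func fails on its own, outside Megatron and Transformer Engine. Below is a minimal sketch of such a check; the shapes, dtypes, and packed-sequence layout are illustrative assumptions, not the values the trainer actually passes.

```python
# Standalone check of the op at the bottom of the traceback.
# If this also raises "ValueError: vector::reserve", the problem is in the
# flash-attn build itself (e.g. a wheel compiled against a different torch),
# not in ms-swift/Megatron. All shapes below are illustrative assumptions.
import torch
from flash_attn import flash_attn_varlen_func

device = "cuda"
num_heads, head_dim = 8, 128

# Two packed sequences of lengths 5 and 3 (8 tokens total), mimicking --packing.
cu_seqlens = torch.tensor([0, 5, 8], dtype=torch.int32, device=device)
total_tokens, max_seqlen = 8, 5

q = torch.randn(total_tokens, num_heads, head_dim, dtype=torch.bfloat16, device=device)
k = torch.randn_like(q)
v = torch.randn_like(q)

out = flash_attn_varlen_func(
    q, k, v,
    cu_seqlens_q=cu_seqlens, cu_seqlens_k=cu_seqlens,
    max_seqlen_q=max_seqlen, max_seqlen_k=max_seqlen,
    causal=True,
)
print(out.shape)  # expected: torch.Size([8, 8, 128])
```

If the standalone call succeeds, the mismatch is more likely in the arguments Transformer Engine assembles for the varlen path (cu_seqlens / max_seqlen) when packing is enabled.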

How to Reproduce

PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=4,5 \
megatron sft \
    --model Qwen/Qwen3-4B-Instruct-2507 \
    --use_hf true \
    --save_safetensors true \
    --packing true \
    --dataset "dev_nemotron_if_samples.jsonl" \
    --tuner_type lora \
    --lora_rank 8 \
    --lora_alpha 32 \
    --target_modules all-linear \
    --tensor_model_parallel_size 2 \
    --sequence_parallel true \
    --micro_batch_size 1 \
    --global_batch_size 16 \
    --recompute_granularity full \
    --recompute_method uniform \
    --recompute_num_layers 1 \
    --cross_entropy_loss_fusion true \
    --lr 1e-5 \
    --lr_warmup_fraction 0.05 \
    --min_lr 1e-6 \
    --num_train_epochs 1 \
    --output_dir megatron_output/Qwen3-4B-Instruct \
    --save_steps 100 \
    --max_length 2048 \
    --system 'You are a helpful assistant.' \
    --dataloader_num_workers 4 \
    --no_save_optim true \
    --no_save_rng true \
    --dataset_num_proc 4 \
    --model_author swift \
    --model_name swift-robot
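
Because the error surfaces inside the flash_attn op rather than in swift code, the exact versions on the failing call path are relevant for triage. A small helper to collect them is sketched below; the distribution names are assumptions based on the usual PyPI names.

```python
# Report the versions of the packages that appear in the traceback.
# Distribution names below are assumed PyPI names; adjust if installed
# under different names in your environment.
from importlib.metadata import PackageNotFoundError, version

import torch

for pkg in ("torch", "flash-attn", "transformer-engine", "megatron-core", "ms-swift"):
    try:
        print(f"{pkg}=={version(pkg)}")
    except PackageNotFoundError:
        print(f"{pkg}: not installed")

print("CUDA available:", torch.cuda.is_available())
print("torch built for CUDA:", torch.version.cuda)
```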

Additional Information

No response
