-
Notifications
You must be signed in to change notification settings - Fork 42
Description
作者你好,我在做 SFT 训练的时候遇到了如下错误,请问该如何解决呢?
[rank0]: Traceback (most recent call last):
[rank0]: File "/home/sig95vg/wangcy/VLA/DeepThinkVLA/src/train.py", line 236, in <module>
[rank0]: train(model_args, data_args, training_args)
[rank0]: File "/home/sig95vg/wangcy/VLA/DeepThinkVLA/src/train.py", line 201, in train
[rank0]: experiment.train()
[rank0]: File "/home/sig95vg/wangcy/VLA/DeepThinkVLA/src/sft/sft_runner.py", line 130, in train
[rank0]: self.trainer.train(resume_from_checkpoint=self.resume_from_checkpoint)
[rank0]: File "/home/sig95vg/wangcy/VLA/DeepThinkVLA/src/sft/sft_trainer.py", line 276, in train
[rank0]: return super().train(resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
[rank0]: File "/home/sig95vg/miniconda3/envs/deepthinkvla/lib/python3.10/site-packages/transformers/trainer.py", line 2171, in train
[rank0]: return inner_training_loop(
[rank0]: File "/home/sig95vg/miniconda3/envs/deepthinkvla/lib/python3.10/site-packages/transformers/trainer.py", line 2531, in _inner_training_loop
[rank0]: tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
[rank0]: File "/home/sig95vg/miniconda3/envs/deepthinkvla/lib/python3.10/site-packages/transformers/trainer.py", line 3712, in training_step
[rank0]: self.accelerator.backward(loss, **kwargs)
[rank0]: File "/home/sig95vg/miniconda3/envs/deepthinkvla/lib/python3.10/site-packages/accelerate/accelerator.py", line 2844, in backward
[rank0]: self.deepspeed_engine_wrapped.backward(loss, sync_gradients=self.sync_gradients, **kwargs)
[rank0]: File "/home/sig95vg/miniconda3/envs/deepthinkvla/lib/python3.10/site-packages/accelerate/utils/deepspeed.py", line 270, in backward
[rank0]: self.engine.backward(loss, **kwargs)
[rank0]: File "/home/sig95vg/miniconda3/envs/deepthinkvla/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn
[rank0]: ret_val = func(*args, **kwargs)
[rank0]: File "/home/sig95vg/miniconda3/envs/deepthinkvla/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2492, in backward
[rank0]: loss.backward(**backward_kwargs)
[rank0]: File "/home/sig95vg/miniconda3/envs/deepthinkvla/lib/python3.10/site-packages/torch/_tensor.py", line 521, in backward
[rank0]: torch.autograd.backward(
[rank0]: File "/home/sig95vg/miniconda3/envs/deepthinkvla/lib/python3.10/site-packages/torch/autograd/__init__.py", line 289, in backward
[rank0]: _engine_run_backward(
[rank0]: File "/home/sig95vg/miniconda3/envs/deepthinkvla/lib/python3.10/site-packages/torch/autograd/graph.py", line 768, in _engine_run_backward
[rank0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
[rank0]: File "/home/sig95vg/miniconda3/envs/deepthinkvla/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn
[rank0]: ret_val = func(*args, **kwargs)
[rank0]: File "/home/sig95vg/miniconda3/envs/deepthinkvla/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 1277, in reduce_partition_and_remove_grads
[rank0]: self._remaining_grad_acc_hooks = count_used_parameters_in_backward(
[rank0]: File "/home/sig95vg/miniconda3/envs/deepthinkvla/lib/python3.10/site-packages/deepspeed/runtime/utils.py", line 1419, in count_used_parameters_in_backward
[rank0]: grad_fn = _get_grad_fn_or_grad_acc(param)
[rank0]: File "/home/sig95vg/miniconda3/envs/deepthinkvla/lib/python3.10/site-packages/torch/autograd/graph.py", line 161, in _get_grad_fn_or_grad_acc
[rank0]: return t.view_as(t).grad_fn.next_functions[0][0]
[rank0]: AttributeError: 'NoneType' object has no attribute 'next_functions'