Skip to content

finetune是否支持50系显卡 #1543

@Havedream6

Description

@Havedream6

nvidia-5090
cuda12.8
python3.11.7
torch2.7.1+cuda128

If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
warnings.warn(
[rank0]: Traceback (most recent call last):
[rank0]: File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2506, in _run_ninja_build
[rank0]: subprocess.run(
[rank0]: File "/data/miniconda3/envs/emb-ft/lib/python3.11/subprocess.py", line 571, in run
[rank0]: raise CalledProcessError(retcode, process.args,
[rank0]: subprocess.CalledProcessError: Command '['ninja', '-v']' returned non-zero exit status 1.

[rank0]: The above exception was the direct cause of the following exception:

[rank0]: Traceback (most recent call last):
[rank0]: File "<frozen runpy>", line 198, in _run_module_as_main
[rank0]: File "<frozen runpy>", line 88, in _run_code
[rank0]: File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/FlagEmbedding/finetune/embedder/encoder_only/m3/main.py", line 27, in <module>
[rank0]: main()
[rank0]: File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/FlagEmbedding/finetune/embedder/encoder_only/m3/main.py", line 23, in main
[rank0]: runner.run()
[rank0]: File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/FlagEmbedding/abc/finetune/embedder/AbsRunner.py", line 149, in run
[rank0]: self.trainer.train(resume_from_checkpoint=self.training_args.resume_from_checkpoint)
[rank0]: File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/transformers/trainer.py", line 2325, in train
[rank0]: return inner_training_loop(
[rank0]: ^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/transformers/trainer.py", line 2483, in inner_training_loop
[rank0]: model, self.optimizer, self.lr_scheduler = self.accelerator.prepare(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/accelerate/accelerator.py", line 1551, in prepare
[rank0]: result = self.prepare_deepspeed(*args)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/accelerate/accelerator.py", line 2296, in prepare_deepspeed
[rank0]: engine, optimizer, _, lr_scheduler = ds_initialize(**kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/deepspeed/__init__.py", line 203, in initialize
[rank0]: engine = DeepSpeedEngine(args=args,
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 355, in __init__
[rank0]: self._configure_optimizer(optimizer, model_parameters)
[rank0]: File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 1445, in _configure_optimizer
[rank0]: basic_optimizer = self._configure_basic_optimizer(model_parameters)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 1528, in _configure_basic_optimizer
[rank0]: optimizer = FusedAdam(
[rank0]: ^^^^^^^^^^
[rank0]: File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/deepspeed/ops/adam/fused_adam.py", line 94, in __init__
[rank0]: fused_adam_cuda = FusedAdamBuilder().load()
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/deepspeed/ops/op_builder/builder.py", line 542, in load
[rank0]: return self.jit_load(verbose)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/deepspeed/ops/op_builder/builder.py", line 591, in jit_load
[rank0]: op_module = load(name=self.name,
[rank0]: ^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1623, in load
[rank0]: return _jit_compile(
[rank0]: ^^^^^^^^^^^^^
[rank0]: File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2076, in _jit_compile
[rank0]: _write_ninja_file_and_build_library(
[rank0]: File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2222, in _write_ninja_file_and_build_library
[rank0]: _run_ninja_build(
[rank0]: File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2522, in _run_ninja_build
[rank0]: raise RuntimeError(message) from e
[rank0]: RuntimeError: Error building extension 'fused_adam': [1/3] /usr/local/cuda/bin/nvcc --generate-dependencies-with-compile --dependency-output multi_tensor_adam.cuda.o.d -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE="_gcc" -DPYBIND11_STDLIB="_libstdcpp" -DPYBIND11_BUILD_ABI="_cxxabi1016" -I/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -I/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/deepspeed/ops/csrc/adam -isystem /data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/torch/include -isystem /data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /usr/local/cuda/include -isystem /data/miniconda3/envs/emb-ft/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_120,code=compute_120 -gencode=arch=compute_120,code=sm_120 --compiler-options '-fPIC' -O3 -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -lineinfo --use_fast_math -gencode=arch=compute_120,code=sm_120 -gencode=arch=compute_120,code=compute_120 -UC10_USE_GLOG -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -std=c++17 -c /data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/deepspeed/ops/csrc/adam/multi_tensor_adam.cu -o multi_tensor_adam.cuda.o
[rank0]: FAILED: [code=1] multi_tensor_adam.cuda.o
[rank0]: /usr/local/cuda/bin/nvcc --generate-dependencies-with-compile --dependency-output multi_tensor_adam.cuda.o.d -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE="_gcc" -DPYBIND11_STDLIB="_libstdcpp" -DPYBIND11_BUILD_ABI="_cxxabi1016" -I/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -I/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/deepspeed/ops/csrc/adam -isystem /data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/torch/include -isystem /data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /usr/local/cuda/include -isystem /data/miniconda3/envs/emb-ft/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_120,code=compute_120 -gencode=arch=compute_120,code=sm_120 --compiler-options '-fPIC' -O3 -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -lineinfo --use_fast_math -gencode=arch=compute_120,code=sm_120 -gencode=arch=compute_120,code=compute_120 -UC10_USE_GLOG -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -std=c++17 -c /data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/deepspeed/ops/csrc/adam/multi_tensor_adam.cu -o multi_tensor_adam.cuda.o
[rank0]: nvcc fatal : Unsupported gpu architecture 'compute_120'
[rank0]: [2/3] c++ -MMD -MF fused_adam_frontend.o.d -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE="_gcc" -DPYBIND11_STDLIB="_libstdcpp" -DPYBIND11_BUILD_ABI="_cxxabi1016" -I/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -I/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/deepspeed/ops/csrc/adam -isystem /data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/torch/include -isystem /data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /usr/local/cuda/include -isystem /data/miniconda3/envs/emb-ft/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -fPIC -std=c++17 -O3 -std=c++17 -g -Wno-reorder -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -UC10_USE_GLOG -DBF16_AVAILABLE -c /data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/deepspeed/ops/csrc/adam/fused_adam_frontend.cpp -o fused_adam_frontend.o
[rank0]: ninja: build stopped: subcommand failed.

[rank0]:[W1104 15:58:51.171196633 ProcessGroupNCCL.cpp:1479] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
E1104 15:58:54.409000 2899866 site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1) local_rank: 0 (pid: 2899981) of binary: /data/miniconda3/envs/emb-ft/bin/python3.11
Traceback (most recent call last):
File "/data/miniconda3/envs/emb-ft/bin/torchrun", line 7, in <module>
sys.exit(main())
^^^^^^
File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
return f(*args, **kwargs)
^^^^^^^^^^^^^^^^^^
File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/torch/distributed/run.py", line 892, in main
run(args)
File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/torch/distributed/run.py", line 883, in run
elastic_launch(
File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 139, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/miniconda3/envs/emb-ft/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 270, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:

FlagEmbedding.finetune.embedder.encoder_only.m3 FAILED

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions