-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Description
ImportError: deepspeed>=0.9.3 is required for a normal functioning of this module, but found deepspeed==0.8.3.
Cluster: g5.8xlarge (A10 with 128GB CPU RAM)
DBR = 13.2 ML
Full error trace:
2023-08-16 16:27:36 ERROR [main] main failed
Traceback (most recent call last):
File "/Workspace/Repos/[email protected]/dolly/training/trainer.py", line 332, in <module>
main()
File "/databricks/python/lib/python3.10/site-packages/click/core.py", line 1128, in __call__
return self.main(*args, **kwargs)
File "/databricks/python/lib/python3.10/site-packages/click/core.py", line 1053, in main
rv = self.invoke(ctx)
File "/databricks/python/lib/python3.10/site-packages/click/core.py", line 1395, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/databricks/python/lib/python3.10/site-packages/click/core.py", line 754, in invoke
return __callback(*args, **kwargs)
File "/Workspace/Repos/[email protected]/dolly/training/trainer.py", line 324, in main
train(**kwargs)
File "/Workspace/Repos/[email protected]/dolly/training/trainer.py", line 241, in train
training_args = TrainingArguments(
File "<string>", line 112, in __init__
File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/transformers/training_args.py", line 1607, in __post_init__
self.hf_deepspeed_config = HfTrainerDeepSpeedConfig(self.deepspeed)
File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/transformers/deepspeed.py", line 76, in __init__
super().__init__(config_file_or_dict)
File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/transformers/deepspeed.py", line 65, in __init__
dep_version_check("deepspeed")
File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/transformers/dependency_versions_check.py", line 63, in dep_version_check
require_version(deps[pkg], hint)
File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/transformers/utils/versions.py", line 111, in require_version
_compare_versions(op, got_ver, want_ver, requirement, pkg, hint)
File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/transformers/utils/versions.py", line 44, in _compare_versions
raise ImportError(
ImportError: deepspeed>=0.9.3 is required for a normal functioning of this module, but found deepspeed==0.8.3.
Traceback (most recent call last):
File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/Workspace/Repos/[email protected]/dolly/training/trainer.py", line 332, in <module>
main()
File "/databricks/python/lib/python3.10/site-packages/click/core.py", line 1128, in __call__
return self.main(*args, **kwargs)
File "/databricks/python/lib/python3.10/site-packages/click/core.py", line 1053, in main
rv = self.invoke(ctx)
File "/databricks/python/lib/python3.10/site-packages/click/core.py", line 1395, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/databricks/python/lib/python3.10/site-packages/click/core.py", line 754, in invoke
return __callback(*args, **kwargs)
File "/Workspace/Repos/[email protected]/dolly/training/trainer.py", line 324, in main
train(**kwargs)
File "/Workspace/Repos/[email protected]/dolly/training/trainer.py", line 241, in train
training_args = TrainingArguments(
File "<string>", line 112, in __init__
File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/transformers/training_args.py", line 1607, in __post_init__
self.hf_deepspeed_config = HfTrainerDeepSpeedConfig(self.deepspeed)
File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/transformers/deepspeed.py", line 76, in __init__
super().__init__(config_file_or_dict)
File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/transformers/deepspeed.py", line 65, in __init__
dep_version_check("deepspeed")
File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/transformers/dependency_versions_check.py", line 63, in dep_version_check
require_version(deps[pkg], hint)
File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/transformers/utils/versions.py", line 111, in require_version
_compare_versions(op, got_ver, want_ver, requirement, pkg, hint)
File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/transformers/utils/versions.py", line 44, in _compare_versions
raise ImportError(
ImportError: deepspeed>=0.9.3 is required for a normal functioning of this module, but found deepspeed==0.8.3.
[2023-08-16 16:27:39,388] [INFO] [launch.py:318:sigkill_handler] Killing subprocess 2407
[2023-08-16 16:27:39,388] [ERROR] [launch.py:324:sigkill_handler] ['/local_disk0/.ephemeral_nfs/envs/pythonEnv-6642b3dd-3203-413e-aede-795ea4fff0e2/bin/python', '-u', '-m', 'training.trainer', '--local_rank=0', '--input-model', 'databricks/dolly-v2-7b', '--deepspeed', '/Workspace/Repos/[email protected]/dolly/config/a10_config.json', '--epochs', '2', '--local-output-dir', '/local_disk0/dolly_training/dolly__4087989587289246__2023-08-16T16:25:30', '--dbfs-output-dir', '/dbfs/dolly_training/dolly__4087989587289246__2023-08-16T16:25:30', '--per-device-train-batch-size', '4', '--per-device-eval-batch-size', '4', '--logging-steps', '10', '--save-steps', '200', '--save-total-limit', '20', '--eval-steps', '50', '--warmup-steps', '50', '--test-size', '200', '--lr', '5e-6', '--bf16', 'true'] exits with return code = 1