You can check https://github.com/deepmodeling/dpgen/discussions/785; it seems you hit a similar problem.
Could you check why Slurm fails to run these tasks and provide more details?
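For example (a sketch; the job ID and remote directory below are taken from the traceback in the post, so substitute the values from your own run):

sacct -j 416884 --format=JobID,JobName,State,ExitCode,Elapsed   # ask Slurm why the job terminated
cd /share/home/xlzou/WORKSPACE/zjl9/DPGEN/work/04d8a682beb8bfafb345306a91319a282951fae4/002
cat train.log   # dpdispatcher sends both stdout and stderr of the task here (outlog/errlog in the job detail)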
You may use …
It did not work at all and just stopped at iter.000000 task 01.
Traceback (most recent call last):
File "/share/home/xlzou/.local/lib/python3.9/site-packages/dpdispatcher/submission.py", line 241, in handle_unexpected_submission_state
job.handle_unexpected_job_state()
File "/share/home/xlzou/.local/lib/python3.9/site-packages/dpdispatcher/submission.py", line 605, in handle_unexpected_job_state
raise RuntimeError(f"job:{self.job_hash} {self.job_id} failed {self.fail_count} times.job_detail:{self}")
RuntimeError: job:80b0713a4333c3d802c77c72166a0772e9f9b968 416884 failed 3 times.job_detail:{'80b0713a4333c3d802c77c72166a0772e9f9b968': {'job_task_list': [{'command': "/bin/sh -c '{ if [ ! -f model.ckpt.index ]; then dp train input.json; else dp train input.json --restart model.ckpt; fi }'&&dp freeze", 'task_work_path': '002', 'forward_files': ['input.json'], 'backward_files': ['frozen_model.pb', 'lcurve.out', 'train.log', 'model.ckpt.meta', 'model.ckpt.index', 'model.ckpt.data-00000-of-00001', 'checkpoint'], 'outlog': 'train.log', 'errlog': 'train.log'}], 'resources': {'number_node': 1, 'cpu_per_node': 1, 'gpu_per_node': 1, 'queue_name': 'gpu', 'group_size': 1, 'custom_flags': ['#SBATCH -o output.%j', '#SBATCH -e err -o out'], 'strategy': {'if_cuda_multi_devices': False, 'ratio_unfinished': 0.0}, 'para_deg': 1, 'module_purge': False, 'module_unload_list': [], 'module_list': [], 'source_list': ['~/WORKSPACE/zjl9/DPGEN/deepmd.sh'], 'envs': {}, 'wait_time': 0, 'kwargs': {}}, 'job_state': <JobStatus.terminated: 4>, 'job_id': '416884', 'fail_count': 3}}
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/share/home/xlzou/.local/bin/dpgen", line 8, in
sys.exit(main())
File "/share/home/xlzou/.local/lib/python3.9/site-packages/dpgen/main.py", line 185, in main
args.func(args)
File "/share/home/xlzou/.local/lib/python3.9/site-packages/dpgen/generator/run.py", line 3642, in gen_run
run_iter (args.PARAM, args.MACHINE)
File "/share/home/xlzou/.local/lib/python3.9/site-packages/dpgen/generator/run.py", line 3607, in run_iter
run_train (ii, jdata, mdata)
File "/share/home/xlzou/.local/lib/python3.9/site-packages/dpgen/generator/run.py", line 610, in run_train
submission.run_submission()
File "/share/home/xlzou/.local/lib/python3.9/site-packages/dpdispatcher/submission.py", line 208, in run_submission
self.handle_unexpected_submission_state()
File "/share/home/xlzou/.local/lib/python3.9/site-packages/dpdispatcher/submission.py", line 244, in handle_unexpected_submission_state
raise RuntimeError(
RuntimeError: Meet errors will handle unexpected submission state.
Debug information: remote_root==/share/home/xlzou/WORKSPACE/zjl9/DPGEN/work/04d8a682beb8bfafb345306a91319a282951fae4.
Debug information: submission_hash==04d8a682beb8bfafb345306a91319a282951fae4.
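For reference, the exact command that failed is recorded in the job detail above, so it can be re-run by hand from the remote task directory to see the real error (a sketch; it assumes the directory from the debug output still exists and that the same deepmd.sh environment is sourced):

cd /share/home/xlzou/WORKSPACE/zjl9/DPGEN/work/04d8a682beb8bfafb345306a91319a282951fae4/002
source ~/WORKSPACE/zjl9/DPGEN/deepmd.sh   # same source_list entry the job script uses
/bin/sh -c '{ if [ ! -f model.ckpt.index ]; then dp train input.json; else dp train input.json --restart model.ckpt; fi }' && dp freeze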
This is my first time submitting a dpgen job to a Slurm queue; here is my machine.json:
{
"api_version": "1.0",
"train": [
{
"command": "dp",
"machine": {
"context_type": "local",
"batch_type": "Slurm",
"local_root": "./",
"remote_root": "/share/home/xlzou/WORKSPACE/zjl9/DPGEN/work"
},
"resources": {
"number_node": 1,
"cpu_per_node": 1,
"gpu_per_node": 1,
"queue_name": "gpu",
"custom_flags": ["#SBATCH -o output.%j", "#SBATCH -e err -o out"],
"group_size": 1,
"source_list": ["
/WORKSPACE/zjl9/DPGEN/deepmd.sh"],/WORKSPACE/zjl9/DPGEN/deepmd.sh"],"module_list": [],
"time_limit": "23:0:0"
}
}
],
"model_devi": [
{
"command": "lmp",
"machine": {
"context_type": "local",
"batch_type": "Slurm",
"local_root": "./",
"remote_root": "/share/home/xlzou/WORKSPACE/zjl9/DPGEN/work"
},
"resources": {
"number_node": 1,
"cpu_per_node": 1,
"gpu_per_node": 1,
"queue_name": "gpu",
"exclude_list": [],
"group_size": 10,
"source_list": ["
"module_list": [],
"time_limit": "23:0:0"
}
}
],
"fp": [
{
"command": "mpirun /share/apps/vasp/ips2018/u1/5.4.4/vasp_std",
"machine": {
"context_type": "local",
"batch_type": "Slurm",
"local_root": "./",
"remote_root": "/share/home/xlzou/WORKSPACE/zjl9/DPGEN/work"
},
"resources": {
"number_node": 1,
"cpu_per_node": 1,
"queue_name": "gencpu",
"group_size": 3,
"source_list": [],
"module_list": ["vasp/ips2018/u1/5.4.4"],
"time_limit": "120:0:0"
}
}
]
}
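A quick way to sanity-check a hand-edited machine.json before launching dpgen (a sketch; it assumes the file sits in the current working directory) is to run it through Python's built-in JSON parser, which reports the first syntax error with its line and column:

python -m json.tool machine.json   # prints the parsed JSON on success, or the exact location of a syntax error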
Thanks for any help.
Attachment: nohup.md