Replies: 1 comment
I found a solution: submitting the training step as a separate job on Slurm doesn't work; it needs to be run locally.
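For anyone who hits the same loop, here is a minimal sketch of what the train section of machine.json could look like with training run locally instead of submitted to Slurm. This assumes DPDispatcher's Shell batch type; the paths are placeholders, not a tested setup:
"train": [
    {
        "command": "dp",
        "machine": {
            "batch_type": "Shell",
            "context_type": "local",
            "local_root": "./",
            "remote_root": "/path/to/local/workdir"
        },
        "resources": {
            "_comment": "placeholder sketch; Shell runs the command on the local node",
            "number_node": 1,
            "cpu_per_node": 16,
            "group_size": 1,
            "source_list": ["/path/to/deepmd.env"]
        }
    }
]
With a Shell machine there is no scheduler involved, so the queue_name and custom_flags fields from the Slurm version are not needed.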
Hi,
I'm trying to run the DP-Gen tutorial, using the command:
$ dpgen run param.json machine.json
It successfully creates the 4 force-field models, but it never moves on to the exploration step: after finishing the training it loops back and trains them again, submitting hundreds of jobs to Slurm until I stop the program manually.
Any insight? Here is my machine.json:
{
"api_version": "1.0",
"deepmd_version": "2.1.1",
"train" :[
{
"command": "dp",
"machine": {
"batch_type": "Slurm",
"context_type": "local",
"local_root" : "./",
"remote_root": "/work/07693/echen/ls6/dpgen_example/worktest"
},
"resources": {
"number_node": 1,
"cpu_per_node": 16,
"group_size": 1,
"custom_flags" : ["#SBATCH --time=01:00:00"],
"queue_name": "development",
"source_list": ["/work/07693/echen/ls6/dpgen_example/work2/deepmd.env"]
}
}
],
"model_devi":[
{
"command": "lmp",
"machine": {
"batch_type": "Slurm",
"context_type": "local",
"local_root" : "./",
"remote_root": "/work/07693/echen/ls6/dpgen_example/worktest"
},
"resources": {
"number_node": 1,
"cpu_per_node": 4,
"queue_name": "vm-small",
"custom_flags" : ["#SBATCH --time=01:00:00"],
"group_size": 10,
"source_list": ["/work/07693/echen/ls6/dpgen_example/work2/lammps.env"]
}
}
],
"fp":[
{
"command": "mpirun -n 32 vasp_std",
"machine": {
"batch_type": "Slurm",
"context_type": "local",
"local_root" : "./",
"remote_root": "/work/07693/echen/ls6/dpgen_example/worktest"
},
"resources": {
"number_node": 1,
"cpu_per_node": 32,
"queue_name": "normal",
"custom_flags" : ["#SBATCH --time=01:00:00"],
"group_size": 4,
"source_list": ["/work/07693/echen/ls6/dpgen_example/work2/vasp.env"]
}
}
]
}
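For reference, while debugging I've been clearing the stray submissions with the usual Slurm commands (nothing DP-Gen-specific):
$ squeue -u $USER    # list my queued/running jobs
$ scancel -u $USER   # cancel all of my jobs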
Here is the param.json file:
{
"type_map": ["H","C"],
"mass_map": [1,12],
"init_data_prefix": "../",
"init_data_sys": ["init/CH4.POSCAR.01x01x01/02.md/sys-0004-0001/deepmd"],
"sys_configs_prefix": "../",
"sys_configs": [
["init/CH4.POSCAR.01x01x01/01.scale_pert/sys-0004-0001/scale-1.000/00000*/POSCAR"],
["init/CH4.POSCAR.01x01x01/01.scale_pert/sys-0004-0001/scale-1.000/00001*/POSCAR"]
],
"_comment": " that's all ",
"numb_models": 4,
"default_training_param": {
"model": {
"type_map": ["H","C"],
"descriptor": {
"type": "se_a",
"sel": [16,4],
"rcut_smth": 0.5,
"rcut": 5.0,
"neuron": [120,120,120],
"resnet_dt": true,
"axis_neuron": 12,
"seed": 1
},
"fitting_net": {
"neuron": [25,50,100],
"resnet_dt": false,
"seed": 1
}
},
"learning_rate": {
"type": "exp",
"start_lr": 0.001,
"decay_steps": 5000
},
"loss": {
"start_pref_e": 0.02,
"limit_pref_e": 2,
"start_pref_f": 1000,
"limit_pref_f": 1,
"start_pref_v": 0.0,
"limit_pref_v": 0.0
},
"training": {
"stop_batch": 2000,
"disp_file": "lcurve.out",
"disp_freq": 1000,
"numb_test": 4,
"save_freq": 1000,
"save_ckpt": "model.ckpt",
"disp_training": true,
"time_training": true,
"profiling": false,
"profiling_file": "timeline.json",
"_comment": "that's all"
}
},
"model_devi_dt": 0.002,
"model_devi_skip": 0,
"model_devi_f_trust_lo": 0.05,
"model_devi_f_trust_hi": 0.15,
"model_devi_e_trust_lo": 10000000000.0,
"model_devi_e_trust_hi": 10000000000.0,
"model_devi_clean_traj": true,
"model_devi_jobs": [
{"sys_idx": [0],"temps": [100],"press": [1.0],"trj_freq": 10,"nsteps": 300,"ensemble": "nvt","_idx": "00"},
{"sys_idx": [1],"temps": [100],"press": [1.0],"trj_freq": 10,"nsteps": 3000,"ensemble": "nvt","_idx": "01"}
],
"fp_style": "vasp",
"shuffle_poscar": false,
"fp_task_max": 20,
"fp_task_min": 5,
"fp_pp_path": "./",
"fp_pp_files": ["POTCAR_H","POTCAR_C"],
"fp_incar": "./INCAR_methane"
}
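For what it's worth, both files parse cleanly as JSON (checked with Python's built-in json.tool), so a plain syntax error shouldn't be the cause:
$ python -m json.tool machine.json
$ python -m json.tool param.json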