Skip to content

Commit 411daee

Browse files
committed
added submission script for horovod implemented st1
1 parent ba993e1 commit 411daee

File tree

1 file changed

+32
-0
lines changed

1 file changed

+32
-0
lines changed

Pilot1/ST1/sub_hvd.sh

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
#!/bin/bash
#
# Submission script for the ST1 training run (run.py).
#
# Loads the conda module, activates the default conda environment, and then
# launches run.py in one of two ways depending on HVDSWITCH:
#   HVDSWITCH=False  -> plain `python run.py ...`
#   anything else    -> `mpiexec ... python run.py ...`, with NP passed to
#                       --np, PPN passed to -ppn, and stdout written to $OUT.

module load conda/2022-09-08
conda activate

# Location and names of the training / validation input files.
DATA_PATH=/grand/datascience/avasan/ST_Benchmarks/Data/1M-flatten
TFIL=ml.3CLPro_7BQY_A_1_F.Orderable_zinc_db_enaHLL.sorted.4col.dd.parquet.xform-smiles.csv.reg.train
VFIL=ml.3CLPro_7BQY_A_1_F.Orderable_zinc_db_enaHLL.sorted.4col.dd.parquet.xform-smiles.csv.reg.val

# Values forwarded verbatim to the matching run.py command-line flags below.
EP=400
NUMHEAD=16
DR_TB=0.1
DR_ff=0.1
ACT=elu
DROP=False
LR=0.0000025
LOSS=mean_squared_error
HVDSWITCH=True

# Arguments shared by both launch modes. Held in an array so the two branches
# cannot drift apart and every value is passed as exactly one word.
run_args=(
  --in_train "${DATA_PATH}/${TFIL}"
  --in_vali "${DATA_PATH}/${VFIL}"
  --ep "$EP"
  --num_heads "$NUMHEAD"
  --DR_TB "$DR_TB"
  --DR_ff "$DR_ff"
  --activation "$ACT"
  --drop_post_MHA "$DROP"
  --lr "$LR"
  --loss_fn "$LOSS"
  --hvd_switch "$HVDSWITCH"
)

# The test brackets need surrounding whitespace: `[$HVDSWITCH` is parsed as a
# command named e.g. `[True`, which does not exist, so the condition always
# failed and the mpiexec branch ran even with HVDSWITCH=False.
if [[ "$HVDSWITCH" == "False" ]]; then
  python run.py "${run_args[@]}"
else
  NP=8
  PPN=4
  OUT=logfile.log
  # NOTE(review): the cpu-bind list names 8 entries while PPN is 4 — confirm
  # this is the intended binding.
  # Only stdout is redirected to $OUT; stderr still goes to the caller.
  mpiexec --np "$NP" -ppn "$PPN" --cpu-bind verbose,list:0,1,2,3,4,5,6,7 \
    -env NCCL_COLLNET_ENABLE=1 -env NCCL_NET_GDR_LEVEL=PHB \
    python run.py "${run_args[@]}" > "$OUT"
fi
32+

0 commit comments

Comments
 (0)