Skip to content

Commit a7cb9d2

Browse files
committed
update readme
1 parent be5b267 commit a7cb9d2

File tree

10 files changed

+684
-1
lines changed

10 files changed

+684
-1
lines changed

P1B1/README.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,3 +62,12 @@ Epoch 1/2
6262
Epoch 2/2
6363
2400/2400 [==============================] - 8s - loss: 0.0376 - val_loss: 0.0377
6464
```
65+
66+
### Preliminary performance
67+
68+
Jonathan has done some network topology sweeps in LBANN. See the `contrib/`
69+
directory for details. Here's his plot on reconstruction error vs
70+
different network architectures.
71+
72+
![Autoencoder reconstruction error vs different network architectures](https://raw.githubusercontent.com/ECP-CANDLE/Benchmarks/master/P1B1/images/network_sweep_ex3.png)
73+
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
## Create 5-fold cross validation partitions
2+
## to support persistent/stable held out test data
3+
4+
5+
### change to local data location of GDC
6+
cd /p/lscratchf/allen99/anlftp/public/datasets/GDC/data_frames/BySite/partitions
7+
8+
## full run
9+
## create and enter a temporary working subdirectory
10+
## generate partitions, output will be:
11+
## gdc_rand5.train.fea.X, gdc_rand5.test.fea.X, gdc_rand5.train.lab.X, gdc_rand5.test.lab.X
12+
## for partition X, "fea" files store the data matrix, and "lab" retains labels
13+
python2.7 /p/lscratchf/allen99/lbexp/r_partition2.py --features ../X --labels ../y --partitions 5 --outname gdc_rand5
14+
15+
##
16+
## conduct a simple model topology parameter sweep
17+
##
18+
## change to working directory
19+
## run parameter sweep
20+
## output is the reconstruction error, stored in log files of the form: ae_log.Y.out.\*.hp.X
21+
## where Y is a unique process ID, \* is the cross-validation partition, and X is the hyperparameter-set index
22+
python lbannae_sweep.py 16 5 gdc_rand5 /p/lscratchf/allen99/anlftp/public/datasets/GDC/data_frames/BySite >& run_log.txt
23+
24+
## parse the LBANN output and generate a summary report of the reconstruction error
25+
## generate average reconstruction error per epoch, with standard deviation
26+
find . -name ae_log.20468.out.\*.hp.4 | python parse_lbann_ae.py e_log.20468.out.hp.4
27+
find . -name ae_log.20468.out.\*.hp.3 | python parse_lbann_ae.py e_log.20468.out.hp.3
28+
find . -name ae_log.20468.out.\*.hp.2 | python parse_lbann_ae.py e_log.20468.out.hp.2
29+
find . -name ae_log.20468.out.\*.hp.0 | python parse_lbann_ae.py e_log.20468.out.hp.0
30+
find . -name ae_log.20468.out.\*.hp.1 | python parse_lbann_ae.py e_log.20468.out.hp.1
31+
32+
## Merge files to do a direct compare and plot with matplotlib
33+
## plots reconstruction error (y-axis) over epoch (x-axis)
34+
python3 plot_recon_err.py "e_log.20468.out.hp.0,400x300x100 e_log.20468.out.hp.1,500x100 e_log.20468.out.hp.2,1000x500 e_log.20468.out.hp.3,1000x500x250x100" ex3.png
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
###
2+
###
3+
### Python script to launch a simple sweep of hyperparameters
4+
###
5+
###
6+
import os,sys
7+
from subprocess import call
8+
9+
## Submit one LBANN autoencoder batch job (via sbatch) for every combination of
## hyperparameter set and cross validation partition.
##
## usage: python lbannae_sweep.py <nodes> <partitions> <filebn> <ddir>

nodes = int(sys.argv[1])       ## number of compute nodes to use
partitions = int(sys.argv[2])  ## expected number of cross validation partitions
## base name of the train/test cross validation files
## (format is: "filebn".train.fea.X or "filebn".test.fea.X, where X is the partition index)
filebn = sys.argv[3]
## data location handed to the run script through -f.  The command-line value
## is honored; the original site path is only a fallback when no fourth
## argument is given (previously it silently overrode the argument).
ddir = sys.argv[4] if len(sys.argv) > 4 else "/p/lscratchf/allen99/anlftp/public/datasets/GDC/data_frames/BySite"

aecmd = "/p/lscratchf/allen99/lbexp/run_lbann_ae.sh"

## source code command line references
## LearnRateMethod = Input("--learning-rate-method", "1 - Adagrad, 2 - RMSprop, 3 - Adam", LearnRateMethod);
## ActivationType = static_cast<activation_type>(Input("--activation-type", "1 - Sigmoid, 2 - Tanh, 3 - reLU, 4 - id", static_cast<int>(ActivationType)));

## -f -> data location
## -e -> epoch
## -b -> mini-batch
## -a -> activation type
## -r -> learning rate
## -j -> learning rate decay
## -k -> fraction of training data to use for training
## -g -> dropout rate
## -q -> learning rate method
## -n -> network topology : specify number of nodes in each hidden layer

## options shared by every run
## NOTE(review): -g is given twice here (0.1, then -1) and -n is given again
## per topology below; presumably the run script keeps the last occurrence -- confirm.
common_params = "-e 100 -g 0.1 -b 50 -a 3 -r 0.0001 -j 0.5 -g -1 -q 1 -n 100 -k 0.75"

## hidden-layer topologies to sweep; the list index becomes the ".hp.N" suffix
## of the output log name
topologies = [
    "400,300,100",
    "500,100",
    "1000,500",
    "1000,500,250,100",
    "100",
]
param_lst = [common_params + " -n " + topo + " -f " + ddir for topo in topologies]

## the process ID of this launcher tags all logs of one sweep
sweep_id = str(os.getpid())

for hpi, params in enumerate(param_lst):
    for parti in range(partitions):
        tr_file = filebn + ".train.fea." + str(parti)
        ts_file = filebn + ".test.fea." + str(parti)
        out_name = "ae_log." + sweep_id + ".out." + str(parti) + ".hp." + str(hpi)
        run_cmd = ("sbatch -N" + str(nodes) + " -t 1440 --clear-ssd --msr-safe --output=" + out_name
                   + " " + aecmd + " -x " + tr_file + " -y " + ts_file + " " + params)
        print(run_cmd)
        status = call(run_cmd, shell=True)
        if status != 0:
            ## keep submitting the remaining jobs, but do not hide the failure
            sys.stderr.write("warning: sbatch exited with status %d for %s\n" % (status, out_name))
56+
57+
#sbatch -N$nodes -t 1440 --clear-ssd --msr-safe --output="slurm-lbann-nci_ae_tst-%j.v2.out" $bdir/run_lbann_ae.sh -x gdc_rand5.train.fea.0 -y gdc_rand5.test.fea.0 $params
51.3 KB
Loading
Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
#######################################################################################
2+
## Parse reconstruction error from LBANN output
3+
##
4+
## program is designed to take X LBANN output files from X-fold cross validation
5+
##
6+
## and compute the average reconstruction error for each Epoch with standard deviation
7+
##
8+
#######################################################################################
9+
import sys,math
10+
11+
def _getArgValue(line, kw):
    """Locate the space-delimited token that follows `kw` in `line`.

    Returns a tuple (kw_index, end_index, token), or None when `kw` does not
    occur.  When no space follows the token, it extends to the end of the
    line, so a value that is the last thing on the line is not truncated.
    """
    sidx = line.find(kw)
    if sidx == -1:
        return None
    start = sidx + len(kw)
    eidx = line.find(" ", start)
    if eidx == -1:
        eidx = len(line)
    return sidx, eidx, line[start:eidx]


def getParams(fn):
    """Read the network depth and epoch count from an LBANN output file.

    Scans `fn` line by line for "--network <a,b,...>" and
    "--num-epochs <n>".  Scanning stops at the first line containing
    "--num-epochs "; a "--network " option on that line or an earlier one
    supplies the layer count.

    Args:
        fn: path of the LBANN output file.

    Returns:
        (nlayers, epochs): number of comma-separated entries in the
        --network value, and the integer --num-epochs value.

    Raises:
        AssertionError: if either option was not found.
    """
    epochs, nlayers = -1, -1
    with open(fn) as fh:
        for line in fh:
            line = line.rstrip()

            found = _getArgValue(line, "--network ")
            if found is not None:
                nlayers = len(found[2].split(','))

            found = _getArgValue(line, "--num-epochs ")
            if found is not None:
                sidx, eidx, nstr = found
                epochs = int(nstr)
                ## same diagnostic line as before, in a form valid on Python 2 and 3
                print("chk %s %s %s %s" % (sidx, eidx, nstr, epochs))
                break

    ## raised explicitly (same exception type as the former assert) so the
    ## check is not stripped under "python -O"
    if nlayers == -1 or epochs == -1:
        raise AssertionError("--network / --num-epochs not found in " + fn)
    return nlayers, epochs
36+
37+
def getCost(fn, num_layers):
    """Collect the test reconstruction cost per epoch from an LBANN output file.

    A line containing "Phase [<num_layers-1>] Epoch [<e>]" sets the current
    epoch.  A line containing "Testing model 0 average reconstruction cost: <c>"
    records <c> for the current epoch.  Only the first cost seen for an epoch
    is kept, and cost lines that appear before any matching epoch line are
    ignored.

    Args:
        fn: path of the LBANN output file.
        num_layers: layer count; only "Phase [num_layers-1]" lines are used.

    Returns:
        dict mapping epoch number (int) to reconstruction cost (float).

    Raises:
        AssertionError: if no cost value could be extracted.
    """
    ## the search keys do not depend on the line, so build them once
    epoch_kw = "Phase [" + str(num_layers - 1) + "] Epoch ["
    cost_kw = "Testing model 0 average reconstruction cost: "

    cost_val = {}
    epoch_val = -1
    with open(fn) as fh:
        for line in fh:
            line = line.rstrip()

            sidx = line.find(epoch_kw)
            if sidx != -1:
                start = sidx + len(epoch_kw)
                epoch_val = int(line[start:line.find("]", start)])

            sidx = line.find(cost_kw)
            if sidx != -1 and epoch_val != -1:
                recon_val = float(line[sidx + len(cost_kw):])
                ## setdefault keeps the first value recorded for an epoch
                cost_val.setdefault(epoch_val, recon_val)

    ## raised explicitly (same exception type as the former assert) so the
    ## check is not stripped under "python -O"
    if not cost_val:
        raise AssertionError("no reconstruction cost found in " + fn)
    return cost_val
63+
64+
65+
## Main driver: read LBANN output file names (one per line) from stdin,
## extract the per-epoch reconstruction cost of each file, and write
## "<epoch> <mean> <standard deviation>" lines to the file named by argv[1].

## blank lines (e.g. a trailing newline in the piped list) are not file names
flst = [name for name in (raw.rstrip() for raw in sys.stdin) if name]

cost_save = []
max_epoch = -1
for fn in flst:
    num_layers, epochs = getParams(fn)
    cost_save.append(getCost(fn, num_layers))
    max_epoch = max(max_epoch, epochs)

oname = sys.argv[1]
with open(oname, "w") as oh:
    for epi in range(max_epoch):
        ## costs reported for this epoch, in input-file order
        samples = [cost[epi] for cost in cost_save if epi in cost]
        cnt = len(samples)
        if cnt == 0:
            ## no file reported this epoch (e.g. truncated runs): skip it
            ## instead of dividing by zero
            continue
        avg = sum(samples) / cnt
        ## sample standard deviation (n-1 denominator); 0 for a single sample
        stdev = 0
        if cnt > 1:
            stdev = math.sqrt(sum((v - avg) * (v - avg) for v in samples) / (cnt - 1))

        oh.write(str(epi) + " " + str(avg) + " " + str(stdev) + "\n")
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
###
2+
### Simple matplotlib routine to plot reconstruction error (y-axis) with standard deviation over epochs (x-axis)
3+
###
4+
###
5+
### example usage for 3 different autoencoder models:
6+
### python3 ../plot_recon_err.py "e_log.20468.out.hp.0,400x300x100 e_log.20468.out.hp.1,500x100 e_log.20468.out.hp.2,1000x500" ex1.png
7+
import sys
8+
import matplotlib.pyplot as plt
9+
10+
11+
## argv[1]: whitespace-separated list of "<result file>,<legend label>" entries
## argv[2]: name of the image file to write
flst = sys.argv[1]
output = sys.argv[2]
## split() with no argument also copes with a single entry and repeated spaces
files = flst.split()

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
lstyle = 'dotted'
cval = ["red", "blue", "green", "purple", "orange", "black"]

for cnt, file_all in enumerate(files):
    ## split on the first comma only, so the label itself may contain commas
    fname, desc = file_all.split(',', 1)
    xval, yval, std = [], [], []
    with open(fname) as fh:
        for line in fh:
            vals = line.split()
            if not vals:
                continue  ## ignore blank lines
            ## each line is: <epoch> <mean reconstruction error> <standard deviation>
            xval.append(int(vals[0]))
            yval.append(float(vals[1]))
            std.append(float(vals[2]))

    ## reuse colors cyclically when there are more series than colors
    ax.errorbar(xval, yval, yerr=std, ls=lstyle, color=cval[cnt % len(cval)], label=desc)

plt.xlabel('Epoch')
plt.ylabel('Reconstruction Error')
plt.legend()
plt.savefig(output, dpi=100)
plt.show()

0 commit comments

Comments
 (0)