ECP-CANDLE
diff --git a/‎P1B1/README.md‎
Lines changed: 9 additions & 0 deletions b/‎P1B1/README.md‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎P1B1/contrib/Jonathan_Allen/README.txt‎
Lines changed: 34 additions & 0 deletions b/‎P1B1/contrib/Jonathan_Allen/README.txt‎
Lines changed: 34 additions & 0 deletions
diff --git a/‎P1B1/contrib/Jonathan_Allen/lbannae_sweep.py‎
Lines changed: 57 additions & 0 deletions b/‎P1B1/contrib/Jonathan_Allen/lbannae_sweep.py‎
Lines changed: 57 additions & 0 deletions
diff --git a/‎P1B1/contrib/Jonathan_Allen/network_sweep_ex3.png‎
51.3 KB b/‎P1B1/contrib/Jonathan_Allen/network_sweep_ex3.png‎
51.3 KB
diff --git a/‎P1B1/contrib/Jonathan_Allen/parse_lbann_ae.py‎
Lines changed: 98 additions & 0 deletions b/‎P1B1/contrib/Jonathan_Allen/parse_lbann_ae.py‎
Lines changed: 98 additions & 0 deletions
diff --git a/‎P1B1/contrib/Jonathan_Allen/plot_recon_err.py‎
Lines changed: 42 additions & 0 deletions b/‎P1B1/contrib/Jonathan_Allen/plot_recon_err.py‎
Lines changed: 42 additions & 0 deletions
@@ -62,3 +62,12 @@ Epoch 1/2
 Epoch 2/2
 2400/2400 [==============================] - 8s - loss: 0.0376 - val_loss: 0.0377
 ```
+
+### Preliminary performance
+
+Jonathan has done some network topology sweeps in LBANN. See the
+`contrib/Jonathan_Allen/` directory for details. Here's his plot of
+reconstruction error for different network architectures.
+
+![Autoencoder reconstruction error vs different network architectures](https://raw.githubusercontent.com/ECP-CANDLE/Benchmarks/master/P1B1/images/network_sweep_ex3.png)
+
@@ -0,0 +1,34 @@
+## Create 5-fold cross validation partitions
+## to support persistent/stable held-out test data
+
+
+### change to local data location of GDC
+cd /p/lscratchf/allen99/anlftp/public/datasets/GDC/data_frames/BySite/partitions
+
+## full run
+## create and enter a temporary working subdirectory
+## generate partitions, output will be: 
+## gdc_rand5.train.fea.X, gdc_rand5.test.fea.X, gdc_rand5.train.lab.X, gdc_rand5.test.lab.X
+## for partition X, "fea" files store the data matrix, and "lab" retains labels 
+python2.7 /p/lscratchf/allen99/lbexp/r_partition2.py --features ../X  --labels ../y --partitions 5 --outname gdc_rand5
+
+## 
+## conduct a simple model topology parameter sweep
+##
+## change to working directory
+## run parameter sweep
+## output is the reconstruction error, stored in log files of the form: ae_log.Y.out.P.hp.X
+## where Y is the process ID of the sweep script, P is the cross-validation partition,
+## and X is the index of the hyper-parameter set
+python lbannae_sweep.py 16 5 gdc_rand5 /p/lscratchf/allen99/anlftp/public/datasets/GDC/data_frames/BySite >& run_log.txt
+
+## parse the LBANN output and write a summary report:
+## average reconstruction error per epoch, with standard deviation
+find . -name ae_log.20468.out.\*.hp.4 | python parse_lbann_ae.py e_log.20468.out.hp.4
+find . -name ae_log.20468.out.\*.hp.3 | python parse_lbann_ae.py e_log.20468.out.hp.3
+find . -name ae_log.20468.out.\*.hp.2 | python parse_lbann_ae.py e_log.20468.out.hp.2
+find . -name ae_log.20468.out.\*.hp.0 | python parse_lbann_ae.py e_log.20468.out.hp.0
+find . -name ae_log.20468.out.\*.hp.1 | python parse_lbann_ae.py e_log.20468.out.hp.1
+
+## Merge files to do a direct compare and plot with matplotlib
+## plots reconstruction error (y-axis) over epoch (x-axis)
+python3 plot_recon_err.py "e_log.20468.out.hp.0,400x300x100 e_log.20468.out.hp.1,500x100 e_log.20468.out.hp.2,1000x500 e_log.20468.out.hp.3,1000x500x250x100" ex3.png
@@ -0,0 +1,57 @@
+###
+###
+###  Python script to launch a simple sweep of hyper parameters
+###
+###
+import os,sys
+from subprocess import call
+
+nodes=int(sys.argv[1])   ## number of compute nodes to use
+partitions=int(sys.argv[2])  ## expected number of cross validation partitions
+## name of the train/testing cross validation files (format is: "filebn".train.fea.X or "filebn".test.fea.X)
+## where 
+filebn=sys.argv[3] 
+ddir=sys.argv[4]
+
+## original
+ddir="/p/lscratchf/allen99/anlftp/public/datasets/GDC/data_frames/BySite"
+
+aecmd="/p/lscratchf/allen99/lbexp/run_lbann_ae.sh"
+
+##source code command line references
+##LearnRateMethod = Input("--learning-rate-method", "1 - Adagrad, 2 - RMSprop, 3 - Adam", LearnRateMethod);
+##ActivationType = static_cast<activation_type>(Input("--activation-type", "1 - Sigmoid, 2 - Tanh, 3 - reLU, 4 - id", static_cast<int>(ActivationType)));
+
+## -f -> data location
+## -e -> epoch
+## -b -> mini-match
+## -a -> activiation type 
+## -r -> learning rate
+## -j -> learning rate decay
+## -k -> fraction of training data to use for training
+## -g -> dropout rate
+## -q -> learning rate method
+## -n -> network topology : specify number of nodes in each hidden layer
+## original parameters
+param_lst=[]
+params="-e 100 -g 0.1 -b 50 -a 3 -r 0.0001 -j 0.5 -g -1 -q 1 -n 100 -k 0.75 -n 400,300,100" + " -f "+ddir
+param_lst.append(params)
+params="-e 100 -g 0.1 -b 50 -a 3 -r 0.0001 -j 0.5 -g -1 -q 1 -n 100 -k 0.75 -n 500,100" + " -f "+ddir
+param_lst.append(params)
+params="-e 100 -g 0.1 -b 50 -a 3 -r 0.0001 -j 0.5 -g -1 -q 1 -n 100 -k 0.75 -n 1000,500" + " -f "+ddir
+param_lst.append(params)
+params="-e 100 -g 0.1 -b 50 -a 3 -r 0.0001 -j 0.5 -g -1 -q 1 -n 100 -k 0.75 -n 1000,500,250,100" + " -f "+ddir
+param_lst.append(params)
+params="-e 100 -g 0.1 -b 50 -a 3 -r 0.0001 -j 0.5 -g -1 -q 1 -n 100 -k 0.75 -n 100" + " -f "+ddir
+param_lst.append(params)
+
+for hpi in range(len(param_lst)) :
+   for parti in range(partitions) :
+      tr_file=filebn+".train.fea."+str(parti)
+      ts_file=filebn+".test.fea."+str(parti)
+      out_name="ae_log."+str(os.getpid())+".out."+str(parti) + ".hp."+str(hpi)
+      run_cmd="sbatch -N"+str(nodes) + " -t 1440 --clear-ssd --msr-safe --output="+out_name+" "+aecmd+" -x "+tr_file+" -y " +ts_file+" "+param_lst[hpi]
+      print run_cmd
+      call(run_cmd,shell=True)
+
+#sbatch -N$nodes  -t 1440 --clear-ssd --msr-safe --output="slurm-lbann-nci_ae_tst-%j.v2.out" $bdir/run_lbann_ae.sh -x gdc_rand5.train.fea.0 -y gdc_rand5.test.fea.0 $params
@@ -0,0 +1,98 @@
+#######################################################################################
+## Parse reconstruction error from LBANN output
+##
+## program is designed to take X LBANN output files from X-fold cross validation
+##
+## and compute the average reconstruction error for each Epoch with standard deviation
+##
+#######################################################################################
+import sys,math
+
+def getParams(fn) :
+   epochs,nlayers=-1,-1
+   fh = open(fn)
+   for line in fh :
+      line=line.rstrip()
+      kw="--network "
+      kw_len = len(kw)
+      sidx = line.find(kw)
+      if sidx != -1 :
+         eidx = line.find(" ",sidx+kw_len)
+         nstr=line[sidx+kw_len:eidx]
+         vals=nstr.split(',')
+         nlayers=len(vals)
+      kw="--num-epochs "
+      kw_len = len(kw)
+      sidx = line.find(kw)
+      if sidx != -1 :
+         eidx = line.find(" ",sidx+kw_len)
+         nstr=line[sidx+kw_len:eidx]
+         epochs=int(nstr)
+         print "chk",sidx,eidx,nstr,epochs
+         break
+   
+   assert nlayers != -1 and epochs != -1
+   return nlayers,epochs
+
+def getCost(fn,num_layers) :
+   cost_val={}
+   fh = open(fn)
+   epoch_val=-1
+   for line in fh :
+      line=line.rstrip()
+      kw="Phase [" + str(num_layers-1)+"] Epoch ["
+      kw_len = len(kw)
+      sidx = line.find(kw)
+      if sidx != -1 :
+         eidx = line.find("]",sidx+kw_len)
+         nstr=line[sidx+kw_len:eidx]
+         epoch_val=int(nstr)
+         #print "save e",epoch_val
+      kw="Testing model 0 average reconstruction cost: "
+      kw_len = len(kw)
+      sidx = line.find(kw)
+      if sidx != -1 and epoch_val != -1 :
+         nstr=line[sidx+kw_len:]
+         recon_val=float(nstr)
+         assert epoch_val != -1 
+         #print "save",epoch_val,recon_val
+         cost_val.setdefault(epoch_val,recon_val)
+
+   assert cost_val != {}
+   return cost_val
+
+
+flst=[]
+for fn in sys.stdin :
+   fn=fn.rstrip()
+   flst.append(fn)
+
+
+cost_save=[]
+max_epoch=-1
+for fn in flst :
+   num_layers,epochs=getParams(fn)
+   cost = getCost(fn,num_layers)
+   cost_save.append(cost)
+   if epochs > max_epoch :
+      max_epoch = epochs
+   
+oname=sys.argv[1]
+oh=open(oname,"w")
+for epi in range(max_epoch) :
+   cnt,sval=0,0
+   for val in cost_save :
+      if val.has_key(epi) :
+         sval += val[epi]
+         cnt += 1
+   avg = sval / cnt
+   sval = 0
+   for val in cost_save :
+      if val.has_key(epi) :
+         sval += (val[epi]-avg)*(val[epi]-avg)
+   stdev=0
+   if cnt > 1 :
+      stdev = math.sqrt(sval / (cnt-1))    
+
+   res = str(epi)+" "+str(avg)+" "+str(stdev)
+   oh.write(res + "\n")
@@ -0,0 +1,42 @@
+###
+### Simple matplotlib routine to plot reconstruction error (y-axis) with standard deviation over epochs (x-axis) 
+### 
+###
+### example usage for 3 different autoencoder models:
+### python3 ../plot_recon_err.py "e_log.20468.out.hp.0,400x300x100 e_log.20468.out.hp.1,500x100 e_log.20468.out.hp.2,1000x500" ex1.png
+import sys
+import matplotlib.pyplot as plt
+
+
+flst=sys.argv[1]
+output=sys.argv[2]
+files=[]
+if flst.find(" ") != -1 :
+      files=flst.split(" ")
+else :
+      files.append(flst)
+
+fig=plt.figure()
+ax=fig.add_subplot(1,1,1)
+lstyle='dotted'
+cval=["red","blue","green","purple","orange","black"]
+cnt=0
+for file_all in files :
+   file,desc=file_all.split(',')
+   fh=open(file)
+   xval,yval,std=[],[],[]
+   for line in fh :
+      line=line.rstrip()
+      vals=line.split(" ")
+      xval.append(int(vals[0]))
+      yval.append(float(vals[1]))
+      std.append(float(vals[2]))
+
+   plt.errorbar(xval,yval,yerr=std,ls=lstyle,color=cval[cnt],label=desc)
+   cnt+=1
+
+plt.xlabel('Epoch')
+plt.ylabel('Reconstruction Error')
+plt.legend()
+plt.savefig(output,dpi=100)
+plt.show()