Commit 187f54a

Merge: 2 parents 8ab983f + 6cd21ce

38 files changed: +302 −217 lines

workflows/nt3_mlrMBO/README.md

Lines changed: 52 additions & 6 deletions
@@ -24,10 +24,10 @@ the Swift script to launch a NT3 run, and to
 
 For each run of the benchmark model, the following is produced:
 
-* `run.json` - a json file containing data describing the individual run: the
+* `run.[run_id].json` - a json file containing data describing the individual run: the
 parameters for that run and per epoch details such as the validation loss. This
 file will be written to the output directory for that particular run (e.g.)
-`nt3_mlrMBO/experiments/E1/run_1_1_0/output/run.json`.
+`nt3_mlrMBO/experiments/E1/run_1_1_0/output/run.1.1.0.json`.
 
 
 ## User requirements ##
@@ -286,22 +286,68 @@ cd Supervisor/workflows/nt3_mlrMBO/ext/EQ-R/eqr
 
 Launching the workflow:
 
-Edit
-`cori_workflow3.sh` setting the relevant variables as appropriate. All easily
+1. Make a copy of `cori_workflow3.sh`
+2. Edit the copy, setting the relevant variables there
+as appropriate. All easily
 changed settings are delineated by the `USER SETTINGS START` and `USER SETTINGS END`
 markers. Note that these variables can be easily overwritten from the calling
 environment (use `export` in your shell). By default these are set up for short-ish
 debugging runs and will need to be changed for a production run.
+3. `source cori_settings.sh`
+4. Run your workflow script, passing an experiment ID.
 
 An example:
 
 ```
 cd Supervisor/workflows/nt3_mlrMBO/swift
+cp cori_workflow3.sh my_cori_workflow.sh
+# edit my_cori_workflow.sh
 source cori_settings.sh
-./cori_workflow.sh T1
+./my_cori_workflow.sh T1
 ```
 where T1 is the experiment ID.
 
 ### Running on Theta ###
 
-TODO
+* Download and install the user requirements listed at the top of this
+document.
+
+All the system requirements (see above) have been installed on Theta except
+for the EQ/R swift extension.
+
+* Compile the EQ/R swift-t extension:
+```
+cd Supervisor/workflows/nt3_mlrMBO/ext/EQ-R/eqr
+./bootstrap
+source ./theta_build_settings.sh
+./configure
+make install
+```
+
+Launching the workflow:
+
+1. Make a copy of `theta_workflow.sh`
+2. Edit the copy, setting the relevant variables there
+as appropriate. All easily
+changed settings are delineated by the `USER SETTINGS START` and `USER SETTINGS END`
+markers. Note that these variables can be easily overwritten from the calling
+environment (use `export` in your shell). By default these are set up for short-ish
+debugging runs and will need to be changed for a production run.
+3. Run your workflow script, passing an experiment ID.
+
+An example:
+
+```
+cd Supervisor/workflows/nt3_mlrMBO/swift
+cp theta_workflow.sh my_theta_workflow.sh
+# edit my_theta_workflow.sh if necessary
+./theta_workflow.sh T1
+```
+
+where T1 is the experiment ID.
+
+Note that Theta uses the _ai_ version of the workflow. The benchmark is launched
+using Supervisor/workflows/nt3_mlrMBO/scripts/theta_run_model.sh. In there, the
+`PYTHONHOME` shell variable can be changed to specify a different python installation to
+run the model with. If you do change the python installation, the python
+system requirements mentioned above will need to be satisfied.
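The `run.[run_id].json` files described in the README changes above can be inspected after a run. A minimal sketch, assuming the file holds a parameter map plus a per-epoch list containing the validation loss — the field names here (`parameters`, `epochs`, `val_loss`) are assumptions, not confirmed by the source:

```python
import json

def summarize_run(path):
    # Load one run's JSON record and report its parameters and best
    # validation loss. The "parameters"/"epochs"/"val_loss" keys are
    # guesses based on the README's description of the file contents.
    with open(path) as f:
        run = json.load(f)
    losses = [e["val_loss"] for e in run.get("epochs", [])]
    return run.get("parameters", {}), (min(losses) if losses else None)

# Hypothetical usage:
# params, best = summarize_run("experiments/E1/run_1_1_0/output/run.1.1.0.json")
```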

workflows/nt3_mlrMBO/swift/ai_workflow.sh

Lines changed: 2 additions & 1 deletion
@@ -57,7 +57,8 @@ fi
 #export TURBINE_LOG=1 TURBINE_DEBUG=1 ADLB_DEBUG=1
 
 export EXPID=$1
-export TURBINE_OUTPUT=$EMEWS_PROJECT_ROOT/experiments/$EXPID
+export TURBINE_OUTPUT_ROOT=${TURBINE_OUTPUT_ROOT:-$EMEWS_PROJECT_ROOT/experiments}
+export TURBINE_OUTPUT=$TURBINE_OUTPUT_ROOT/$EXPID
 check_directory_exists
 
 export TURBINE_JOBNAME="${EXPID}_job"
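The `${TURBINE_OUTPUT_ROOT:-$EMEWS_PROJECT_ROOT/experiments}` idiom introduced here uses shell default-value expansion: a value exported by the caller wins, otherwise the project's `experiments` directory is used. A minimal Python sketch of the same resolution logic (the function name is illustrative, not from the source):

```python
import os

def resolve_turbine_output(exp_id, project_root):
    # Mirrors `${TURBINE_OUTPUT_ROOT:-$EMEWS_PROJECT_ROOT/experiments}`:
    # a caller-exported TURBINE_OUTPUT_ROOT wins; otherwise fall back
    # to the project's experiments directory. Using `or` matches the
    # shell's `:-`, which also substitutes when the variable is set
    # but empty.
    root = os.environ.get("TURBINE_OUTPUT_ROOT") or \
        os.path.join(project_root, "experiments")
    return os.path.join(root, exp_id)

# Hypothetical usage:
# resolve_turbine_output("T1", "/home/user/nt3_mlrMBO")
```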

workflows/nt3_mlrMBO/swift/cori_workflow.sh

Lines changed: 2 additions & 1 deletion
@@ -55,7 +55,8 @@ fi
 #export TURBINE_LOG=1 TURBINE_DEBUG=1 ADLB_DEBUG=1
 
 export EXPID=$1
-export TURBINE_OUTPUT=$EMEWS_PROJECT_ROOT/experiments/$EXPID
+export TURBINE_OUTPUT_ROOT=${TURBINE_OUTPUT_ROOT:-$EMEWS_PROJECT_ROOT/experiments}
+export TURBINE_OUTPUT=$TURBINE_OUTPUT_ROOT/$EXPID
 check_directory_exists
 
 export TURBINE_JOBNAME="${EXPID}_job"

workflows/nt3_mlrMBO/swift/cori_workflow3.sh

Lines changed: 2 additions & 2 deletions
@@ -55,8 +55,8 @@ fi
 #export TURBINE_LOG=1 TURBINE_DEBUG=1 ADLB_DEBUG=1
 
 export EXPID=$1
-#export TURBINE_OUTPUT=$EMEWS_PROJECT_ROOT/experiments/$EXPID
-export TURBINE_OUTPUT=/project/projectdirs/m2759/pbalapra/experiments/$EXPID
+export TURBINE_OUTPUT_ROOT=${TURBINE_OUTPUT_ROOT:-$EMEWS_PROJECT_ROOT/experiments}
+export TURBINE_OUTPUT=$TURBINE_OUTPUT_ROOT/$EXPID
 check_directory_exists
 
 export TURBINE_JOBNAME="${EXPID}_job"

workflows/nt3_mlrMBO/swift/theta_workflow.sh

Lines changed: 3 additions & 4 deletions
@@ -23,8 +23,7 @@ export PROCS=${PROCS:-320}
 # Cori has 32 cores per node, 128GB per node
 export PPN=${PPN:-1}
 
-#export QUEUE="default"
-export QUEUE="R.candle_res"
+export QUEUE=${QUEUE:-default}
 export WALLTIME=${WALLTIME:-05:00:00}
 
 
@@ -59,8 +58,8 @@ fi
 export TURBINE_LOG=1 TURBINE_DEBUG=1 ADLB_DEBUG=1
 
 export EXPID=$1
-export TURBINE_OUTPUT=/lus/theta-fs0/projects/Candle_ECP/experiments/$EXPID
-#export TURBINE_OUTPUT=$EMEWS_PROJECT_ROOT/experiments/$EXPID
+export TURBINE_OUTPUT_ROOT=${TURBINE_OUTPUT_ROOT:-$EMEWS_PROJECT_ROOT/experiments}
+export TURBINE_OUTPUT=$TURBINE_OUTPUT_ROOT/$EXPID
 check_directory_exists
 
 export TURBINE_JOBNAME="${EXPID}_job"

workflows/nt3_mlrMBO/swift/workflow.sh

Lines changed: 3 additions & 2 deletions
@@ -57,7 +57,8 @@ fi
 #export TURBINE_LOG=1 TURBINE_DEBUG=1 ADLB_DEBUG=1
 
 export EXPID=$1
-export TURBINE_OUTPUT=$EMEWS_PROJECT_ROOT/experiments/$EXPID
+export TURBINE_OUTPUT_ROOT=${TURBINE_OUTPUT_ROOT:-$EMEWS_PROJECT_ROOT/experiments}
+export TURBINE_OUTPUT=$TURBINE_OUTPUT_ROOT/$EXPID
 check_directory_exists
 
 export TURBINE_JOBNAME="${EXPID}_job"
@@ -80,7 +81,7 @@ export RESIDENT_WORK_RANKS=$(( PROCS - 2 ))
 EQR=$EMEWS_PROJECT_ROOT/ext/EQ-R
 
 CMD_LINE_ARGS="$* -pp=$PROPOSE_POINTS -mi=$MAX_ITERATIONS -mb=$MAX_BUDGET -ds=$DESIGN_SIZE "
-CMD_LINE_ARGS+="-param_set_file=$PARAM_SET_FILE -model_name=$MODEL_NAME -script_file=$SCRIPT_FILE"
+CMD_LINE_ARGS+="-param_set_file=$PARAM_SET_FILE -model_name=$MODEL_NAME -script_file=$SCRIPT_FILE -exp_id=$EXPID"
 
 if [ -n "$MACHINE" ]; then
 MACHINE="-m $MACHINE"
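The hunk above appends the new `-exp_id` flag to the single argument string handed to the swift script. A sketch of that concatenation as a function, with hypothetical values (the shell does this with plain string `+=`):

```python
def build_cmd_line_args(extra, pp, mi, mb, ds,
                        param_set_file, model_name, script_file, exp_id):
    # Mirrors the CMD_LINE_ARGS assembly in workflow.sh, including
    # the newly appended -exp_id flag.
    args = "%s -pp=%s -mi=%s -mb=%s -ds=%s " % (extra, pp, mi, mb, ds)
    args += "-param_set_file=%s -model_name=%s -script_file=%s -exp_id=%s" % (
        param_set_file, model_name, script_file, exp_id)
    return args

# Hypothetical usage:
# build_cmd_line_args("", 5, 3, 100, 10, "params.json", "nt3", "run.sh", "T1")
```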
Lines changed: 4 additions & 4 deletions
@@ -1,9 +1,9 @@
 {
 "parameters":
 {
-"1": [2,4,6],
-"2": [15, 25,50,75],
-"3": [2000, 1000],
-"4": [600, 400]
+"epochs": [2, 4, 8],
+"batch_size": [20, 40],
+"N1": [1000, 2000],
+"NE": [500]
 }
 }
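The renamed keys above make the grid definition self-describing: the four lists define a 3 × 2 × 2 × 1 = 12-point parameter grid. A minimal sketch of loading the settings and enumerating the grid (the inline JSON below just reproduces the new file contents):

```python
import itertools
import json

settings_json = """
{ "parameters": { "epochs": [2, 4, 8], "batch_size": [20, 40],
                  "N1": [1000, 2000], "NE": [500] } }
"""

params = json.loads(settings_json)["parameters"]
# Every combination of the four lists: 3 * 2 * 2 * 1 = 12 grid points.
grid = list(itertools.product(params["epochs"], params["batch_size"],
                              params["N1"], params["NE"]))
print(len(grid))  # 12
```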

workflows/p1b1_grid/python/computeStats.py

Lines changed: 6 additions & 0 deletions
@@ -15,6 +15,12 @@ def computeStats(swiftArrayAsString):
     for a in A:
         vals += [A[a]]
     print('%d values, with min=%f, max=%f, avg=%f\n'%(len(vals),min(vals),max(vals),sum(vals)/float(len(vals))))
+
+    filename = os.environ['TURBINE_OUTPUT'] + "/final_stats.txt"
+    # writing the summary stats to the output file
+    with open(filename, 'w') as the_file:
+        the_file.write('%d values, with min=%f, max=%f, avg=%f\n'%(len(vals),min(vals),max(vals),sum(vals)/float(len(vals))))
+
 
 
 if (len(sys.argv) < 2):
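The added lines write the same summary string that `computeStats` prints into `$TURBINE_OUTPUT/final_stats.txt`. A standalone sketch of just the formatting (the helper name is illustrative):

```python
def stats_line(vals):
    # The summary line computeStats prints and now also writes to
    # $TURBINE_OUTPUT/final_stats.txt: count, min, max, and mean.
    return '%d values, with min=%f, max=%f, avg=%f\n' % (
        len(vals), min(vals), max(vals), sum(vals) / float(len(vals)))
```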

workflows/p1b1_grid/python/determineParameters.py

Lines changed: 11 additions & 12 deletions
@@ -12,11 +12,15 @@ def loadSettings(settingsFilename):
         print("PWD is: '%s'" % os.getcwd())
         sys.exit(1)
     try:
-        params = settings['parameters']
+        epochs = settings['parameters']["epochs"]
+        batch_size = settings['parameters']["batch_size"]
+        N1 = settings['parameters']["N1"]
+        NE = settings['parameters']["NE"]
+
     except KeyError as e:
         print("Settings file (%s) does not contain key: %s" % (settingsFilename, str(e)))
         sys.exit(1)
-    return(params)
+    return (epochs, batch_size, N1, NE)
 
 def expand(Vs, fr, to, soFar):
     soFarNew = []
@@ -40,16 +44,11 @@ def expand(Vs, fr, to, soFar):
 settingsFilename = sys.argv[1]
 paramsFilename = sys.argv[2]
 
-params = loadSettings(settingsFilename)
-values = {}
-for i in range(1, len(params)+1):
-    try:
-        As = params[str(i)]
-    except:
-        print('Did not find parameter %i in settings file'%i)
-        sys.exit(1)
-    values[i] = As
-results = expand(values, 1, len(params), [''])
+epochs, batch_size, N1, NE = loadSettings(settingsFilename)
+
+values = {1: epochs, 2: batch_size, 3: N1, 4: NE}
+print(values)
+results = expand(values, 1, len(values), [''])
 result = ':'.join(results)
 
 with open(paramsFilename, 'w') as the_file:
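The recursive `expand` above turns the four value lists into one string per combination, which are then colon-joined. An equivalent sketch with `itertools.product` — the comma-joining of each combination is an assumption, since `expand`'s separator handling is not shown in this diff:

```python
import itertools

def expand_grid(values):
    # Sketch of what determineParameters produces: one comma-joined
    # string per parameter combination, colon-joined into the single
    # string written to paramsFilename. (Comma separators are an
    # assumption; the original expand()'s body is not in the diff.)
    keys = sorted(values)
    combos = itertools.product(*(values[k] for k in keys))
    return ':'.join(','.join(str(v) for v in c) for c in combos)

# Hypothetical usage:
# expand_grid({1: [2, 4], 2: [20]})
```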
Lines changed: 15 additions & 6 deletions
@@ -1,7 +1,7 @@
 import sys
 import p1b1_runner
-import json
-
+import json, os
+import socket
 
 if (len(sys.argv) < 3):
     print('requires arg1=param and arg2=filename')
@@ -12,7 +12,7 @@
 
 # print (parameterString)
 print ("filename is " + filename)
-
+print (socket.gethostname())
 
 integs = [int(x) for x in parameterString.split(',')]
 print (integs)
@@ -21,16 +21,25 @@
 hyper_parameter_map['framework'] = 'keras'
 hyper_parameter_map['batch_size'] = integs[1]
 hyper_parameter_map['dense'] = [integs[2], integs[3]]
-hyper_parameter_map['save'] = './output'
-
+hyper_parameter_map['run_id'] = parameterString
+# hyper_parameter_map['instance_directory'] = os.environ['TURBINE_OUTPUT']
+hyper_parameter_map['save'] = os.environ['TURBINE_OUTPUT'] + "/output-" + os.environ['PMI_RANK']
+sys.argv = ['p1b1_runner']
 val_loss = p1b1_runner.run(hyper_parameter_map)
 print (val_loss)
+
+sfn = os.environ['TURBINE_OUTPUT'] + "/output-" + os.environ['PMI_RANK'] + "/procname-" + parameterString
+with open(sfn, 'w') as sfile:
+    sfile.write(socket.getfqdn())
+    proc_id = "-" + str(os.getpid())
+    sfile.write(proc_id)
+
 # works around this error:
 # https://github.com/tensorflow/tensorflow/issues/3388
 from keras import backend as K
 K.clear_session()
 
-# writing the val loss to the output file
+# writing the val loss to the output file (result-*)
 with open(filename, 'w') as the_file:
     the_file.write(repr(val_loss))
 