diff --git a/workflows/rnd_or_grid/README.md b/workflows/rnd_or_grid/README.md
new file mode 100644
index 00000000..792551f5
--- /dev/null
+++ b/workflows/rnd_or_grid/README.md
@@ -0,0 +1,13 @@
+# Simple grid or random parameter sweep
+
+A Swift/T workflow that runs a grid or random parameter sweep over any of the benchmarks from the command line. It calls the following command-line programs:
+- determineParameters.{sh,py}: Reads the sweep parameters from data/**settings.json** and returns them as a single string for the Swift program (written to sweep-parameters.txt)
+- evaluateOne.{sh,py}: Runs a single experiment (calls the specified benchmark).
+- computeStats.{sh,py}: Ingests the results of all of the experiments and computes simple statistics.
+
+Usage: ./run EXPERIMENT_ID BENCHMARK_NAME SEARCH_TYPE INPUT_JSON, e.g. ./run ex3_p1b1_grid p1b1 grid p1b1_settings.json
+
+Notes:
+**settings.json**: defines the parameter variations for the sweep
+1. A JSON file must be present in the data folder and named *_settings.json (e.g. p1b1_settings.json); sample files are provided and should be modified as needed.
+2. The run directory will be created in the experiments folder.
+3. New variables can be introduced in determineParameters.py and evaluateOne.py.
+4. Parameter variations must be specified in the data/*.json files.
diff --git a/workflows/rnd_or_grid/data/nt3_settings.json b/workflows/rnd_or_grid/data/nt3_settings.json
new file mode 100644
index 00000000..d505cd22
--- /dev/null
+++ b/workflows/rnd_or_grid/data/nt3_settings.json
@@ -0,0 +1,12 @@
+{
+  "parameters":
+  {
+    "epochs": [1, 2],
+    "batch_size": [10, 20],
+    "classes": [2, 3]
+  },
+  "samples":
+  {
+    "num": [2]
+  }
+}
diff --git a/workflows/rnd_or_grid/data/p1b1_settings.json b/workflows/rnd_or_grid/data/p1b1_settings.json
new file mode 100644
index 00000000..f8f7a7a8
--- /dev/null
+++ b/workflows/rnd_or_grid/data/p1b1_settings.json
@@ -0,0 +1,13 @@
+{
+  "parameters":
+  {
+    "epochs": [1, 2, 8],
+    "batch_size": [20, 40],
+    "N1": [1000, 2000],
+    "NE": [500, 600]
+  },
+  "samples":
+  {
+    "num": [2]
+  }
+}
diff --git a/workflows/rnd_or_grid/data/p1b3_settings.json b/workflows/rnd_or_grid/data/p1b3_settings.json
new file mode 100644
index 00000000..12a32b60
--- /dev/null
+++ b/workflows/rnd_or_grid/data/p1b3_settings.json
@@ -0,0 +1,14 @@
+{
+  "parameters":
+  {
+    "epochs": [1, 2],
+    "batch_size": [50, 60],
+    "test_cell_split": [0.15, 0.25],
+    "drop": [0.1, 0.15]
+  },
+  "samples":
+  {
+    "num": [2]
+  }
+
+}
diff --git a/workflows/rnd_or_grid/data/p2b1_settings.json b/workflows/rnd_or_grid/data/p2b1_settings.json
new file mode 100644
index 00000000..ce5a0af3
--- /dev/null
+++ b/workflows/rnd_or_grid/data/p2b1_settings.json
@@ -0,0 +1,13 @@
+{
+  "parameters":
+  {
+    "epochs": [1, 2],
+    "batch_size": [32, 40],
+    "molecular_epochs": [1, 3],
+    "weight_decay": [0.0005, 0.0006]
+  },
+  "samples":
+  {
+    "num": [2]
+  }
+}
diff --git a/workflows/rnd_or_grid/data/p3b1_settings.json b/workflows/rnd_or_grid/data/p3b1_settings.json
new file mode 100644
index 00000000..c84ca192
--- /dev/null
+++ b/workflows/rnd_or_grid/data/p3b1_settings.json
@@ -0,0 +1,12 @@
+{
+  "parameters":
+  {
+    "epochs": [1, 2],
+    "batch_size": [20, 40]
+  },
+  "samples":
+  {
+    "num": [2]
+  }
+}
+ 
\ No newline at end of file
diff --git a/workflows/rnd_or_grid/python/computeStats.py b/workflows/rnd_or_grid/python/computeStats.py
new file mode 100644
index 00000000..f414c378
--- /dev/null
+++ b/workflows/rnd_or_grid/python/computeStats.py
@@ -0,0 +1,40 @@
+import sys
+from collections import defaultdict
+import json, os
+
+def extractVals(A):
+    B = defaultdict(dict)
+    A1 = A.split()
+    for n, val in zip(A1[0::2], A1[1::2]):
+        B[n] = float(val)
+    return(B)
+
+def computeStats(swiftArrayAsString):
+    A = extractVals(swiftArrayAsString)
+    vals = []
+    for a in A:
+        vals += [A[a]]
+    print('%d values, with min=%f, max=%f, avg=%f\n'%(len(vals),min(vals),max(vals),sum(vals)/float(len(vals))))
+
+    filename = os.environ['TURBINE_OUTPUT']+ "/final_stats.txt"
+    # writing the val loss stats to the output file
+    with open(filename, 'w') as the_file:
+        the_file.write('%d values, with min=%f, max=%f, avg=%f\n'%(len(vals),min(vals),max(vals),sum(vals)/float(len(vals))))
+
+
+
+if (len(sys.argv) < 2):
+    print('requires arg=dataFilename')
+    sys.exit(1)
+
+dataFilename = sys.argv[1]
+
+try:
+    with open(dataFilename, 'r') as the_file:
+        data = the_file.read()
+except IOError as e:
+    print("Could not open: %s" % dataFilename)
+    print("PWD is: '%s'" % os.getcwd())
+    sys.exit(1)
+
+computeStats(data)
+
diff --git a/workflows/rnd_or_grid/python/determineParameters.py b/workflows/rnd_or_grid/python/determineParameters.py
new file mode 100644
index 00000000..06ef2e1b
--- /dev/null
+++ b/workflows/rnd_or_grid/python/determineParameters.py
@@ -0,0 +1,154 @@
+import sys, json, os
+import random
+import itertools
+
+# ===== Definitions =========================================================
+def expand(Vs, fr, to, soFar):
+    soFarNew = []
+    for s in soFar:
+        if (Vs[fr] is None):
+            print ("ERROR: a parameter required by this benchmark is missing from the settings file")
+            sys.exit(1)
+        for v in Vs[fr]:
+            if s == '':
+                soFarNew += [str(v)]
+            else:
+                soFarNew += [s+','+str(v)]
+    if fr==to:
+        return(soFarNew)
+    else:
+        return expand(Vs, fr+1, to, soFarNew)
+
+def generate_random(values, n_samples, benchmarkName):
+    # draw n_samples random parameter sets from the ranges provided in the settings.json file
+    result = ""
+    param_listed = []
+    for s in range(n_samples):
+        if(benchmarkName=="p1b1"):
+            # values = {1:epochs, 2: batch_size, 3: N1, 4: NE}
+            t_epoch = random.randint(values[1][0], values[1][1])
+            t_batch_size = random.randint(values[2][0], values[2][1])
+            t_N1 = random.randint(values[3][0], values[3][1])
+            t_NE = random.randint(values[4][0], values[4][1])
+            result += str(t_epoch) + ',' + str(t_batch_size) + ',' + str(t_N1) + ',' + str(t_NE)
+        elif(benchmarkName=="p1b3"):
+            # values = {1:epochs, 2: batch_size, 3: test_cell_split, 4: drop}
+            t_epoch = random.randint(values[1][0], values[1][1])
+            t_batch_size = random.randint(values[2][0], values[2][1])
+            t_tcs = random.uniform(values[3][0], values[3][1])
+            t_drop = random.uniform(values[4][0], values[4][1])
+            result += str(t_epoch) + ',' + str(t_batch_size) + ',' + str(t_tcs) + ',' + str(t_drop)
+        elif(benchmarkName=="nt3"):
+            # values = {1:epochs, 2: batch_size, 3: classes}
+            t_epoch = random.randint(values[1][0], values[1][1])
+            t_batch_size = random.randint(values[2][0], values[2][1])
+            t_classes = random.randint(values[3][0], values[3][1])
+            result += str(t_epoch) + ',' + str(t_batch_size) + ',' + str(t_classes)
+        elif(benchmarkName=="p2b1"):
+            # values = {1:epochs, 2: batch_size, 3: molecular_epochs, 4: weight_decay}
+            t_epoch = random.randint(values[1][0], values[1][1])
+            t_batch_size = random.randint(values[2][0], values[2][1])
+            t_me = random.randint(values[3][0], values[3][1])
+            t_wd = random.uniform(values[4][0], values[4][1])
+            result += str(t_epoch) + ',' + str(t_batch_size) + ',' + str(t_me) + ',' + str(t_wd)
+        elif(benchmarkName=="p3b1"):
+            # values = {1:epochs, 2: batch_size}//, 3: learning_rate, 4: n_fold}
+            t_epoch = random.randint(values[1][0], values[1][1])
+            t_batch_size = random.randint(values[2][0], values[2][1])
+            result += str(t_epoch) + ',' + str(t_batch_size)
+        else:
+            print('ERROR: invalid benchmark name or settings file')
+            sys.exit(1)
+        # Populate the result string for writing the sweep-parameters file
+        param_listed += [str(result)]
+        result=""
+    return (param_listed)
+
+# ===== Main program ========================================================
+if (len(sys.argv) < 5):
+    print('requires arg1=settingsFilename, arg2=paramsFilename, arg3=benchmarkName and arg4=searchType')
+    sys.exit(1)
+
+settingsFilename = sys.argv[1]
+paramsFilename = sys.argv[2]
+benchmarkName = sys.argv[3]
+searchType = sys.argv[4]
+
+## Read in the variables from the json file
+# Trying to open the settings file
+print("Reading settings: %s" % settingsFilename)
+try:
+    with open(settingsFilename) as fp:
+        settings = json.load(fp)
+except IOError as e:
+    print("Could not open: %s" % settingsFilename)
+    print("PWD is: '%s'" % os.getcwd())
+    sys.exit(1)
+
+# Register new variables for any benchmark here
+# Common variables
+epochs = settings.get('parameters').get('epochs')
+batch_size = settings.get('parameters').get('batch_size')
+# P1B1
+N1 = settings.get('parameters').get('N1')
+NE = settings.get('parameters').get('NE')
+# NT3
+classes = settings.get('parameters').get('classes')
+# P2B1
+molecular_epochs = settings.get('parameters').get('molecular_epochs')
+weight_decay = settings.get('parameters').get('weight_decay')
+# P3B1
+# learning_rate = settings.get('parameters').get('learning_rate')
+# n_fold = settings.get('parameters').get('n_fold')
+# P1B3
+test_cell_split = settings.get('parameters').get('test_cell_split')
+drop = settings.get('parameters').get('drop')
+
+# For the random scheme, determine the number of samples
+samples = settings.get('samples', {}).get('num', None)
+
+## Done reading from file
+
+# Make values for computing grid sweep parameters
+values = {}
+if(benchmarkName=="p1b1"):
+    values = {1: epochs, 2: batch_size, 3: N1, 4: NE}
+elif(benchmarkName=="p1b3"):
+    values = {1: epochs, 2: batch_size, 3: test_cell_split, 4: drop}
+elif(benchmarkName=="nt3"):
+    values = {1: epochs, 2: batch_size, 3: classes}
+elif(benchmarkName=="p2b1"):
+    values = {1: epochs, 2: batch_size, 3: molecular_epochs, 4: weight_decay}
+elif(benchmarkName=="p3b1"):
+    values = {1: epochs, 2: batch_size}
+else:
+    print('ERROR: invalid benchmark name or settings file')
+    sys.exit(1)
+
+# results holds the list of comma-separated parameter sets
+results = []
+# Determine the parameter space based on the search type
+if(searchType == "grid"):
+    results = expand(values, 1, len(values), [''])
+elif(searchType =="random"):
+    if(samples is None):
+        print ("ERROR: Provide the number of samples in the json file")
+        sys.exit(1)
+    # result, results = generate_random(values, samples, benchmarkName)
+    results = generate_random(values, samples[0], benchmarkName)
+else:
+    print ("ERROR: Invalid search type, specify either grid or random")
+    sys.exit(1)
+
+# Remove duplicate parameter sets (possible with the random scheme), preserving order
+deduped = []
+for r in results:
+    if r in deduped:
+        print ("Warning: skipping identical parameter set", r)
+    else:
+        deduped.append(r)
+results = deduped
+
+# These are the final ':'-separated parameters for evaluation
+result = ':'.join(results)
+
+with open(paramsFilename, 'w') as the_file:
+    the_file.write(result)
+
diff --git a/workflows/rnd_or_grid/python/evaluateOne.py b/workflows/rnd_or_grid/python/evaluateOne.py
new file mode 100644
index 00000000..73d5db74
--- /dev/null
+++ b/workflows/rnd_or_grid/python/evaluateOne.py
@@ -0,0 +1,90 @@
+import sys
+import json, os
+import socket
+
+
+if (len(sys.argv) < 4):
+    print('requires arg1=param, arg2=filename and arg3=benchmarkName')
+    sys.exit(1)
+
+parameterString = sys.argv[1]
+filename = sys.argv[2]
+benchmarkName = sys.argv[3]
+
+integs = [float(x) for x in parameterString.split(',')]
+
+if (benchmarkName == "p1b1"):
+    import p1b1_runner
+    hyper_parameter_map = {'epochs' : int(integs[0])}
+    hyper_parameter_map['framework'] = 'keras'
+    hyper_parameter_map['batch_size'] = int(integs[1])
+    hyper_parameter_map['dense'] = [int(integs[2]), int(integs[3])]
+    hyper_parameter_map['run_id'] = parameterString
+    # hyper_parameter_map['instance_directory'] = os.environ['TURBINE_OUTPUT']
+    hyper_parameter_map['save'] = os.environ['TURBINE_OUTPUT']+ "/output-"+str(os.getpid())
+    sys.argv = ['p1b1_runner']
+    val_loss = p1b1_runner.run(hyper_parameter_map)
+elif (benchmarkName == "p1b3"):
+    import p1b3_runner
+    hyper_parameter_map = {'epochs' : int(integs[0])}
+    hyper_parameter_map['framework'] = 'keras'
+    hyper_parameter_map['batch_size'] = int(integs[1])
+    # test_cell_split and drop are fractions, so they must not be truncated to int
+    hyper_parameter_map['test_cell_split'] = integs[2]
+    hyper_parameter_map['drop'] = integs[3]
+    hyper_parameter_map['run_id'] = parameterString
+    # hyper_parameter_map['instance_directory'] = os.environ['TURBINE_OUTPUT']
+    hyper_parameter_map['save'] = os.environ['TURBINE_OUTPUT']+ "/output-"+str(os.getpid())
+    sys.argv = ['p1b3_runner']
+    val_loss = p1b3_runner.run(hyper_parameter_map)
+elif (benchmarkName == "p2b1"):
+    import p2b1_runner
+    hyper_parameter_map = {'epochs' : int(integs[0])}
+    hyper_parameter_map['framework'] = 'keras'
+    hyper_parameter_map['batch_size'] = int(integs[1])
+    hyper_parameter_map['molecular_epochs'] = int(integs[2])
+    hyper_parameter_map['weight_decay'] = integs[3]
+    hyper_parameter_map['run_id'] = parameterString
+    hyper_parameter_map['save'] = os.environ['TURBINE_OUTPUT']+ "/output-"+str(os.getpid())
+    sys.argv = ['p2b1_runner']
+    val_loss = p2b1_runner.run(hyper_parameter_map)
+elif (benchmarkName == "nt3"):
+    import nt3_tc1_runner
+    hyper_parameter_map = {'epochs' : int(integs[0])}
+    hyper_parameter_map['framework'] = 'keras'
+    hyper_parameter_map['batch_size'] = int(integs[1])
+    hyper_parameter_map['classes'] = int(integs[2])
+    hyper_parameter_map['model_name'] = 'nt3'
+    hyper_parameter_map['save'] = os.environ['TURBINE_OUTPUT']+ "/output-"+str(os.getpid())
+    sys.argv = ['nt3_runner']
+    val_loss = nt3_tc1_runner.run(hyper_parameter_map)
+elif (benchmarkName == "p3b1"):
+    import p3b1_runner
+    hyper_parameter_map = {'epochs' : int(integs[0])}
+    hyper_parameter_map['framework'] = 'keras'
+    hyper_parameter_map['batch_size'] = int(integs[1])
+    hyper_parameter_map['run_id'] = parameterString
+    # hyper_parameter_map['instance_directory'] = os.environ['TURBINE_OUTPUT']
+    hyper_parameter_map['save'] = os.environ['TURBINE_OUTPUT']+ "/output-"+str(os.getpid())
+    sys.argv = ['p3b1_runner']
+    val_loss = p3b1_runner.run(hyper_parameter_map)
+else:
+    print('ERROR: invalid benchmark name: ' + benchmarkName)
+    sys.exit(1)
+
+# print (parameterString)
+# print ("filename is " + filename)
+# print (str(os.getpid()))
+print (val_loss)
+
+# sfn = os.environ['TURBINE_OUTPUT']+ "/output-"+str(os.getpid()) + "/procname-" + parameterString
+# with open(sfn, 'w') as sfile:
+#     sfile.write(socket.getfqdn())
+#     proc_id = "-"+ str(os.getpid())
+#     sfile.write(proc_id)
+
+# works around this error:
+# https://github.com/tensorflow/tensorflow/issues/3388
+from keras import backend as K
+K.clear_session()
+
+# writing the val loss to the output file (result-*)
+with open(filename, 'w') as the_file:
+    the_file.write(repr(val_loss))
+
diff --git a/workflows/rnd_or_grid/python/nt3_tc1_runner.py b/workflows/rnd_or_grid/python/nt3_tc1_runner.py
new file mode 100644
index 00000000..678f8a00
--- /dev/null
+++ b/workflows/rnd_or_grid/python/nt3_tc1_runner.py
@@ -0,0 +1,73 @@
+# tensorflow.__init__ calls _os.path.basename(_sys.argv[0])
+# so we need to create a synthetic argv.
+import sys
+if not hasattr(sys, 'argv'):
+    sys.argv = ['nt3_tc1']
+
+import json
+import os
+import numpy as np
+import importlib
+import runner_utils
+
+def import_pkg(framework, model_name):
+    if framework == 'keras':
+        module_name = "{}_baseline_keras2".format(model_name)
+        pkg = importlib.import_module(module_name)
+    # elif framework == 'mxnet':
+    #     import nt3_baseline_mxnet
+    #     pkg = nt3_baseline_mxnet
+    # elif framework == 'neon':
+    #     import nt3_baseline_neon
+    #     pkg = nt3_baseline_neon
+    else:
+        raise ValueError("Invalid framework: {}".format(framework))
+    return pkg
+
+def run(hyper_parameter_map):
+    framework = hyper_parameter_map['framework']
+    model_name = hyper_parameter_map['model_name']
+    pkg = import_pkg(framework, model_name)
+
+    runner_utils.format_params(hyper_parameter_map)
+
+    # params is python dictionary
+    params = pkg.initialize_parameters()
+    for k,v in hyper_parameter_map.items():
+        #if not k in params:
+        #    raise Exception("Parameter '{}' not found in set of valid arguments".format(k))
+        params[k] = v
+
+    runner_utils.write_params(params, hyper_parameter_map)
+    history = pkg.run(params)
+
+    if framework == 'keras':
+        # works around this error:
+        # https://github.com/tensorflow/tensorflow/issues/3388
+        try:
+            from keras import backend as K
+            K.clear_session()
+        except AttributeError: # theano does not have this function
+            pass
+
+    # use the last validation_loss as the value to minimize
+    val_loss = history.history['val_loss']
+    return val_loss[-1]
+
+if __name__ == '__main__':
+    param_string = sys.argv[1]
+    instance_directory = sys.argv[2]
+    model_name = sys.argv[3]
+    framework = sys.argv[4]
+    exp_id = sys.argv[5]
+    run_id = sys.argv[6]
+    benchmark_timeout = int(sys.argv[7])
+    hyper_parameter_map = runner_utils.init(param_string, instance_directory, framework, 'save')
+    hyper_parameter_map['model_name'] = model_name
+    hyper_parameter_map['experiment_id'] = exp_id
+    hyper_parameter_map['run_id'] = run_id
+    hyper_parameter_map['timeout'] = benchmark_timeout
+    # clear sys.argv so that argparse doesn't object
+    sys.argv = ['nt3_tc1_runner']
+    result = run(hyper_parameter_map)
+    runner_utils.write_output(result, instance_directory)
diff --git a/workflows/rnd_or_grid/python/p1b1_runner.py b/workflows/rnd_or_grid/python/p1b1_runner.py
new file mode 100644
index 00000000..20ce7e7d
--- /dev/null
+++ b/workflows/rnd_or_grid/python/p1b1_runner.py
@@ -0,0 +1,46 @@
+# tensorflow.__init__ calls _os.path.basename(_sys.argv[0])
+# so we need to create a synthetic argv.
+import sys
+if not hasattr(sys, 'argv'):
+    sys.argv = ['p1b1']
+
+import json
+import os
+import p1b1
+
+def run(hyper_parameter_map):
+    framework = hyper_parameter_map['framework']
+    if framework == 'keras':
+        import p1b1_baseline_keras2
+        pkg = p1b1_baseline_keras2
+    elif framework == 'mxnet':
+        import p1b1_baseline_mxnet
+        pkg = p1b1_baseline_mxnet
+    elif framework == 'neon':
+        import p1b1_baseline_neon
+        pkg = p1b1_baseline_neon
+    else:
+        raise ValueError("Invalid framework: {}".format(framework))
+
+    # params is python dictionary
+    params = pkg.initialize_parameters()
+    for k,v in hyper_parameter_map.items():
+        #if not k in params:
+        #    raise Exception("Parameter '{}' not found in set of valid arguments".format(k))
+        params[k] = v
+
+    print(params)
+    history = pkg.run(params)
+
+    if framework == 'keras':
+        # works around this error:
+        # https://github.com/tensorflow/tensorflow/issues/3388
+        try:
+            from keras import backend as K
+            K.clear_session()
+        except AttributeError: # theano does not have this function
+            pass
+
+    # use the last validation_loss as the value to minimize
+    val_loss = history.history['val_loss']
+    return val_loss[-1]
diff --git a/workflows/rnd_or_grid/python/p1b3_runner.py b/workflows/rnd_or_grid/python/p1b3_runner.py
new file mode 100644
index 00000000..f330ed18
--- /dev/null
+++ b/workflows/rnd_or_grid/python/p1b3_runner.py
@@ -0,0 +1,60 @@
+# tensorflow.__init__ calls _os.path.basename(_sys.argv[0])
+# so we need to create a synthetic argv.
+import sys
+if not hasattr(sys, 'argv'):
+    sys.argv = ['p1b3']
+
+import json
+import os
+import p1b3
+import runner_utils
+
+def run(hyper_parameter_map):
+    framework = hyper_parameter_map['framework']
+    if framework == 'keras':
+        import p1b3_baseline_keras2
+        pkg = p1b3_baseline_keras2
+    elif framework == 'mxnet':
+        import p1b3_baseline_mxnet
+        pkg = p1b3_baseline_mxnet
+    elif framework == 'neon':
+        import p1b3_baseline_neon
+        pkg = p1b3_baseline_neon
+    else:
+        raise ValueError("Invalid framework: {}".format(framework))
+
+    # params is python dictionary
+    params = pkg.initialize_parameters()
+    runner_utils.format_params(hyper_parameter_map)
+
+    for k,v in hyper_parameter_map.items():
+        #if not k in params:
+        #    raise Exception("Parameter '{}' not found in set of valid arguments".format(k))
+        params[k] = v
+
+    runner_utils.write_params(params, hyper_parameter_map)
+    history = pkg.run(params)
+
+    if framework == 'keras':
+        # works around this error:
+        # https://github.com/tensorflow/tensorflow/issues/3388
+        try:
+            from keras import backend as K
+            K.clear_session()
+        except AttributeError: # theano does not have this function
+            pass
+
+    # use the last validation_loss as the value to minimize
+    val_loss = history.history['val_loss']
+    return val_loss[-1]
+
+if __name__ == '__main__':
+    param_file = sys.argv[1]
+    instance_directory = sys.argv[2]
+    framework = sys.argv[3]
+    hyper_parameter_map = runner_utils.init(param_file, instance_directory, framework,
+                                            'save')
+    # clear sys.argv so that argparse doesn't object
+    sys.argv = ['p1b3_runner']
+    result = run(hyper_parameter_map)
+    runner_utils.write_output(result, instance_directory)
diff --git a/workflows/rnd_or_grid/python/p2b1_runner.py b/workflows/rnd_or_grid/python/p2b1_runner.py
new file mode 100644
index 00000000..b3583c7d
--- /dev/null
+++ b/workflows/rnd_or_grid/python/p2b1_runner.py
@@ -0,0 +1,58 @@
+# tensorflow.__init__ calls _os.path.basename(_sys.argv[0])
+# so we need to create a synthetic argv.
+import sys
+if not hasattr(sys, 'argv'):
+    sys.argv = ['p2b1']
+
+import json
+import os
+import p2b1
+import runner_utils
+
+def run(hyper_parameter_map):
+    framework = hyper_parameter_map['framework']
+    if framework == 'keras':
+        import p2b1_baseline_keras2
+        pkg = p2b1_baseline_keras2
+    else:
+        raise ValueError("Invalid framework: {}".format(framework))
+
+    # params is python dictionary
+    params = pkg.initialize_parameters()
+    runner_utils.format_params(hyper_parameter_map)
+
+    for k,v in hyper_parameter_map.items():
+        #if not k in params:
+        #    raise Exception("Parameter '{}' not found in set of valid arguments".format(k))
+        params[k] = v
+
+    runner_utils.write_params(params, hyper_parameter_map)
+    loss_history = pkg.run(params)
+
+    if framework == 'keras':
+        # works around this error:
+        # https://github.com/tensorflow/tensorflow/issues/3388
+        try:
+            from keras import backend as K
+            K.clear_session()
+        except AttributeError: # theano does not have this function
+            pass
+
+    return loss_history[-1]
+
+if __name__ == '__main__':
+    param_string = sys.argv[1]
+    instance_directory = sys.argv[2]
+    framework = sys.argv[3]
+    exp_id = sys.argv[4]
+    run_id = sys.argv[5]
+    benchmark_timeout = int(sys.argv[6])
+    hyper_parameter_map = runner_utils.init(param_string, instance_directory,
+                                            framework, 'save_path')
+    hyper_parameter_map['experiment_id'] = exp_id
+    hyper_parameter_map['run_id'] = run_id
+    hyper_parameter_map['timeout'] = benchmark_timeout
+    # clear sys.argv so that argparse doesn't object
+    sys.argv = ['p2b1_runner']
+    result = run(hyper_parameter_map)
+    runner_utils.write_output(result, instance_directory)
diff --git a/workflows/rnd_or_grid/python/p3b1_runner.py b/workflows/rnd_or_grid/python/p3b1_runner.py
new file mode 100644
index 00000000..385d3e26
--- /dev/null
+++ b/workflows/rnd_or_grid/python/p3b1_runner.py
@@ -0,0 +1,97 @@
+# tensorflow.__init__ calls _os.path.basename(_sys.argv[0])
+# so we need to create a synthetic argv.
+import sys
+if not hasattr(sys, 'argv'):
+    sys.argv = ['p3b1']
+
+import json
+import os
+import p3b1
+import runner_utils
+import socket
+
+node_pid = "%s,%i" % (socket.gethostname(), os.getpid())
+print("node,pid: " + node_pid)
+
+logger = None
+
+def get_logger():
+    """ Set up logging """
+    global logger
+    if logger is not None:
+        return logger
+    import logging, sys
+    logger = logging.getLogger(__name__)
+    logger.setLevel(logging.DEBUG)
+    h = logging.StreamHandler(stream=sys.stdout)
+    fmtr = logging.Formatter('%(asctime)s %(name)s %(levelname)-9s %(message)s',
+                             datefmt='%Y/%m/%d %H:%M:%S')
+    h.setFormatter(fmtr)
+    logger.addHandler(h)
+    return logger
+
+def run(hyper_parameter_map):
+
+    logger = get_logger()
+    framework = hyper_parameter_map['framework']
+    logger.debug("IMPORT START")
+    if framework == 'keras':
+        import p3b1_baseline_keras2
+        pkg = p3b1_baseline_keras2
+    else:
+        raise ValueError("Unsupported framework: {}".format(framework))
+    logger.debug("IMPORT STOP")
+
+    # params is python dictionary
+    params = pkg.initialize_parameters()
+    runner_utils.format_params(hyper_parameter_map)
+
+    for k,v in hyper_parameter_map.items():
+        #if not k in params:
+        #    raise Exception("Parameter '{}' not found in set of valid arguments".format(k))
+        params[k] = v
+
+    logger.debug("WRITE_PARAMS START")
+    runner_utils.write_params(params, hyper_parameter_map)
+    logger.debug("WRITE_PARAMS STOP")
+    logger.debug("DO_N_FOLD START")
+    avg_loss = pkg.do_n_fold(params)
+    logger.debug("DO_N_FOLD STOP")
+
+    if framework == 'keras':
+        # works around this error:
+        # https://github.com/tensorflow/tensorflow/issues/3388
+        try:
+            from keras import backend as K
+            K.clear_session()
+        except AttributeError: # theano does not have this function
+            pass
+
+    return avg_loss
+
+if __name__ == '__main__':
+    logger = get_logger()
+    logger.debug("RUN START")
+
+    param_string = sys.argv[1]
+    instance_directory = sys.argv[2]
+    framework = sys.argv[3]
+    exp_id = sys.argv[4]
+    run_id = sys.argv[5]
+    benchmark_timeout = int(sys.argv[6])
+
+    logger.debug("RUN INIT START")
+
+    hyper_parameter_map = runner_utils.init(param_string, instance_directory,
+                                            framework, 'save_path')
+    logger.debug("RUN INIT STOP")
+    hyper_parameter_map['experiment_id'] = exp_id
+    hyper_parameter_map['run_id'] = run_id
+    hyper_parameter_map['timeout'] = benchmark_timeout
+    # clear sys.argv so that argparse doesn't object
+    sys.argv = ['p3b1_runner']
+    result = run(hyper_parameter_map)
+    logger.debug("WRITE OUTPUT START")
+    runner_utils.write_output(result, instance_directory)
+    logger.debug("WRITE OUTPUT STOP")
+    logger.debug("RUN STOP")
diff --git a/workflows/rnd_or_grid/python/test/run_test_runners.sh b/workflows/rnd_or_grid/python/test/run_test_runners.sh
new file mode 100755
index 00000000..019b4824
--- /dev/null
+++ b/workflows/rnd_or_grid/python/test/run_test_runners.sh
@@ -0,0 +1,7 @@
+#! /usr/bin/env bash
+
+RUNNER_DIR=../../../../../Benchmarks/Pilot1/P1B1:../../../../../Benchmarks/Pilot2/P2B1:../../../../../Benchmarks/Pilot3/P3B1:../../../../../Benchmarks/Pilot1/NT3:../../../../../Benchmarks/Pilot1/P1B3
+export PYTHONPATH="$PWD/..:$RUNNER_DIR:../../../common/python"
+echo $PYTHONPATH
+
+python test_runners.py
\ No newline at end of file
diff --git a/workflows/rnd_or_grid/python/test/run_theta_runners.sh b/workflows/rnd_or_grid/python/test/run_theta_runners.sh
new file mode 100644
index 00000000..20f00a74
--- /dev/null
+++ b/workflows/rnd_or_grid/python/test/run_theta_runners.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+set -eu
+
+# Theta / Tensorflow env vars
+export KMP_BLOCKTIME=30
+export KMP_SETTINGS=1
+export KMP_AFFINITY=granularity=fine,verbose,compact,1,0
+export OMP_NUM_THREADS=128
+
+export PYTHONHOME="/lus/theta-fs0/projects/Candle_ECP/ncollier/py2_tf_gcc6.3_eigen3_native"
+#export PYTHONHOME="/home/rjain/anaconda2"
+PYTHON="$PYTHONHOME/bin/python"
+export LD_LIBRARY_PATH="$PYTHONHOME/lib"
+export PATH="$PYTHONHOME/bin:$PATH"
+
+RUNNER_DIR=../../../../../Benchmarks/Pilot1/P1B1:../../../../../Benchmarks/Pilot2/P2B1:../../../../../Benchmarks/Pilot3/P3B1:../../../../../Benchmarks/Pilot1/NT3:../../../../../Benchmarks/Pilot1/P1B3
+COMMON_DIR=../../../common/python
+PYTHONPATH="$PYTHONHOME/lib/python2.7:"
+PYTHONPATH+="../:$RUNNER_DIR:$COMMON_DIR:"
+PYTHONPATH+="$PYTHONHOME/lib/python2.7/site-packages"
+export PYTHONPATH
+export PROJECT=Candle_ECP
+
+echo $PYTHONPATH
+$PYTHON test_runners.py
diff --git a/workflows/rnd_or_grid/python/test/test_runners.py b/workflows/rnd_or_grid/python/test/test_runners.py
new file mode 100644
index 00000000..0f2c2b14
--- /dev/null
+++ b/workflows/rnd_or_grid/python/test/test_runners.py
@@ -0,0 +1,62 @@
+from datetime import datetime
+
+import p1b1_runner
+import p2b1_runner
+import p1b3_runner
+import p3b1_runner
+import nt3_tc1_runner
+
+def main():
+
+    hyper_parameter_map = {'epochs' : 1}
+    hyper_parameter_map['framework'] = 'keras'
+    # hyper_parameter_map['save_path'] = save_path
+#    hyper_parameter_map = {'epochs' : 1}
+#    hyper_parameter_map['batch_size'] = 40
+#    hyper_parameter_map['dense'] = [1219, 536]
+#    hyper_parameter_map['framework'] = 'keras'
+
+#1  # p1b1
+    # hyper_parameter_map['save'] = './p1bl1_testing_failure'
+    print("STARTING#####P1B1##########")
+    ts_p1b1 = datetime.now()
+    p1b1_validation_loss = p1b1_runner.run(hyper_parameter_map)
+    te_p1b1 = datetime.now()
+    print("Validation loss=",p1b1_validation_loss)
+    print("DONE##########P1B1#####, TIME=", te_p1b1 - ts_p1b1)
+
+
+#2  # p1b3
+    print("STARTING#####P1B3##########")
+    ts_p1b3 = datetime.now()
+    p1b3_validation_loss = p1b3_runner.run(hyper_parameter_map)
+    te_p1b3 = datetime.now()
+    print("Validation loss=",p1b3_validation_loss)
+    print("DONE##########P1B3#####, TIME=", te_p1b3 - ts_p1b3)
+
+#3  # p2b1
+    print("STARTING#####P2B1##########")
+    ts_p2b1 = datetime.now()
+    p2b1_validation_loss = p2b1_runner.run(hyper_parameter_map)
+    te_p2b1 = datetime.now()
+    print("Validation loss=",p2b1_validation_loss)
+    print("DONE##########P2B1#####, TIME=", te_p2b1 - ts_p2b1)
+
+#4  # p3b1
+    print("STARTING#####P3B1##########")
+    ts_p3b1 = datetime.now()
+    p3b1_validation_loss = p3b1_runner.run(hyper_parameter_map)
+    te_p3b1 = datetime.now()
+    print("Validation loss=",p3b1_validation_loss)
+    print("DONE##########P3B1#####, TIME=", te_p3b1 - ts_p3b1)
+
+#5  # NT3
+    print("STARTING#####NT3##########")
+    hyper_parameter_map['model_name'] = 'nt3'
+    ts_nt3 = datetime.now()
+    nt3tc1_validation_loss = nt3_tc1_runner.run(hyper_parameter_map)
+    te_nt3 = datetime.now()
+    print("Validation loss=",nt3tc1_validation_loss)
+    print("DONE##########NT3#####, TIME=", te_nt3 - ts_nt3)
+
+if __name__ == '__main__':
+    main()
+
diff --git a/workflows/rnd_or_grid/swift/computeStats.sh b/workflows/rnd_or_grid/swift/computeStats.sh
new file mode 100755
index 00000000..9d2b0e25
--- /dev/null
+++ b/workflows/rnd_or_grid/swift/computeStats.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+python $APP_HOME/../python/computeStats.py $1
diff --git a/workflows/rnd_or_grid/swift/determineParameters.sh b/workflows/rnd_or_grid/swift/determineParameters.sh
new file mode 100755
index 00000000..098d26ec
--- /dev/null
+++ b/workflows/rnd_or_grid/swift/determineParameters.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+python $APP_HOME/../python/determineParameters.py $1 $2 $3 $4
diff --git a/workflows/rnd_or_grid/swift/evaluateOne.sh b/workflows/rnd_or_grid/swift/evaluateOne.sh
new file mode 100755
index 00000000..08a72341
--- /dev/null
+++ b/workflows/rnd_or_grid/swift/evaluateOne.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+filename=$TURBINE_OUTPUT/result-$1.txt
+python -u $APP_HOME/../python/evaluateOne.py $1 $filename $3
diff --git a/workflows/rnd_or_grid/swift/rnd_or_grid.swift b/workflows/rnd_or_grid/swift/rnd_or_grid.swift
new file mode 100644
index 00000000..ea050373
--- /dev/null
+++ b/workflows/rnd_or_grid/swift/rnd_or_grid.swift
@@ -0,0 +1,64 @@
+import string;
+import files;
+import io;
+import sys;
+
+// ===== Interface definitions for the programs that we call ======
+// Random values are created from bounds specified in the data/settings.json file
+app (file f)
+determineParameters(string settingsFilename, string benchmark, string searchType)
+{
+  (getenv("APP_HOME")+"/determineParameters.sh") settingsFilename f benchmark searchType;
+}
+
+// This is where the selected benchmark runner is called
+app (file f)
+evaluateOne(string params, string benchmark)
+{
+  (getenv("APP_HOME")+"/evaluateOne.sh") params f benchmark;
+}
+
+// call this to read all the result files and compute stats
+app ()
+computeStats(string resultsFile)
+{
+  (getenv("APP_HOME")+"/computeStats.sh") resultsFile;
+}
+
+// call this to create any required directories
+app (void o) make_dir(string dirname) {
+  "mkdir" "-p" dirname;
+}
+
+
+// ===== The program proper ==============================================
+string turbine_output = getenv("TURBINE_OUTPUT");
+string app_home = getenv("APP_HOME");
+float results[string];
+
+// make the experiments dir
+make_dir(turbine_output);
+
+// Get parameters
+benchmark = argv("benchmark_name");
+searchType = argv("search_type");
+inputFile = argv("input_file");
+settingsFilename = app_home+"/../data/"+inputFile;
+string sweepParamFile = turbine_output+"/sweep-parameters.txt";
+file parametersFile = determineParameters(settingsFilename, benchmark, searchType);
+parametersString = read(parametersFile);
+parameters = split(parametersString, ":");
+
+// Run experiments in parallel, passing each a different parameter set
+foreach param in parameters
+{
+  string rName = turbine_output+"/result-"+param+".txt";
+  // map the result file to the name written by evaluateOne.sh
+  file resultFile <rName> = evaluateOne(param, benchmark);
+  results[param] = string2float(read(resultFile));
+}
+
+// Compute stats of this array of results
+// Write directly to a file with write
+file tmp = write(repr(results));
+computeStats(filename(tmp));
+
diff --git a/workflows/rnd_or_grid/swift/run b/workflows/rnd_or_grid/swift/run
new file mode 100755
index 00000000..2b9d8d63
--- /dev/null
+++ b/workflows/rnd_or_grid/swift/run
@@ -0,0 +1,58 @@
+#!/bin/bash
+#
+# Usage: ./run EXPERIMENT_ID BENCHMARK_NAME SEARCH_TYPE INPUT_JSON
+#
+
+if [ "$#" -ne 4 ]; then
+  script_name=$(basename $0)
+  echo "Usage: ${script_name} EXPERIMENT_ID (e.g. run1_p1b1) BENCHMARK_NAME (e.g. p1b1) SEARCH_TYPE (grid or random) INPUT_JSON"
+  echo "Example: ./run p1b1_experiment1 p1b1 random p1b1_settings.json"
+  echo "-This creates a p1b1_experiment1 directory in ../experiments"
+  echo " and uses the random scheme for the variables specified in the ../data/p1b1_settings.json file"
+  exit 1
+fi
+
+#### set this variable to add new benchmarks directory
+RUNNERS_DIR=../../../../Benchmarks/Pilot1/P1B1:../../../../Benchmarks/Pilot2/P2B1:../../../../Benchmarks/Pilot3/P3B1:../../../../Benchmarks/Pilot1/NT3:../../../../Benchmarks/Pilot1/P1B3
+###
+
+THIS=$( cd $( dirname $0 ); /bin/pwd )
+export APP_HOME=$THIS
+
+PROJECT_ROOT=$APP_HOME/..
+
+export PYTHONPATH=$PYTHONPATH:$PROJECT_ROOT/python:$RUNNERS_DIR:$PROJECT_ROOT/../common/python
+
+export EXPID=$1
+B_NAME=$2
+S_NAME=$3
+JSON_F=$4
+
+export TURBINE_OUTPUT=$APP_HOME/../experiments/$EXPID
+
+# prefix=$PWD/../data/
+# suffix="_settings.json"
+# export SETTINGS_FILE=$prefix$BENCHMARK_NAME$suffix
+
+# TODO edit QUEUE, WALLTIME, PPN, and TURBINE_JOBNAME as required.
+# Note that QUEUE, WALLTIME, PPN, and TURBINE_JOBNAME will be ignored
+# if the MACHINE flag (see below) is not set
+export QUEUE=batch
+export WALLTIME=00:10:00
+export PPN=1
+export TURBINE_JOBNAME="${EXPID}_job"
+
+### set the desired number of processors
+PROCS=3
+###
+
+# Resident task workers and ranks
+export TURBINE_RESIDENT_WORK_WORKERS=1
+export RESIDENT_WORK_RANKS=$(( PROCS - 2 ))
+
+echo $PYTHONPATH
+
+# remove the -l option to stop printing processor ranks
+# the settings.json file has all the parameter combinations to be tested
+echo swift-t -l -n $PROCS $APP_HOME/rnd_or_grid.swift $* --benchmark_name=$B_NAME --search_type=$S_NAME --input_file=$JSON_F
+swift-t -l -n $PROCS $APP_HOME/rnd_or_grid.swift $* --benchmark_name=$B_NAME --search_type=$S_NAME --input_file=$JSON_F
diff --git a/workflows/rnd_or_grid/swift/theta_run.sh b/workflows/rnd_or_grid/swift/theta_run.sh
new file mode 100644
index 00000000..9002ffb0
--- /dev/null
+++ b/workflows/rnd_or_grid/swift/theta_run.sh
@@ -0,0 +1,81 @@
+#! /usr/bin/env bash
+set -eu
+
+# Autodetect this workflow directory
+export APP_HOME=$( cd $( dirname $0 ) ; /bin/pwd )
+
+#### set this variable to add new benchmarks directory
+RUNNERS_DIR=$APP_HOME/../../../../Benchmarks/Pilot1/P1B1:$APP_HOME/../../../../Benchmarks/Pilot2/P2B1:$APP_HOME/../../../../Benchmarks/Pilot3/P3B1:$APP_HOME/../../../../Benchmarks/Pilot1/NT3:$APP_HOME/../../../../Benchmarks/Pilot1/P1B3
+###
+# The number of MPI processes.
+# Note that 2 processes are reserved for Swift/EMEWS;
+# the remaining processes are workers, i.e., concurrent Keras runs
+export PROCS=${PROCS:-36}
+# MPI processes per node
+export PPN=${PPN:-1}
+export QUEUE=${QUEUE:-default}
+export WALLTIME=${WALLTIME:-01:20:00}
+
+
+if [ "$#" -ne 4 ]; then
+  script_name=$(basename $0)
+  echo "Usage: ${script_name} EXPERIMENT_ID (e.g. run1_p1b1) BENCHMARK_NAME (e.g. p1b1) SEARCH_TYPE (grid or random) INPUT_JSON"
+  echo "Example: ./theta_run.sh p1b1_experiment1 p1b1 random p1b1_settings.json"
+  echo "-This creates a p1b1_experiment1 directory in ../experiments"
+  echo " and uses the random scheme for the variables specified in the ../data/p1b1_settings.json file"
+  exit 1
+fi
+
+# uncomment to turn on swift/t logging. Can also set TURBINE_LOG,
+# TURBINE_DEBUG, and ADLB_DEBUG to 0 to turn off logging
+export TURBINE_LOG=1 TURBINE_DEBUG=1 ADLB_DEBUG=1
+
+export EXPID=$1
+export B_NAME=$2
+export S_NAME=$3
+export JSON_F=$4
+
+export TURBINE_OUTPUT=$APP_HOME/../experiments/$EXPID
+export PROJECT=Candle_ECP
+export TURBINE_JOBNAME="${EXPID}_job"
+
+TCL=/home/wozniak/Public/sfw/theta/tcl-8.6.1
+export R=/home/wozniak/Public/sfw/theta/R-3.4.0/lib64/R
+export PY=/home/rjain/anaconda2
+export LD_LIBRARY_PATH=$PY/lib:$R/lib:$LD_LIBRARY_PATH
+COMMON_DIR=$APP_HOME/../../common/python
+PYTHONPATH=$APP_HOME/../python:$RUNNERS_DIR:$COMMON_DIR
+PYTHONHOME=/home/rjain/anaconda2
+
+export PATH=/home/rjain/install/stc/bin:$TCL/bin:$PATH
+#$PYTHONHOME/bin:$TCL/bin:$PATH
+
+# Resident task workers and ranks
+export TURBINE_RESIDENT_WORK_WORKERS=1
+export RESIDENT_WORK_RANKS=$(( PROCS - 2 ))
+
+
+TURBINE_DIR=/home/rjain/install/turbine/lib
+
+# set machine to your scheduler type (e.g. pbs, slurm, cobalt etc.),
+# or empty for an immediate non-queued unscheduled run
+MACHINE="theta"
+
+if [ -n "$MACHINE" ]; then
+  MACHINE="-m $MACHINE"
+fi
+
+set -x
+WORKFLOW_SWIFT=rnd_or_grid.swift
+swift-t -n $PROCS $MACHINE -r $TURBINE_DIR \
+    -e LD_LIBRARY_PATH=$LD_LIBRARY_PATH \
+    -e TURBINE_RESIDENT_WORK_WORKERS=$TURBINE_RESIDENT_WORK_WORKERS \
+    -e RESIDENT_WORK_RANKS=$RESIDENT_WORK_RANKS \
+    -e APP_HOME=$APP_HOME \
+    -e PYTHONPATH=$PYTHONPATH \
+    -e PYTHONHOME=$PYTHONHOME \
+    -e TURBINE_DEBUG=$TURBINE_DEBUG \
+    -e ADLB_DEBUG=$ADLB_DEBUG \
+    -e TURBINE_OUTPUT=$TURBINE_OUTPUT \
+    $APP_HOME/$WORKFLOW_SWIFT --benchmark_name=$B_NAME --search_type=$S_NAME --input_file=$JSON_F &
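For reference, the grid scheme in determineParameters.py expands the per-parameter value lists from the settings file into their full cross product: each combination becomes a comma-separated string, and the combinations are joined with ':' before being written to sweep-parameters.txt, which the Swift script then splits and fans out over. A minimal standalone sketch of the equivalent expansion (same set of combinations as the recursive expand() above), using the value lists from data/p1b1_settings.json:

```python
import itertools

# Parameter lists as in data/p1b1_settings.json
epochs = [1, 2, 8]
batch_size = [20, 40]
N1 = [1000, 2000]
NE = [500, 600]

# One comma-separated string per combination, e.g. "1,20,1000,500"
combos = [','.join(str(v) for v in combo)
          for combo in itertools.product(epochs, batch_size, N1, NE)]

# Colon-separated sweep string, as written to sweep-parameters.txt
print(':'.join(combos))
```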