From 942e5a3a9c34b154fa93e7c08c0ece6f9b61bae0 Mon Sep 17 00:00:00 2001 From: Sam Ade Jacobs Date: Wed, 18 Nov 2020 14:19:17 -0800 Subject: [PATCH 1/3] Add infrastructure to generate LBANN prototext from CANDLE workflow, supports for multi trainers --- workflows/lbann/.gitignore | 36 ++++ workflows/lbann/data/mnist_params.json | 67 +++++++ workflows/lbann/models/mnist/data/__init__.py | 0 .../lbann/models/mnist/data/mnist/.gitignore | 5 + .../lbann/models/mnist/data/mnist/__init__.py | 59 ++++++ .../mnist/data/mnist/data_reader.prototext | 30 +++ .../lbann/models/mnist/mnist_baseline.py | 172 ++++++++++++++++++ .../models/mnist/mnist_default_model.txt | 12 ++ .../lbann/python/generate_lbann_proto.py | 87 +++++++++ .../lbann/scripts/generate_lbann_proto.sh | 36 ++++ 10 files changed, 504 insertions(+) create mode 100644 workflows/lbann/.gitignore create mode 100644 workflows/lbann/data/mnist_params.json create mode 100644 workflows/lbann/models/mnist/data/__init__.py create mode 100644 workflows/lbann/models/mnist/data/mnist/.gitignore create mode 100644 workflows/lbann/models/mnist/data/mnist/__init__.py create mode 100644 workflows/lbann/models/mnist/data/mnist/data_reader.prototext create mode 100644 workflows/lbann/models/mnist/mnist_baseline.py create mode 100644 workflows/lbann/models/mnist/mnist_default_model.txt create mode 100644 workflows/lbann/python/generate_lbann_proto.py create mode 100755 workflows/lbann/scripts/generate_lbann_proto.sh diff --git a/workflows/lbann/.gitignore b/workflows/lbann/.gitignore new file mode 100644 index 00000000..6c739ae4 --- /dev/null +++ b/workflows/lbann/.gitignore @@ -0,0 +1,36 @@ +# Compiled Object files +*.slo +*.lo +*.o +*.x + +# Compiled Dynamic libraries +*.so + +# Compiled Static libraries +*.lai +*.la +*.a + +*~ +.DS_Store + +*.d +/.cproject +/.project +/.settings + +Debug/ +Release/ + +/output/ +scratch/ + +# ipython notebooks +*.ipynb +.ipynb_checkpoints + +# dataspace runtime conf file +conf +experiments +test_data/combo_model.h5 diff --git a/workflows/lbann/data/mnist_params.json b/workflows/lbann/data/mnist_params.json new file mode 100644 index 00000000..de511f1b --- /dev/null +++ b/workflows/lbann/data/mnist_params.json @@ -0,0 +1,67 @@ +[ + { + "name": "conv", + "type": "constant", + "value": "6 5 1 16 5 1", + "debug_value": "0 0 0" + }, + + { + "name": "classes", + "type": "constant", + "value": 10, + "comment": "debug: 1000, default: remove this entry" + }, + + { + "name": "dense", + "type": "constant", + "value": "120 84" + }, + + { + "name": "activation", + "type": "categorical", + "element_type": "string", + "values": ["relu", "elu", "relu", "tanh"] + }, + + { + "name": "optimizer", + "type": "categorical", + "element_type": "string", + "values": ["adam", "sgd", "adagrad"] + }, + { + "name": "pool_mode", + "type": "categorical", + "element_type": "string", + "values": ["max", "average"] + }, + + { + "name": "lr", + "type": "float", + "lower": 0.0001, + "upper": 0.01, + "sigma": 0.045 + }, + + + { + "name": "batch_size", + "type": "ordered", + "element_type": "int", + "values": [32, 64, 128, 256, 512, 1024], + "sigma": 1 + }, + + { + "name": "epochs", + "type": "int", + "lower": 40, + "upper": 60, + "sigma": 1 + } + +] diff --git a/workflows/lbann/models/mnist/data/__init__.py b/workflows/lbann/models/mnist/data/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/workflows/lbann/models/mnist/data/mnist/.gitignore b/workflows/lbann/models/mnist/data/mnist/.gitignore new file mode 100644 index 00000000..10c191aa --- /dev/null +++ b/workflows/lbann/models/mnist/data/mnist/.gitignore @@ -0,0 +1,5 @@ +*.gz +train-images-idx3-ubyte +train-labels-idx1-ubyte +t10k-images-idx3-ubyte +t10k-labels-idx1-ubyte diff --git a/workflows/lbann/models/mnist/data/mnist/__init__.py b/workflows/lbann/models/mnist/data/mnist/__init__.py new file mode 100644 index 00000000..271ccf0f --- /dev/null +++ b/workflows/lbann/models/mnist/data/mnist/__init__.py @@ -0,0 +1,59 @@ +import gzip +import os +import os.path +import urllib.request + +import google.protobuf.text_format +import lbann + +# Paths +data_dir = os.path.dirname(os.path.realpath(__file__)) + +def download_data(): + """Download MNIST data files, if needed. + + Data files are downloaded from http://yann.lecun.com/exdb/mnist/ + and uncompressed. Does nothing if the files already exist. + + """ + + # MNIST data files and associated URLs + urls = { + 'train-images-idx3-ubyte': 'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz', + 'train-labels-idx1-ubyte': 'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz', + 't10k-images-idx3-ubyte': 'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz', + 't10k-labels-idx1-ubyte': 'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz', + } + + # Download and uncompress MNIST data files, if needed + for data_file, url in urls.items(): + data_file = os.path.join(data_dir, data_file) + compressed_file = data_file + '.gz' + if not os.path.isfile(data_file): + urllib.request.urlretrieve(url, filename=compressed_file) + with gzip.open(compressed_file, 'rb') as in_file: + with open(data_file, 'wb') as out_file: + out_file.write(in_file.read()) + +def make_data_reader(): + """Make Protobuf message for MNIST data reader. + + MNIST data is downloaded if needed. + + """ + + # Download MNIST data files + download_data() + + # Load Protobuf message from file + protobuf_file = os.path.join(data_dir, 'data_reader.prototext') + message = lbann.lbann_pb2.LbannPB() + with open(protobuf_file, 'r') as f: + google.protobuf.text_format.Merge(f.read(), message) + message = message.data_reader + + # Set paths + for reader in message.reader: + reader.data_filedir = data_dir + + return message diff --git a/workflows/lbann/models/mnist/data/mnist/data_reader.prototext b/workflows/lbann/models/mnist/data/mnist/data_reader.prototext new file mode 100644 index 00000000..61c3b32c --- /dev/null +++ b/workflows/lbann/models/mnist/data/mnist/data_reader.prototext @@ -0,0 +1,30 @@ +data_reader { + reader { + name: "mnist" + role: "train" + shuffle: true + data_filedir: "lbann/applications/vision/data/mnist" + data_filename: "train-images-idx3-ubyte" + label_filename: "train-labels-idx1-ubyte" + validation_percent: 0.1 + percent_of_data_to_use: 1.0 + transforms { + scale { + scale: 0.003921568627 # 1/255 + } + } + } + reader { + name: "mnist" + role: "test" + data_filedir: "lbann/applications/vision/data/mnist" + data_filename: "t10k-images-idx3-ubyte" + label_filename: "t10k-labels-idx1-ubyte" + percent_of_data_to_use: 1.0 + transforms { + scale { + scale: 0.003921568627 # 1/255 + } + } + } +} diff --git a/workflows/lbann/models/mnist/mnist_baseline.py b/workflows/lbann/models/mnist/mnist_baseline.py new file mode 100644 index 00000000..31e5a070 --- /dev/null +++ b/workflows/lbann/models/mnist/mnist_baseline.py @@ -0,0 +1,172 @@ +import pandas as pd +import numpy as np +import os +import sys +import gzip +import argparse +##LBANN stuff +import lbann +import data.mnist +import lbann.contrib.args +import lbann.contrib.launcher + +try: + import configparser +except ImportError: + import ConfigParser as configparser + + + +file_path = os.path.dirname(os.path.realpath(__file__)) + +def common_parser(parser): + + parser.add_argument("--config_file", dest='config_file', type=str, + default=os.path.join(file_path, 'mnist_default_model.txt'), + help="specify model configuration file") + + return parser + +def get_mnist_parser(): + + parser = argparse.ArgumentParser(prog='mnist_baseline', formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description='MNIST LBANN ') + + return common_parser(parser) + +def read_config_file(file): + #print("Reading default config (param) file : ", file) + config=configparser.ConfigParser() + config.read(file) + section=config.sections() + fileParams={} + + fileParams['model_name']=eval(config.get(section[0],'model_name')) + fileParams['conv']=eval(config.get(section[0],'conv')) + fileParams['dense']=eval(config.get(section[0],'dense')) + fileParams['activation']=eval(config.get(section[0],'activation')) + fileParams['pool_mode']=eval(config.get(section[0],'pool_mode')) + fileParams['optimizer']=eval(config.get(section[0],'optimizer')) + fileParams['epochs']=eval(config.get(section[0],'epochs')) + fileParams['batch_size']=eval(config.get(section[0],'batch_size')) + fileParams['classes']=eval(config.get(section[0],'classes')) + fileParams['save']=eval(config.get(section[0], 'save')) + fileParams['lr']=eval(config.get(section[0], 'lr')) + + return fileParams + +def initialize_parameters(): + # Get command-line parameters + parser = get_mnist_parser() + args = parser.parse_args() + # Get parameters from configuration file + gParameters = read_config_file(args.config_file) + return gParameters + +def get_activation(name, x): + if name == 'relu': + return lbann.Relu(x) + elif name == 'tanh' : + return lbann.Tanh(x) + elif name == 'elu' : + return lbann.Elu(x) + elif name == 'selu' : + return lbann.Selu(x) + elif name == 'leaky_relu' : + return lbann.LeakyRelu(x) + elif name == 'softplus' : + return lbann.Softplus(x) + + +def run(gParameters): + + #convs: out_c, conv_dim, conv_stride + conv_outc= [] + conv_dim = [] + conv_stride = [] + conv_params = list(range(0, len(gParameters['conv']), 3)) + for l, i in enumerate(conv_params): + conv_outc.append(gParameters['conv'][i]) + conv_dim.append(gParameters['conv'][i+1]) + conv_stride.append(gParameters['conv'][i+2]) + + # Input data + input_ = lbann.Input(target_mode='classification') + images = lbann.Identity(input_) + labels = lbann.Identity(input_) + # LeNet + x = lbann.Convolution(images, + num_dims = 2, + num_output_channels = conv_outc[0], + num_groups = 1, + conv_dims_i = conv_dim[0], + conv_strides_i = conv_stride[0], + conv_dilations_i = 1, + has_bias = True) + x = get_activation(gParameters['activation'],x) + x = lbann.Pooling(x, + num_dims = 2, + pool_dims_i = 2, + pool_strides_i = 2, + pool_mode = str(gParameters['pool_mode'])) + x = lbann.Convolution(x, + num_dims = 2, + num_output_channels = conv_outc[1], + num_groups = 1, + conv_dims_i = conv_dim[1], + conv_strides_i = conv_stride[1], + conv_dilations_i = 1, + has_bias = True) + x = get_activation(gParameters['activation'],x) + x = lbann.Pooling(x, + num_dims = 2, + pool_dims_i = 2, + pool_strides_i = 2, + pool_mode = str(gParameters['pool_mode'])) + x = lbann.FullyConnected(x, num_neurons = gParameters['dense'][0], has_bias = True) + x = get_activation(gParameters['activation'],x) + x = lbann.FullyConnected(x, num_neurons = gParameters['dense'][1], has_bias = True) + x = get_activation(gParameters['activation'],x) + x = lbann.FullyConnected(x, num_neurons = gParameters['classes'], has_bias = True) + probs = lbann.Softmax(x) + + # Loss function and accuracy + loss = lbann.CrossEntropy(probs, labels) + acc = lbann.CategoricalAccuracy(probs, labels) + lr = gParameters['lr'] + opt = lbann.SGD(learn_rate=lr, momentum=0.9) + if gParameters['optimizer'] == 'adam': + opt = lbann.Adam(learn_rate=lr, beta1=0.9, beta2=0.99, eps=1e-8) + elif gParameters['optimizer'] == 'adagrad': + opt = lbann.AdaGrad(learn_rate=lr, eps=1e-8) + + model = lbann.Model(gParameters['epochs'], + layers=lbann.traverse_layer_graph(input_), + objective_function=loss, + metrics=[lbann.Metric(acc, name='accuracy', unit='%')], + callbacks=[lbann.CallbackPrintModelDescription(), + lbann.CallbackPrint(), + lbann.CallbackTimer()]) + + # Setup data reader + data_reader = data.mnist.make_data_reader() + + # Setup trainer + job_name = "t"+ str(gParameters['run_id']-1) + trainer = lbann.Trainer(name=job_name, mini_batch_size=gParameters['batch_size']) + status = lbann.contrib.launcher.run( + trainer, + model, + data_reader, + opt, + job_name=job_name, + setup_only = True, + ) + +def main(): + + gParameters = initialize_parameters() + run(gParameters) + +if __name__ == '__main__': + main() diff --git a/workflows/lbann/models/mnist/mnist_default_model.txt b/workflows/lbann/models/mnist/mnist_default_model.txt new file mode 100644 index 00000000..f7ba29b0 --- /dev/null +++ b/workflows/lbann/models/mnist/mnist_default_model.txt @@ -0,0 +1,12 @@ +[Global_Params] +model_name = 'mnist' +conv=[6, 5, 1, 16, 5, 1] +dense=[120,84] +activation='relu' +pool_mode='max' +optimizer='sgd' +epochs=100 +batch_size=128 +lr=0.001 +classes=10 +save='.' diff --git a/workflows/lbann/python/generate_lbann_proto.py b/workflows/lbann/python/generate_lbann_proto.py new file mode 100644 index 00000000..9ae752d1 --- /dev/null +++ b/workflows/lbann/python/generate_lbann_proto.py @@ -0,0 +1,87 @@ +import sys +import importlib +from mpi4py import MPI +import os, random, math +import runner_utils + +import ga_utils + +def import_model(framework, model_name): + module_name = "{}_baseline".format(model_name) + return importlib.import_module(module_name) + +def run(rank,hyper_parameter_map): + + + framework = hyper_parameter_map['framework'] + model_name = hyper_parameter_map['model_name'] + pkg = import_model(framework, model_name) + runner_utils.format_params(hyper_parameter_map) + + # params is python dictionary + params = pkg.initialize_parameters() + #print("Rank ", rank, " default params ", hyper_parameter_map) + #print("Rank ", rank, " params from master ", hyper_parameter_map) + for k,v in hyper_parameter_map.items(): + #if not k in params: + # raise Exception("Parameter '{}' not found in set of valid arguments".format(k)) + params[k] = v + + #write per trainer params to file + runner_utils.write_params(params, hyper_parameter_map) + sys.argv = [pkg] + pkg.run(params) + + + +def init_params(params_file, comm): + print("Rank ", comm.Get_rank(), " param files ", params_file) + param_factories = ga_utils.create_parameters(params_file) + params = [{}] + for i in range(comm.Get_size() - 1): + hyper_parameter_map = {} + for p in param_factories: + hyper_parameter_map[p.name] = p.randomDraw() + params.append(hyper_parameter_map) + + return params + +def generate_proto(rank, hyper_parameter_map, args): + + exp_dir = args[2] + instance_dir = "{}/trainer_{}/".format(exp_dir, rank) + if not os.path.exists(instance_dir): + os.makedirs(instance_dir) + + model_name = args[3] + + hyper_parameter_map['framework'] = 'lbann' + hyper_parameter_map['save'] = '{}/output'.format(instance_dir) + hyper_parameter_map['instance_directory'] = instance_dir + hyper_parameter_map['model_name'] = model_name + hyper_parameter_map['experiment_id'] = args[4] + hyper_parameter_map['run_id'] = rank + + # clear sys.argv so that argparse doesn't "cry" + sys.argv = ['lbann_runner'] + run(rank, hyper_parameter_map) + +def init_dirs(outdir): + if not os.path.exists(outdir): + os.makedirs(outdir) + +def main(args): + comm = MPI.COMM_WORLD + rank = comm.Get_rank() + if rank == 0: + params = init_params(args[1], comm) + outdir = args[2] + init_dirs(outdir) + comm.scatter(params, root=0) + else: + params = comm.scatter(None, root=0) + generate_proto(rank, params, args) + + +if __name__ == '__main__': + main(sys.argv) diff --git a/workflows/lbann/scripts/generate_lbann_proto.sh b/workflows/lbann/scripts/generate_lbann_proto.sh new file mode 100755 index 00000000..595b7521 --- /dev/null +++ b/workflows/lbann/scripts/generate_lbann_proto.sh @@ -0,0 +1,36 @@ +#! /usr/bin/env bash +set -eu + + +#usage ./generate_lbann_proto.sh 8 lbann_lassen_exp2 mnist_params.json +NUM_TRAINERS=$1 +EXP_ID=$2 +PARAMS_FILE=$3 + +THIS=$( cd $( dirname $0 ) ; /bin/pwd ) +ROOT="$THIS/.." +EXP_DIR="$ROOT/experiments/$EXP_ID" + +SUPERVISOR=$( cd "$PWD/../../.." ; /bin/pwd ) +LBANN_ROOT= + + +PYTHONPATH+=$SUPERVISOR/workflows/common/python +PYTHONPATH+=":$ROOT/models/mnist" +#@todo add host_name +#PYTHONPATH+=":$LBANN_ROOT/build/gnu.Release.lassen.llnl.gov.atom/install/lib/python3.7/site-packages" +PYTHONPATH+=":$LBANN_ROOT/build/gnu.Release.pascal.llnl.gov.atom/install/lib/python3.7/site-packages" + +export PYTHONPATH=$PYTHONPATH +echo $PYTHONPATH +mkdir -p $EXP_DIR +LBANN_PY="$ROOT/python/generate_lbann_proto.py" + +cp $ROOT/data/$PARAMS_FILE $EXP_DIR/ +cd $EXP_DIR +NUM_TRAINERS=$((NUM_TRAINERS + 1)) +#CMD="mpirun -n $NUM_TRAINERS python3 -u $LBANN_PY $PARAMS_FILE $EXP_DIR mnist $EXP_ID" +CMD="srun --export=ALL -n $NUM_TRAINERS python3 -u $LBANN_PY $PARAMS_FILE $EXP_DIR mnist $EXP_ID" +echo $CMD +$CMD +cd $THIS From 6cf8894ebc62b9f0f536975c8266a2f521538838 Mon Sep 17 00:00:00 2001 From: Sam Ade Jacobs Date: Tue, 8 Dec 2020 16:13:37 -0800 Subject: [PATCH 2/3] minor update --- workflows/lbann/data/mnist_params.json | 2 +- workflows/lbann/models/mnist/mnist_baseline.py | 16 +++++++++++++--- workflows/lbann/python/generate_lbann_proto.py | 6 +++--- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/workflows/lbann/data/mnist_params.json b/workflows/lbann/data/mnist_params.json index de511f1b..0ed3ef12 100644 --- a/workflows/lbann/data/mnist_params.json +++ b/workflows/lbann/data/mnist_params.json @@ -52,7 +52,7 @@ "name": "batch_size", "type": "ordered", "element_type": "int", - "values": [32, 64, 128, 256, 512, 1024], + "values": [128, 128, 128, 128, 128, 128], "sigma": 1 }, diff --git a/workflows/lbann/models/mnist/mnist_baseline.py b/workflows/lbann/models/mnist/mnist_baseline.py index 31e5a070..c542f227 100644 --- a/workflows/lbann/models/mnist/mnist_baseline.py +++ b/workflows/lbann/models/mnist/mnist_baseline.py @@ -78,7 +78,7 @@ def get_activation(name, x): return lbann.Softplus(x) -def run(gParameters): +def run(gParameters,exp_dir=None): #convs: out_c, conv_dim, conv_stride conv_outc= [] @@ -146,21 +146,31 @@ def run(gParameters): metrics=[lbann.Metric(acc, name='accuracy', unit='%')], callbacks=[lbann.CallbackPrintModelDescription(), lbann.CallbackPrint(), - lbann.CallbackTimer()]) + lbann.CallbackTimer(), + lbann.CallbackLTFB(batch_interval=100,metric='accuracy')]) # Setup data reader data_reader = data.mnist.make_data_reader() # Setup trainer job_name = "t"+ str(gParameters['run_id']-1) - trainer = lbann.Trainer(name=job_name, mini_batch_size=gParameters['batch_size']) + trainer = lbann.Trainer(name=job_name, mini_batch_size=gParameters['batch_size'], + procs_per_trainer=0) status = lbann.contrib.launcher.run( trainer, model, data_reader, opt, + #work_dir=gParameters['save'], + work_dir=exp_dir, + nodes = 4, + #proto_file_name=job_name+"exp.prototext", + proto_file_name="experiment.prototext.trainer"+str(gParameters['run_id']-1), job_name=job_name, setup_only = True, + #batch_job = True, + lbann_args=['--generate_multi_proto --procs_per_trainer=4'] + #lbann_args=['--generate_multi_proto'] ) def main(): diff --git a/workflows/lbann/python/generate_lbann_proto.py b/workflows/lbann/python/generate_lbann_proto.py index 9ae752d1..1a7765b7 100644 --- a/workflows/lbann/python/generate_lbann_proto.py +++ b/workflows/lbann/python/generate_lbann_proto.py @@ -10,7 +10,7 @@ def import_model(framework, model_name): module_name = "{}_baseline".format(model_name) return importlib.import_module(module_name) -def run(rank,hyper_parameter_map): +def run(rank,hyper_parameter_map,exp_dir): framework = hyper_parameter_map['framework'] @@ -30,7 +30,7 @@ def run(rank,hyper_parameter_map): #write per trainer params to file runner_utils.write_params(params, hyper_parameter_map) sys.argv = [pkg] - pkg.run(params) + pkg.run(params,exp_dir) @@ -64,7 +64,7 @@ def generate_proto(rank, hyper_parameter_map, args): # clear sys.argv so that argparse doesn't "cry" sys.argv = ['lbann_runner'] - run(rank, hyper_parameter_map) + run(rank, hyper_parameter_map,exp_dir) def init_dirs(outdir): if not os.path.exists(outdir): From 5e3a96d4df6edb07f72d8c6a7ed6f13a884fe61b Mon Sep 17 00:00:00 2001 From: Sam Ade Jacobs Date: Tue, 5 Jan 2021 11:05:25 -0800 Subject: [PATCH 3/3] More clean and broad support --- .../lbann/models/mnist/mnist_baseline.py | 31 ++++++++++--------- .../lbann/python/generate_lbann_proto.py | 6 ++-- .../lbann/scripts/generate_lbann_proto.sh | 29 +++++++++++------ 3 files changed, 41 insertions(+), 25 deletions(-) diff --git a/workflows/lbann/models/mnist/mnist_baseline.py b/workflows/lbann/models/mnist/mnist_baseline.py index c542f227..e03eedcf 100644 --- a/workflows/lbann/models/mnist/mnist_baseline.py +++ b/workflows/lbann/models/mnist/mnist_baseline.py @@ -24,15 +24,16 @@ def common_parser(parser): parser.add_argument("--config_file", dest='config_file', type=str, default=os.path.join(file_path, 'mnist_default_model.txt'), help="specify model configuration file") + parser.add_argument("--nodes", type=int, default=8) return parser -def get_mnist_parser(): +def get_model_parser(): parser = argparse.ArgumentParser(prog='mnist_baseline', formatter_class=argparse.ArgumentDefaultsHelpFormatter, description='MNIST LBANN ') - return common_parser(parser) + return common_parser(parser).parse_args() def read_config_file(file): #print("Reading default config (param) file : ", file) @@ -46,7 +47,7 @@ def read_config_file(file): fileParams['dense']=eval(config.get(section[0],'dense')) fileParams['activation']=eval(config.get(section[0],'activation')) fileParams['pool_mode']=eval(config.get(section[0],'pool_mode')) - fileParams['optimizer']=eval(config.get(section[0],'optimizer')) + #fileParams['optimizer']=eval(config.get(section[0],'optimizer')) fileParams['epochs']=eval(config.get(section[0],'epochs')) fileParams['batch_size']=eval(config.get(section[0],'batch_size')) fileParams['classes']=eval(config.get(section[0],'classes')) @@ -55,10 +56,10 @@ def read_config_file(file): return fileParams -def initialize_parameters(): +def initialize_parameters(args): # Get command-line parameters - parser = get_mnist_parser() - args = parser.parse_args() + #args = get_model_parser() + #args = parser.parse_args() # Get parameters from configuration file gParameters = read_config_file(args.config_file) return gParameters @@ -78,7 +79,7 @@ def get_activation(name, x): return lbann.Softplus(x) -def run(gParameters,exp_dir=None): +def run(gParameters,run_args,exp_dir=None): #convs: out_c, conv_dim, conv_stride conv_outc= [] @@ -135,27 +136,28 @@ def run(gParameters,exp_dir=None): acc = lbann.CategoricalAccuracy(probs, labels) lr = gParameters['lr'] opt = lbann.SGD(learn_rate=lr, momentum=0.9) + ##Uncomment to support optimizer exchange + ''' if gParameters['optimizer'] == 'adam': opt = lbann.Adam(learn_rate=lr, beta1=0.9, beta2=0.99, eps=1e-8) elif gParameters['optimizer'] == 'adagrad': opt = lbann.AdaGrad(learn_rate=lr, eps=1e-8) - + ''' model = lbann.Model(gParameters['epochs'], layers=lbann.traverse_layer_graph(input_), objective_function=loss, metrics=[lbann.Metric(acc, name='accuracy', unit='%')], callbacks=[lbann.CallbackPrintModelDescription(), lbann.CallbackPrint(), - lbann.CallbackTimer(), - lbann.CallbackLTFB(batch_interval=100,metric='accuracy')]) + lbann.CallbackTimer()]) + #lbann.CallbackLTFB(batch_interval=100,metric='accuracy')]) # Setup data reader data_reader = data.mnist.make_data_reader() # Setup trainer job_name = "t"+ str(gParameters['run_id']-1) - trainer = lbann.Trainer(name=job_name, mini_batch_size=gParameters['batch_size'], - procs_per_trainer=0) + trainer = lbann.Trainer(name=job_name, mini_batch_size=gParameters['batch_size']) status = lbann.contrib.launcher.run( trainer, model, @@ -163,7 +165,7 @@ def run(gParameters,exp_dir=None): opt, #work_dir=gParameters['save'], work_dir=exp_dir, - nodes = 4, + nodes=run_args.nodes, #proto_file_name=job_name+"exp.prototext", proto_file_name="experiment.prototext.trainer"+str(gParameters['run_id']-1), job_name=job_name, @@ -175,7 +177,8 @@ def run(gParameters,exp_dir=None): def main(): - gParameters = initialize_parameters() + args = get_model_parser() + gParameters = initialize_parameters(args) run(gParameters) if __name__ == '__main__': diff --git a/workflows/lbann/python/generate_lbann_proto.py b/workflows/lbann/python/generate_lbann_proto.py index 1a7765b7..81bf02f1 100644 --- a/workflows/lbann/python/generate_lbann_proto.py +++ b/workflows/lbann/python/generate_lbann_proto.py @@ -19,7 +19,9 @@ def run(rank,hyper_parameter_map,exp_dir): runner_utils.format_params(hyper_parameter_map) # params is python dictionary - params = pkg.initialize_parameters() + run_args = pkg.get_model_parser() + sys.argv = [pkg] + params = pkg.initialize_parameters(run_args) #print("Rank ", rank, " default params ", hyper_parameter_map) #print("Rank ", rank, " params from master ", hyper_parameter_map) for k,v in hyper_parameter_map.items(): @@ -30,7 +32,7 @@ def run(rank,hyper_parameter_map,exp_dir): #write per trainer params to file runner_utils.write_params(params, hyper_parameter_map) sys.argv = [pkg] - pkg.run(params,exp_dir) + pkg.run(params,run_args, exp_dir) diff --git a/workflows/lbann/scripts/generate_lbann_proto.sh b/workflows/lbann/scripts/generate_lbann_proto.sh index 595b7521..2683e9bf 100755 --- a/workflows/lbann/scripts/generate_lbann_proto.sh +++ b/workflows/lbann/scripts/generate_lbann_proto.sh @@ -2,24 +2,35 @@ set -eu -#usage ./generate_lbann_proto.sh 8 lbann_lassen_exp2 mnist_params.json +#usage ./generate_lbann_proto.sh 8 lbann_lassen_exp2 mnist_params.json mnist NUM_TRAINERS=$1 EXP_ID=$2 PARAMS_FILE=$3 +MODEL_NAME=$4 THIS=$( cd $( dirname $0 ) ; /bin/pwd ) ROOT="$THIS/.." EXP_DIR="$ROOT/experiments/$EXP_ID" SUPERVISOR=$( cd "$PWD/../../.." ; /bin/pwd ) -LBANN_ROOT= - - -PYTHONPATH+=$SUPERVISOR/workflows/common/python +LBANN_ROOT=/usr/workspace/wsa/jacobs32/git.samadejacobs.lbann + +#export SPACK_ROOT=/usr/workspace/wsa/jacobs32/git.samadejacobs.lbann/build/spack; .$SPACK_ROOT/share/spack/setup-env.sh +export SPACK_ROOT=$LBANN_ROOT/build/spack; . $SPACK_ROOT/share/spack/setup-env.sh +spack env activate -p lbann-dev-power9le +export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${cuDNN_ROOT}/lib" +#module use $LBANN_ROOT/build/spack.gnu.Release.lassen.llnl.gov/install/etc/modulefiles +module use $LBANN_ROOT/build/gnu.Release.lassen.llnl.gov/install/etc/modulefiles +module load lbann-0.102.0 + +#PYTHONPATH+=$SUPERVISOR/workflows/common/python +PYTHONPATH+=":$SUPERVISOR/workflows/common/python" PYTHONPATH+=":$ROOT/models/mnist" +PYTHONPATH+=":$ROOT/models/atom" #@todo add host_name +#PYTHONPATH+=":$LBANN_ROOT/build/spack.gnu.Release.lassen.llnl.gov/install/lib/python3.7/site-packages" #PYTHONPATH+=":$LBANN_ROOT/build/gnu.Release.lassen.llnl.gov.atom/install/lib/python3.7/site-packages" -PYTHONPATH+=":$LBANN_ROOT/build/gnu.Release.pascal.llnl.gov.atom/install/lib/python3.7/site-packages" +#PYTHONPATH+=":$LBANN_ROOT/build/gnu.Release.pascal.llnl.gov.atom/install/lib/python3.7/site-packages" export PYTHONPATH=$PYTHONPATH echo $PYTHONPATH @@ -29,8 +40,8 @@ LBANN_PY="$ROOT/python/generate_lbann_proto.py" cp $ROOT/data/$PARAMS_FILE $EXP_DIR/ cd $EXP_DIR NUM_TRAINERS=$((NUM_TRAINERS + 1)) -#CMD="mpirun -n $NUM_TRAINERS python3 -u $LBANN_PY $PARAMS_FILE $EXP_DIR mnist $EXP_ID" -CMD="srun --export=ALL -n $NUM_TRAINERS python3 -u $LBANN_PY $PARAMS_FILE $EXP_DIR mnist $EXP_ID" +CMD="mpirun -n $NUM_TRAINERS python3 -u $LBANN_PY $PARAMS_FILE $EXP_DIR $MODEL_NAME $EXP_ID" +#CMD="srun --export=ALL -n $NUM_TRAINERS python3 -u $LBANN_PY $PARAMS_FILE $EXP_DIR mnist $EXP_ID" echo $CMD $CMD -cd $THIS +#cd $THIS